From 02ec6cafd78c2052283516afc74c309745d20271 Mon Sep 17 00:00:00 2001 From: Hoang Le Date: Tue, 19 Mar 2019 18:49:48 +0700 Subject: tipc: support broadcast/replicast configurable for bc-link Currently, a multicast stream uses either broadcast or replicast as transmission method, based on the ratio between number of actual destinations nodes and cluster size. However, when an L2 interface (e.g., VXLAN) provides pseudo broadcast support, this becomes very inefficient, as it blindly replicates multicast packets to all cluster/subnet nodes, irrespective of whether they host actual target sockets or not. The TIPC multicast algorithm is able to distinguish real destination nodes from other nodes, and hence provides a smarter and more efficient method for transferring multicast messages than pseudo broadcast can do. Because of this, we now make it possible for users to force the broadcast link to permanently switch to using replicast, irrespective of which capabilities the bearer provides, or pretend to provide. Conversely, we also make it possible to force the broadcast link to always use true broadcast. While maybe less useful in deployed systems, this may at least be useful for testing the broadcast algorithm in small clusters. We retain the current AUTOSELECT ability, i.e., to let the broadcast link automatically select which algorithm to use, and to switch back and forth between broadcast and replicast as the ratio between destination node number and cluster size changes. This remains the default method. Furthermore, we make it possible to configure the threshold ratio for such switches. The default ratio is now set to 10%, down from 25% in the earlier implementation. Acked-by: Jon Maloy Signed-off-by: Hoang Le Signed-off-by: David S. Miller --- include/uapi/linux/tipc_netlink.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h index 0ebe02ef1a86..efb958fd167d 100644 --- a/include/uapi/linux/tipc_netlink.h +++ b/include/uapi/linux/tipc_netlink.h @@ -281,6 +281,8 @@ enum { TIPC_NLA_PROP_TOL, /* u32 */ TIPC_NLA_PROP_WIN, /* u32 */ TIPC_NLA_PROP_MTU, /* u32 */ + TIPC_NLA_PROP_BROADCAST, /* u32 */ + TIPC_NLA_PROP_BROADCAST_RATIO, /* u32 */ __TIPC_NLA_PROP_MAX, TIPC_NLA_PROP_MAX = __TIPC_NLA_PROP_MAX - 1 -- cgit v1.2.3 From 9403cf2302588022d06f1878b072d3f6933021f0 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Tue, 19 Mar 2019 16:05:44 +0100 Subject: tcp: free request sock directly upon TFO or syncookies error Since the request socket is created locally, it'd make more sense to use reqsk_free() instead of reqsk_put() in TFO and syncookies' error path. However, tcp_get_cookie_sock() may set ->rsk_refcnt before freeing the socket; tcp_conn_request() may also have non-null ->rsk_refcnt because of tcp_try_fastopen(). In both cases 'req' hasn't been exposed to the outside world and is safe to free immediately, but that'd trigger the WARN_ON_ONCE in reqsk_free(). Define __reqsk_free() for these situations where we know nobody's referencing the socket, even though ->rsk_refcnt might be non-null. Now we can consolidate the error path of tcp_get_cookie_sock() and tcp_conn_request(). Signed-off-by: Guillaume Nault Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/request_sock.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/request_sock.h b/include/net/request_sock.h index 21a5243fecd1..9dfd7960d90a 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -106,10 +106,8 @@ reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener, return req; } -static inline void reqsk_free(struct request_sock *req) +static inline void __reqsk_free(struct request_sock *req) { - WARN_ON_ONCE(refcount_read(&req->rsk_refcnt) != 0); - req->rsk_ops->destructor(req); if (req->rsk_listener) sock_put(req->rsk_listener); @@ -117,6 +115,12 @@ static inline void reqsk_free(struct request_sock *req) kmem_cache_free(req->rsk_ops->slab, req); } +static inline void reqsk_free(struct request_sock *req) +{ + WARN_ON_ONCE(refcount_read(&req->rsk_refcnt) != 0); + __reqsk_free(req); +} + static inline void reqsk_put(struct request_sock *req) { if (refcount_dec_and_test(&req->rsk_refcnt)) -- cgit v1.2.3 From 03f1eccc7a69c965351e6bee41c62afa2844752f Mon Sep 17 00:00:00 2001 From: Stephen Suryaputra Date: Tue, 19 Mar 2019 12:37:12 -0400 Subject: ipv6: Add icmp_echo_ignore_multicast support for ICMPv6 IPv4 has icmp_echo_ignore_broadcast to prevent responding to broadcast pings. IPv6 needs a similar mechanism. v1->v2: - Remove NET_IPV6_ICMP_ECHO_IGNORE_MULTICAST. Signed-off-by: Stephen Suryaputra Signed-off-by: David S. Miller --- include/net/netns/ipv6.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index b028a1dc150d..e29aff15acc9 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -33,6 +33,7 @@ struct netns_sysctl_ipv6 { int auto_flowlabels; int icmpv6_time; int icmpv6_echo_ignore_all; + int icmpv6_echo_ignore_multicast; int anycast_src_echo_reply; int ip_nonlocal_bind; int fwmark_reflect; -- cgit v1.2.3 From f295b3ae9f5927e084bd5decdff82390e3471801 Mon Sep 17 00:00:00 2001 From: Vakul Garg Date: Wed, 20 Mar 2019 02:03:36 +0000 Subject: net/tls: Add support of AES128-CCM based ciphers Added support for AES128-CCM based record encryption. AES128-CCM is similar to AES128-GCM. Both of them have same salt/iv/mac size. The notable difference between the two is that while invoking AES128-CCM operation, the salt||nonce (which is passed as IV) has to be prefixed with a hardcoded value '2'. Further, CCM implementation in kernel requires IV passed in crypto_aead_request() to be full '16' bytes. Therefore, the record structure 'struct tls_rec' has been modified to reserve '16' bytes for IV. This works for both GCM and CCM based cipher. Signed-off-by: Vakul Garg Signed-off-by: David S. Miller --- include/net/tls.h | 15 +++++++++++++-- include/uapi/linux/tls.h | 15 +++++++++++++++ 2 files changed, 28 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/tls.h b/include/net/tls.h index a5a938583295..3ce71d78414c 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -60,6 +60,17 @@ #define TLS_AAD_SPACE_SIZE 13 #define TLS_DEVICE_NAME_MAX 32 +#define MAX_IV_SIZE 16 + +/* For AES-CCM, the full 16-bytes of IV is made of '4' fields of given sizes. + * + * IV[16] = b0[1] || implicit nonce[4] || explicit nonce[8] || length[3] + * + * The field 'length' is encoded in field 'b0' as '(length width - 1)'. + * Hence b0 contains (3 - 1) = 2. + */ +#define TLS_AES_CCM_IV_B0_BYTE 2 + /* * This structure defines the routines for Inline TLS driver. * The following routines are optional and filled with a @@ -123,8 +134,7 @@ struct tls_rec { struct scatterlist sg_content_type; char aad_space[TLS_AAD_SPACE_SIZE]; - u8 iv_data[TLS_CIPHER_AES_GCM_128_IV_SIZE + - TLS_CIPHER_AES_GCM_128_SALT_SIZE]; + u8 iv_data[MAX_IV_SIZE]; struct aead_request aead_req; u8 aead_req_ctx[]; }; @@ -219,6 +229,7 @@ struct tls_prot_info { u16 tag_size; u16 overhead_size; u16 iv_size; + u16 salt_size; u16 rec_seq_size; u16 aad_size; u16 tail_size; diff --git a/include/uapi/linux/tls.h b/include/uapi/linux/tls.h index 401d6f01de6a..5b9c26753e46 100644 --- a/include/uapi/linux/tls.h +++ b/include/uapi/linux/tls.h @@ -70,6 +70,13 @@ #define TLS_CIPHER_AES_GCM_256_TAG_SIZE 16 #define TLS_CIPHER_AES_GCM_256_REC_SEQ_SIZE 8 +#define TLS_CIPHER_AES_CCM_128 53 +#define TLS_CIPHER_AES_CCM_128_IV_SIZE 8 +#define TLS_CIPHER_AES_CCM_128_KEY_SIZE 16 +#define TLS_CIPHER_AES_CCM_128_SALT_SIZE 4 +#define TLS_CIPHER_AES_CCM_128_TAG_SIZE 16 +#define TLS_CIPHER_AES_CCM_128_REC_SEQ_SIZE 8 + #define TLS_SET_RECORD_TYPE 1 #define TLS_GET_RECORD_TYPE 2 @@ -94,4 +101,12 @@ struct tls12_crypto_info_aes_gcm_256 { unsigned char rec_seq[TLS_CIPHER_AES_GCM_256_REC_SEQ_SIZE]; }; +struct tls12_crypto_info_aes_ccm_128 { + struct tls_crypto_info info; + unsigned char iv[TLS_CIPHER_AES_CCM_128_IV_SIZE]; + unsigned char key[TLS_CIPHER_AES_CCM_128_KEY_SIZE]; + unsigned char salt[TLS_CIPHER_AES_CCM_128_SALT_SIZE]; + unsigned char rec_seq[TLS_CIPHER_AES_CCM_128_REC_SEQ_SIZE]; +}; + #endif /* _UAPI_LINUX_TLS_H */ -- cgit v1.2.3 From 4bd97d51a5e602ea1fbdab8c2d653513dea17115 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 20 Mar 2019 11:02:04 +0100 Subject: net: dev: rename queue selection helpers. With the following patches, we are going to use __netdev_pick_tx() in many modules. Rename it to netdev_pick_tx(), to make it clear is a public API. Also rename the existing netdev_pick_tx() to netdev_core_pick_tx(), to avoid name clashes. Suggested-by: Eric Dumazet Suggested-by: David Miller Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/linux/netdevice.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 26f69cf763f4..57cd2bdd9f78 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2152,9 +2152,9 @@ static inline void netdev_for_each_tx_queue(struct net_device *dev, &qdisc_xmit_lock_key); \ } -struct netdev_queue *netdev_pick_tx(struct net_device *dev, - struct sk_buff *skb, - struct net_device *sb_dev); +struct netdev_queue *netdev_core_pick_tx(struct net_device *dev, + struct sk_buff *skb, + struct net_device *sb_dev); /* returns the headroom that the master device needs to take in account * when forwarding to this dev -- cgit v1.2.3 From b71b5837f8711dbc4bc0424cb5c75e5921be055c Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 20 Mar 2019 11:02:05 +0100 Subject: packet: rework packet_pick_tx_queue() to use common code selection Currently packet_pick_tx_queue() is the only caller of ndo_select_queue() using a fallback argument other than netdev_pick_tx. Leveraging rx queue, we can obtain a similar queue selection behavior using core helpers. After this change, ndo_select_queue() is always invoked with netdev_pick_tx() as fallback. We can change ndo_select_queue() signature in a followup patch, dropping an indirect call per transmitted packet in some scenarios (e.g. TCP syn and XDP generic xmit) This changes slightly how af packet queue selection happens when PACKET_QDISC_BYPASS is set. It's now more similar to plan dev_queue_xmit() tacking in account both XPS and TC mapping. v1 -> v2: - rebased after helper name change RFC -> v1: - initialize sender_cpu to the expected value Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 57cd2bdd9f78..0ff28db4239f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2152,6 +2152,8 @@ static inline void netdev_for_each_tx_queue(struct net_device *dev, &qdisc_xmit_lock_key); \ } +u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, + struct net_device *sb_dev); struct netdev_queue *netdev_core_pick_tx(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev); -- cgit v1.2.3 From a350eccee5830d9a1f29e393a88dc05a15326d44 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 20 Mar 2019 11:02:06 +0100 Subject: net: remove 'fallback' argument from dev->ndo_select_queue() After the previous patch, all the callers of ndo_select_queue() provide as a 'fallback' argument netdev_pick_tx. The only exceptions are nested calls to ndo_select_queue(), which pass down the 'fallback' available in the current scope - still netdev_pick_tx. We can drop such argument and replace fallback() invocation with netdev_pick_tx(). This avoids an indirect call per xmit packet in some scenarios (TCP syn, UDP unconnected, XDP generic, pktgen) with device drivers implementing such ndo. It also clean the code a bit. Tested with ixgbe and CONFIG_FCOE=m With pktgen using queue xmit: threads vanilla patched (kpps) (kpps) 1 2334 2428 2 4166 4278 4 7895 8100 v1 -> v2: - rebased after helper's name change Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/linux/netdevice.h | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0ff28db4239f..823762291ebf 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -986,8 +986,7 @@ struct devlink; * those the driver believes to be appropriate. * * u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb, - * struct net_device *sb_dev, - * select_queue_fallback_t fallback); + * struct net_device *sb_dev); * Called to decide which queue to use when device supports multiple * transmit queues. * @@ -1268,8 +1267,7 @@ struct net_device_ops { netdev_features_t features); u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb, - struct net_device *sb_dev, - select_queue_fallback_t fallback); + struct net_device *sb_dev); void (*ndo_change_rx_flags)(struct net_device *dev, int flags); void (*ndo_set_rx_mode)(struct net_device *dev); @@ -2641,11 +2639,9 @@ void dev_close_many(struct list_head *head, bool unlink); void dev_disable_lro(struct net_device *dev); int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *newskb); u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, - struct net_device *sb_dev, - select_queue_fallback_t fallback); + struct net_device *sb_dev); u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb, - struct net_device *sb_dev, - select_queue_fallback_t fallback); + struct net_device *sb_dev); int dev_queue_xmit(struct sk_buff *skb); int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev); int dev_direct_xmit(struct sk_buff *skb, u16 queue_id); -- cgit v1.2.3 From 0b03a5ca8b14321366eec4a903922d2b46d585ff Mon Sep 17 00:00:00 2001 From: Stephen Suryaputra Date: Wed, 20 Mar 2019 10:29:27 -0400 Subject: ipv6: Add icmp_echo_ignore_anycast for ICMPv6 In addition to icmp_echo_ignore_multicast, there is a need to also prevent responding to pings to anycast addresses for security. Signed-off-by: Stephen Suryaputra Signed-off-by: David S. Miller --- include/net/netns/ipv6.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index e29aff15acc9..64e29b58bb5e 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -34,6 +34,7 @@ struct netns_sysctl_ipv6 { int icmpv6_time; int icmpv6_echo_ignore_all; int icmpv6_echo_ignore_multicast; + int icmpv6_echo_ignore_anycast; int anycast_src_echo_reply; int ip_nonlocal_bind; int fwmark_reflect; -- cgit v1.2.3 From c7a1ce397adacaf5d4bb2eab0a738b5f80dc3e43 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 21 Mar 2019 05:21:35 -0700 Subject: ipv6: Change addrconf_f6i_alloc to use ip6_route_info_create Change addrconf_f6i_alloc to generate a fib6_config and call ip6_route_info_create. addrconf_f6i_alloc is the last caller to fib6_info_alloc besides ip6_route_info_create, and there is no reason for it to do its own initialization on a fib6_info. Host routes need to be created even if the device is down, so add a new flag, fc_ignore_dev_down, to fib6_config and update fib6_nh_init to not error out if device is not up. Notes on the conversion: - ip_fib_metrics_init is the same as fib6_config has fc_mx set to NULL and fc_mx_len set to 0 - dst_nocount is handled by the RTF_ADDRCONF flag - dst_host is handled by fc_dst_len = 128 nh_gw does not get set after the conversion to ip6_route_info_create but it should not be set in addrconf_f6i_alloc since this is a host route not a gateway route. Everything else is a straight forward map between fib6_info and fib6_config. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 84097010237c..2acb78a762ee 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -50,7 +50,8 @@ struct fib6_config { u32 fc_protocol; u16 fc_type; /* only 8 bits are used */ u16 fc_delete_all_nh : 1, - __unused : 15; + fc_ignore_dev_down:1, + __unused : 14; struct in6_addr fc_dst; struct in6_addr fc_src; -- cgit v1.2.3 From 0c3e0e3bb623c3735b8c9ab8aa8332f944f83a9f Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 20 Mar 2019 12:16:42 +0300 Subject: tun: Add ioctl() TUNGETDEVNETNS cmd to allow obtaining real net ns of tun device In commit f2780d6d7475 "tun: Add ioctl() SIOCGSKNS cmd to allow obtaining net ns of tun device" it was missed that tun may change its net ns, while net ns of socket remains the same as it was created initially. SIOCGSKNS returns net ns of socket, so it is not suitable for obtaining net ns of device. We may have two tun devices with the same names in two net ns, and in this case it's not possible to determ, which of them fd refers to (TUNGETIFF will return the same name). This patch adds new ioctl() cmd for obtaining net ns of a device. Reported-by: Harald Albrecht Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- include/uapi/linux/if_tun.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h index 23a6753b37df..454ae31b93c7 100644 --- a/include/uapi/linux/if_tun.h +++ b/include/uapi/linux/if_tun.h @@ -60,6 +60,7 @@ #define TUNSETSTEERINGEBPF _IOR('T', 224, int) #define TUNSETFILTEREBPF _IOR('T', 225, int) #define TUNSETCARRIER _IOW('T', 226, int) +#define TUNGETDEVNETNS _IO('T', 227) /* TUNSETIFF ifr flags */ #define IFF_TUN 0x0001 -- cgit v1.2.3 From 9ab948a91b2c2abc8e82845c0e61f4b1683e3a4f Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 20 Mar 2019 09:18:59 -0700 Subject: ipv4: Allow amount of dirty memory from fib resizing to be controllable fib_trie implementation calls synchronize_rcu when a certain amount of pages are dirty from freed entries. The number of pages was determined experimentally in 2009 (commit c3059477fce2d). At the current setting, synchronize_rcu is called often -- 51 times in a second in one test with an average of an 8 msec delay adding a fib entry. The total impact is a lot of slow down modifying the fib. This is seen in the output of 'time' - the difference between real time and sys+user. For example, using 720,022 single path routes and 'ip -batch'[1]: $ time ./ip -batch ipv4/routes-1-hops real 0m14.214s user 0m2.513s sys 0m6.783s So roughly 35% of the actual time to install the routes is from the ip command getting scheduled out, most notably due to synchronize_rcu (this is observed using 'perf sched timehist'). This patch makes the amount of dirty memory configurable between 64k where the synchronize_rcu is called often (small, low end systems that are memory sensitive) to 64M where synchronize_rcu is called rarely during a large FIB change (for high end systems with lots of memory). The default is 512kB which corresponds to the current setting of 128 pages with a 4kB page size. As an example, at 16MB the worst interval shows 4 calls to synchronize_rcu in a second blocking for up to 30 msec in a single instance, and a total of almost 100 msec across the 4 calls in the second. The trade off is allowing FIB entries to consume more memory in a given time window but but with much better fib insertion rates (~30% increase in prefixes/sec). With this patch and net.ipv4.fib_sync_mem set to 16MB, the same batch file runs in: $ time ./ip -batch ipv4/routes-1-hops real 0m9.692s user 0m2.491s sys 0m6.769s So the dead time is reduced to about 1/2 second or <5% of the real time. [1] 'ip' modified to not request ACK messages which improves route insertion times by about 20% Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/ip.h b/include/net/ip.h index be3cad9c2e4c..aa09ae5f01a5 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -38,6 +38,10 @@ #define IPV4_MAX_PMTU 65535U /* RFC 2675, Section 5.1 */ #define IPV4_MIN_MTU 68 /* RFC 791 */ +extern unsigned int sysctl_fib_sync_mem; +extern unsigned int sysctl_fib_sync_mem_min; +extern unsigned int sysctl_fib_sync_mem_max; + struct sock; struct inet_skb_parm { -- cgit v1.2.3 From 02afc7ad45bd6cfc9fd51fdbc132455371b63469 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Wed, 20 Mar 2019 20:02:56 +0100 Subject: net: dst: remove gc leftovers Get rid of some obsolete gc-related documentation and macros that were missed in commit 5b7c9a8ff828 ("net: remove dst gc related code"). CC: Wei Wang Signed-off-by: Julian Wiedmann Acked-by: Wei Wang Signed-off-by: David S. Miller --- include/net/dst.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include') diff --git a/include/net/dst.h b/include/net/dst.h index 6cf0870414c7..12b31c602cb0 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -19,17 +19,6 @@ #include #include -#define DST_GC_MIN (HZ/10) -#define DST_GC_INC (HZ/2) -#define DST_GC_MAX (120*HZ) - -/* Each dst_entry has reference count and sits in some parent list(s). - * When it is removed from parent list, it is "freed" (dst_free). - * After this it enters dead state (dst->obsolete > 0) and if its refcnt - * is zero, it can be destroyed immediately, otherwise it is added - * to gc list and garbage collector periodically checks the refcnt. - */ - struct sk_buff; struct dst_entry { -- cgit v1.2.3 From 4feb7c7a4fbb8f63371be31cda79433c7cf3da86 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 21 Mar 2019 14:42:40 +1100 Subject: rhashtable: don't hold lock on first table throughout insertion. rhashtable_try_insert() currently holds a lock on the bucket in the first table, while also locking buckets in subsequent tables. This is unnecessary and looks like a hold-over from some earlier version of the implementation. As insert and remove always lock a bucket in each table in turn, and as insert only inserts in the final table, there cannot be any races that are not covered by simply locking a bucket in each table in turn. When an insert call reaches that last table it can be sure that there is no matchinf entry in any other table as it has searched them all, and insertion never happens anywhere but in the last table. The fact that code tests for the existence of future_tbl while holding a lock on the relevant bucket ensures that two threads inserting the same key will make compatible decisions about which is the "last" table. This simplifies the code and allows the ->rehash field to be discarded. We still need a way to ensure that a dead bucket_table is never re-linked by rhashtable_walk_stop(). This can be achieved by calling call_rcu() inside the locked region, and checking with rcu_head_after_call_rcu() in rhashtable_walk_stop() to see if the bucket table is empty and dead. Acked-by: Herbert Xu Reviewed-by: Paul E. McKenney Signed-off-by: NeilBrown Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index ae9c0f71f311..3864193d5e2e 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -63,7 +63,6 @@ struct bucket_table { unsigned int size; unsigned int nest; - unsigned int rehash; u32 hash_rnd; unsigned int locks_mask; spinlock_t *locks; @@ -776,12 +775,6 @@ static inline int rhltable_insert( * @obj: pointer to hash head inside object * @params: hash table parameters * - * Locks down the bucket chain in both the old and new table if a resize - * is in progress to ensure that writers can't remove from the old table - * and can't insert to the new table during the atomic operation of search - * and insertion. Searches for duplicates in both the old and new table if - * a resize is in progress. - * * This lookup function may only be used for fixed key hash table (key_len * parameter set). It will BUG() if used inappropriately. * @@ -837,12 +830,6 @@ static inline void *rhashtable_lookup_get_insert_fast( * @obj: pointer to hash head inside object * @params: hash table parameters * - * Locks down the bucket chain in both the old and new table if a resize - * is in progress to ensure that writers can't remove from the old table - * and can't insert to the new table during the atomic operation of search - * and insertion. Searches for duplicates in both the old and new table if - * a resize is in progress. - * * Lookups may occur in parallel with hashtable mutations and resizing. * * Will trigger an automatic deferred table resizing if residency in the -- cgit v1.2.3 From f7ad68bf98506f48129267438ada1255fc4edfa2 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 21 Mar 2019 14:42:40 +1100 Subject: rhashtable: rename rht_for_each*continue as *from. The pattern set by list.h is that for_each..continue() iterators start at the next entry after the given one, while for_each..from() iterators start at the given entry. The rht_for_each*continue() iterators are documented as though the start at the 'next' entry, but actually start at the given entry, and they are used expecting that behaviour. So fix the documentation and change the names to *from for consistency with list.h Acked-by: Herbert Xu Acked-by: Miguel Ojeda Signed-off-by: NeilBrown Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 3864193d5e2e..86dfa417848d 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -306,13 +306,13 @@ static inline struct rhash_head __rcu **rht_bucket_insert( } /** - * rht_for_each_continue - continue iterating over hash chain + * rht_for_each_from - iterate over hash chain from given head * @pos: the &struct rhash_head to use as a loop cursor. - * @head: the previous &struct rhash_head to continue from + * @head: the &struct rhash_head to start from * @tbl: the &struct bucket_table * @hash: the hash value / bucket index */ -#define rht_for_each_continue(pos, head, tbl, hash) \ +#define rht_for_each_from(pos, head, tbl, hash) \ for (pos = rht_dereference_bucket(head, tbl, hash); \ !rht_is_a_nulls(pos); \ pos = rht_dereference_bucket((pos)->next, tbl, hash)) @@ -324,18 +324,18 @@ static inline struct rhash_head __rcu **rht_bucket_insert( * @hash: the hash value / bucket index */ #define rht_for_each(pos, tbl, hash) \ - rht_for_each_continue(pos, *rht_bucket(tbl, hash), tbl, hash) + rht_for_each_from(pos, *rht_bucket(tbl, hash), tbl, hash) /** - * rht_for_each_entry_continue - continue iterating over hash chain + * rht_for_each_entry_from - iterate over hash chain from given head * @tpos: the type * to use as a loop cursor. * @pos: the &struct rhash_head to use as a loop cursor. - * @head: the previous &struct rhash_head to continue from + * @head: the &struct rhash_head to start from * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * @member: name of the &struct rhash_head within the hashable struct. */ -#define rht_for_each_entry_continue(tpos, pos, head, tbl, hash, member) \ +#define rht_for_each_entry_from(tpos, pos, head, tbl, hash, member) \ for (pos = rht_dereference_bucket(head, tbl, hash); \ (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ pos = rht_dereference_bucket((pos)->next, tbl, hash)) @@ -349,7 +349,7 @@ static inline struct rhash_head __rcu **rht_bucket_insert( * @member: name of the &struct rhash_head within the hashable struct. */ #define rht_for_each_entry(tpos, pos, tbl, hash, member) \ - rht_for_each_entry_continue(tpos, pos, *rht_bucket(tbl, hash), \ + rht_for_each_entry_from(tpos, pos, *rht_bucket(tbl, hash), \ tbl, hash, member) /** @@ -374,9 +374,9 @@ static inline struct rhash_head __rcu **rht_bucket_insert( rht_dereference_bucket(pos->next, tbl, hash) : NULL) /** - * rht_for_each_rcu_continue - continue iterating over rcu hash chain + * rht_for_each_rcu_from - iterate over rcu hash chain from given head * @pos: the &struct rhash_head to use as a loop cursor. - * @head: the previous &struct rhash_head to continue from + * @head: the &struct rhash_head to start from * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * @@ -384,7 +384,7 @@ static inline struct rhash_head __rcu **rht_bucket_insert( * the _rcu mutation primitives such as rhashtable_insert() as long as the * traversal is guarded by rcu_read_lock(). */ -#define rht_for_each_rcu_continue(pos, head, tbl, hash) \ +#define rht_for_each_rcu_from(pos, head, tbl, hash) \ for (({barrier(); }), \ pos = rht_dereference_bucket_rcu(head, tbl, hash); \ !rht_is_a_nulls(pos); \ @@ -401,13 +401,13 @@ static inline struct rhash_head __rcu **rht_bucket_insert( * traversal is guarded by rcu_read_lock(). */ #define rht_for_each_rcu(pos, tbl, hash) \ - rht_for_each_rcu_continue(pos, *rht_bucket(tbl, hash), tbl, hash) + rht_for_each_rcu_from(pos, *rht_bucket(tbl, hash), tbl, hash) /** - * rht_for_each_entry_rcu_continue - continue iterating over rcu hash chain + * rht_for_each_entry_rcu_from - iterated over rcu hash chain from given head * @tpos: the type * to use as a loop cursor. * @pos: the &struct rhash_head to use as a loop cursor. - * @head: the previous &struct rhash_head to continue from + * @head: the &struct rhash_head to start from * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * @member: name of the &struct rhash_head within the hashable struct. @@ -416,7 +416,7 @@ static inline struct rhash_head __rcu **rht_bucket_insert( * the _rcu mutation primitives such as rhashtable_insert() as long as the * traversal is guarded by rcu_read_lock(). */ -#define rht_for_each_entry_rcu_continue(tpos, pos, head, tbl, hash, member) \ +#define rht_for_each_entry_rcu_from(tpos, pos, head, tbl, hash, member) \ for (({barrier(); }), \ pos = rht_dereference_bucket_rcu(head, tbl, hash); \ (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ @@ -435,7 +435,7 @@ static inline struct rhash_head __rcu **rht_bucket_insert( * traversal is guarded by rcu_read_lock(). */ #define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member) \ - rht_for_each_entry_rcu_continue(tpos, pos, *rht_bucket(tbl, hash), \ + rht_for_each_entry_rcu_from(tpos, pos, *rht_bucket(tbl, hash), \ tbl, hash, member) /** @@ -491,7 +491,7 @@ restart: hash = rht_key_hashfn(ht, tbl, key, params); head = rht_bucket(tbl, hash); do { - rht_for_each_rcu_continue(he, *head, tbl, hash) { + rht_for_each_rcu_from(he, *head, tbl, hash) { if (params.obj_cmpfn ? params.obj_cmpfn(&arg, rht_obj(ht, he)) : rhashtable_compare(&arg, rht_obj(ht, he))) @@ -625,7 +625,7 @@ slow_path: if (!pprev) goto out; - rht_for_each_continue(head, *pprev, tbl, hash) { + rht_for_each_from(head, *pprev, tbl, hash) { struct rhlist_head *plist; struct rhlist_head *list; @@ -890,7 +890,7 @@ static inline int __rhashtable_remove_fast_one( spin_lock_bh(lock); pprev = rht_bucket_var(tbl, hash); - rht_for_each_continue(he, *pprev, tbl, hash) { + rht_for_each_from(he, *pprev, tbl, hash) { struct rhlist_head *list; list = container_of(he, struct rhlist_head, rhead); @@ -1042,7 +1042,7 @@ static inline int __rhashtable_replace_fast( spin_lock_bh(lock); pprev = rht_bucket_var(tbl, hash); - rht_for_each_continue(he, *pprev, tbl, hash) { + rht_for_each_from(he, *pprev, tbl, hash) { if (he != obj_old) { pprev = &he->next; continue; -- cgit v1.2.3 From 85a51f8c28b9812642d76db6889f3f39dc3fbab3 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Fri, 22 Mar 2019 09:54:00 +0800 Subject: bpf: allow helpers to return PTR_TO_SOCK_COMMON It's currently not possible to access timewait or request sockets from eBPF, since there is no way to return a PTR_TO_SOCK_COMMON from a helper. Introduce RET_PTR_TO_SOCK_COMMON to enable this behaviour. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f02367faa58d..f62897198844 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -205,6 +205,7 @@ enum bpf_return_type { RET_PTR_TO_MAP_VALUE_OR_NULL, /* returns a pointer to map elem value or NULL */ RET_PTR_TO_SOCKET_OR_NULL, /* returns a pointer to a socket or NULL */ RET_PTR_TO_TCP_SOCK_OR_NULL, /* returns a pointer to a tcp_sock or NULL */ + RET_PTR_TO_SOCK_COMMON_OR_NULL, /* returns a pointer to a sock_common or NULL */ }; /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs -- cgit v1.2.3 From edbf8c01de5a104a71ed6df2bf6421ceb2836a8e Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Fri, 22 Mar 2019 09:54:01 +0800 Subject: bpf: add skc_lookup_tcp helper Allow looking up a sock_common. This gives eBPF programs access to timewait and request sockets. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 929c8e537a14..fab05317f5e7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2431,6 +2431,23 @@ union bpf_attr { * Return * A **struct bpf_sock** pointer on success, or **NULL** in * case of failure. + * + * struct bpf_sock *bpf_skc_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) + * Description + * Look for TCP socket matching *tuple*, optionally in a child + * network namespace *netns*. The return value must be checked, + * and if non-**NULL**, released via **bpf_sk_release**\ (). + * + * This function is identical to bpf_sk_lookup_tcp, except that it + * also returns timewait or request sockets. Use bpf_sk_fullsock + * or bpf_tcp_socket to access the full structure. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_NET** configuration option. + * Return + * Pointer to **struct bpf_sock**, or **NULL** in case of failure. + * For sockets with reuseport option, the **struct bpf_sock** + * result is from **reuse->socks**\ [] using the hash of the tuple. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2531,7 +2548,8 @@ union bpf_attr { FN(sk_fullsock), \ FN(tcp_sock), \ FN(skb_ecn_set_ce), \ - FN(get_listener_sock), + FN(get_listener_sock), \ + FN(skc_lookup_tcp), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From 399040847084a69f345e0a52fd62f04654e0fce3 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Fri, 22 Mar 2019 09:54:02 +0800 Subject: bpf: add helper to check for a valid SYN cookie Using bpf_skc_lookup_tcp it's possible to ascertain whether a packet belongs to a known connection. However, there is one corner case: no sockets are created if SYN cookies are active. This means that the final ACK in the 3WHS is misclassified. Using the helper, we can look up the listening socket via bpf_skc_lookup_tcp and then check whether a packet is a valid SYN cookie ACK. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index fab05317f5e7..3c04410137d9 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2448,6 +2448,21 @@ union bpf_attr { * Pointer to **struct bpf_sock**, or **NULL** in case of failure. * For sockets with reuseport option, the **struct bpf_sock** * result is from **reuse->socks**\ [] using the hash of the tuple. + * + * int bpf_tcp_check_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) + * Description + * Check whether iph and th contain a valid SYN cookie ACK for + * the listening socket in sk. + * + * iph points to the start of the IPv4 or IPv6 header, while + * iph_len contains sizeof(struct iphdr) or sizeof(struct ip6hdr). + * + * th points to the start of the TCP header, while th_len contains + * sizeof(struct tcphdr). + * + * Return + * 0 if iph and th are a valid SYN cookie ACK, or a negative error + * otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2549,7 +2564,8 @@ union bpf_attr { FN(tcp_sock), \ FN(skb_ecn_set_ce), \ FN(get_listener_sock), \ - FN(skc_lookup_tcp), + FN(skc_lookup_tcp), \ + FN(tcp_check_syncookie), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From 3b0f31f2b8c9fb348e4530b88f6b64f9621f83d6 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 21 Mar 2019 22:51:02 +0100 Subject: genetlink: make policy common to family Since maxattr is common, the policy can't really differ sanely, so make it common as well. The only user that did in fact manage to make a non-common policy is taskstats, which has to be really careful about it (since it's still using a common maxattr!). This is no longer supported, but we can fake it using pre_doit. This reduces the size of e.g. nl80211.o (which has lots of commands): text data bss dec hex filename 398745 14323 2240 415308 6564c net/wireless/nl80211.o (before) 397913 14331 2240 414484 65314 net/wireless/nl80211.o (after) -------------------------------- -832 +8 0 -824 Which is obviously just 8 bytes for each command, and an added 8 bytes for the new policy pointer. I'm not sure why the ops list is counted as .text though. Most of the code transformations were done using the following spatch: @ops@ identifier OPS; expression POLICY; @@ struct genl_ops OPS[] = { ..., { - .policy = POLICY, }, ... }; @@ identifier ops.OPS; expression ops.POLICY; identifier fam; expression M; @@ struct genl_family fam = { .ops = OPS, .maxattr = M, + .policy = POLICY, ... }; This also gets rid of devlink_nl_cmd_region_read_dumpit() accessing the cb->data as ops, which we want to change in a later genl patch. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- include/linux/genl_magic_func.h | 4 ++-- include/net/genetlink.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/genl_magic_func.h b/include/linux/genl_magic_func.h index 83f81ac53282..6cb82301d8e9 100644 --- a/include/linux/genl_magic_func.h +++ b/include/linux/genl_magic_func.h @@ -233,7 +233,6 @@ const char *CONCAT_(GENL_MAGIC_FAMILY, _genl_cmd_to_str)(__u8 cmd) { \ handler \ .cmd = op_name, \ - .policy = CONCAT_(GENL_MAGIC_FAMILY, _tla_nl_policy), \ }, #define ZZZ_genl_ops CONCAT_(GENL_MAGIC_FAMILY, _genl_ops) @@ -290,7 +289,8 @@ static struct genl_family ZZZ_genl_family __ro_after_init = { #ifdef GENL_MAGIC_FAMILY_HDRSZ .hdrsize = NLA_ALIGN(GENL_MAGIC_FAMILY_HDRSZ), #endif - .maxattr = ARRAY_SIZE(drbd_tla_nl_policy)-1, + .maxattr = ARRAY_SIZE(CONCAT_(GENL_MAGIC_FAMILY, _tla_nl_policy))-1, + .policy = CONCAT_(GENL_MAGIC_FAMILY, _tla_nl_policy), .ops = ZZZ_genl_ops, .n_ops = ARRAY_SIZE(ZZZ_genl_ops), .mcgrps = ZZZ_genl_mcgrps, diff --git a/include/net/genetlink.h b/include/net/genetlink.h index aa2e5888f18d..6850c7b1a3a6 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -26,6 +26,7 @@ struct genl_info; * @name: name of family * @version: protocol version * @maxattr: maximum number of attributes supported + * @policy: netlink policy * @netnsok: set to true if the family can handle network * namespaces and should be presented in all of them * @parallel_ops: operations can be called in parallel and aren't @@ -56,6 +57,7 @@ struct genl_family { unsigned int maxattr; bool netnsok; bool parallel_ops; + const struct nla_policy *policy; int (*pre_doit)(const struct genl_ops *ops, struct sk_buff *skb, struct genl_info *info); @@ -124,14 +126,12 @@ static inline int genl_err_attr(struct genl_info *info, int err, * @cmd: command identifier * @internal_flags: flags used by the family * @flags: flags - * @policy: attribute validation policy * @doit: standard command callback * @start: start callback for dumps * @dumpit: callback for dumpers * @done: completion callback for dumps */ struct genl_ops { - const struct nla_policy *policy; int (*doit)(struct sk_buff *skb, struct genl_info *info); int (*start)(struct netlink_callback *cb); -- cgit v1.2.3 From 974eff2b5793eeaa2eb433bca7eba9640d890c4a Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Thu, 21 Mar 2019 15:51:36 -0700 Subject: net: Move the definition of the default Geneve udp port to public header file Move the definition of the default Geneve udp port from the geneve source to the header file, so we can re-use it from drivers. Modify existing drivers to use it. Signed-off-by: Moshe Shemesh Reviewed-by: Or Gerlitz Cc: John Hurley Cc: Jakub Kicinski Reviewed-by: Tariq Toukan Acked-by: Jakub Kicinski Signed-off-by: Saeed Mahameed --- include/net/geneve.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/geneve.h b/include/net/geneve.h index fc6a7e0a874a..bced0b1d9fe4 100644 --- a/include/net/geneve.h +++ b/include/net/geneve.h @@ -4,6 +4,8 @@ #include +#define GENEVE_UDP_PORT 6081 + /* Geneve Header: * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |Ver| Opt Len |O|C| Rsvd. | Protocol Type | -- cgit v1.2.3 From bea964107fa78ffe484ef8659ecc26f9ae2bcd2f Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Thu, 21 Mar 2019 15:51:39 -0700 Subject: net: Add IANA_VXLAN_UDP_PORT definition to vxlan header file Added IANA_VXLAN_UDP_PORT (4789) definition to vxlan header file so it can be used by drivers instead of local definition. Updated drivers which locally defined it as 4789 to use it. Signed-off-by: Moshe Shemesh Reviewed-by: Or Gerlitz Cc: John Hurley Cc: Jakub Kicinski Cc: Yunsheng Lin Cc: Peng Li Reviewed-by: Tariq Toukan Acked-by: Jakub Kicinski Signed-off-by: Saeed Mahameed --- include/net/vxlan.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 00254a58824b..83b5999a2587 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -8,6 +8,8 @@ #include #include +#define IANA_VXLAN_UDP_PORT 4789 + /* VXLAN protocol (RFC 7348) header: * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |R|R|R|R|I|R|R|R| Reserved | -- cgit v1.2.3 From 0eb69bb9962973f4852bb35b8151332c98741770 Mon Sep 17 00:00:00 2001 From: Eli Britstein Date: Thu, 21 Mar 2019 15:51:40 -0700 Subject: net/mlx5e: Add VLAN ID rewrite fields Add VLAN ID rewrite fields as a pre-step to support this rewrite. Signed-off-by: Eli Britstein Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 3b83288749c6..b0e17c94566c 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -5110,6 +5110,7 @@ enum { MLX5_ACTION_IN_FIELD_OUT_DIPV6_31_0 = 0x14, MLX5_ACTION_IN_FIELD_OUT_SIPV4 = 0x15, MLX5_ACTION_IN_FIELD_OUT_DIPV4 = 0x16, + MLX5_ACTION_IN_FIELD_OUT_FIRST_VID = 0x17, MLX5_ACTION_IN_FIELD_OUT_IPV6_HOPLIMIT = 0x47, }; -- cgit v1.2.3 From 14aa31929b724b70fb63a9b0e7877da325b25cfe Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 22 Mar 2019 14:32:54 -0400 Subject: bpf: add bpf_skb_adjust_room mode BPF_ADJ_ROOM_MAC bpf_skb_adjust_room net allows inserting room in an skb. Existing mode BPF_ADJ_ROOM_NET inserts room after the network header by pulling the skb, moving the network header forward and zeroing the new space. Add new mode BPF_ADJUST_ROOM_MAC that inserts room after the mac header. This allows inserting tunnel headers in front of the network header without having to recreate the network header in the original space, avoiding two copies. Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3c04410137d9..7c8fd0647070 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1478,7 +1478,10 @@ union bpf_attr { * Grow or shrink the room for data in the packet associated to * *skb* by *len_diff*, and according to the selected *mode*. * - * There is a single supported mode at this time: + * There are two supported modes at this time: + * + * * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer + * (room space is added or removed below the layer 2 header). * * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer * (room space is added or removed below the layer 3 header). @@ -2627,6 +2630,7 @@ enum bpf_func_id { /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, + BPF_ADJ_ROOM_MAC, }; /* Mode for BPF_FUNC_skb_load_bytes_relative helper. */ -- cgit v1.2.3 From 2278f6cc151a8bef6ba0b3fe3009d14dc3c51c4a Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 22 Mar 2019 14:32:55 -0400 Subject: bpf: add bpf_skb_adjust_room flag BPF_F_ADJ_ROOM_FIXED_GSO bpf_skb_adjust_room adjusts gso_size of gso packets to account for the pushed or popped header room. This is not allowed with UDP, where gso_size delineates datagrams. Add an option to avoid these updates and allow this call for datagrams. It can also be used with TCP, when MSS is known to allow headroom, e.g., through MSS clamping or route MTU. Changes v1->v2: - document flag BPF_F_ADJ_ROOM_FIXED_GSO - do not expose BPF_F_ADJ_ROOM_MASK through uapi, as it may change. Link: https://patchwork.ozlabs.org/patch/1052497/ Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7c8fd0647070..4f157d0ec571 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1486,8 +1486,10 @@ union bpf_attr { * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer * (room space is added or removed below the layer 3 header). * - * All values for *flags* are reserved for future usage, and must - * be left at zero. + * There is one supported flag at this time: + * + * * **BPF_F_ADJ_ROOM_FIXED_GSO**: Do not adjust gso_size. + * Adjusting mss in this way is not allowed for datagrams. * * A call to this helper is susceptible to change the underlaying * packet buffer. Therefore, at load time, all checks on pointers @@ -2627,6 +2629,9 @@ enum bpf_func_id { /* Current network namespace */ #define BPF_F_CURRENT_NETNS (-1L) +/* BPF_FUNC_skb_adjust_room flags. */ +#define BPF_F_ADJ_ROOM_FIXED_GSO (1ULL << 0) + /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, -- cgit v1.2.3 From 868d523535c2d00b696753ece606e641a816e91e Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 22 Mar 2019 14:32:56 -0400 Subject: bpf: add bpf_skb_adjust_room encap flags When pushing tunnel headers, annotate skbs in the same way as tunnel devices. For GSO packets, the network stack requires certain fields set to segment packets with tunnel headers. gro_gse_segment depends on transport and inner mac header, for instance. Add an option to pass this information. Remove the restriction on len_diff to network header length, which is too short, e.g., for GRE protocols. Changes v1->v2: - document new flags - BPF_F_ADJ_ROOM_MASK moved v2->v3: - BPF_F_ADJ_ROOM_ENCAP_L3_MASK moved Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4f157d0ec571..837024512baf 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1486,11 +1486,20 @@ union bpf_attr { * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer * (room space is added or removed below the layer 3 header). * - * There is one supported flag at this time: + * The following flags are supported at this time: * * * **BPF_F_ADJ_ROOM_FIXED_GSO**: Do not adjust gso_size. * Adjusting mss in this way is not allowed for datagrams. * + * * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 **: + * * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 **: + * Any new space is reserved to hold a tunnel header. + * Configure skb offsets and other fields accordingly. + * + * * **BPF_F_ADJ_ROOM_ENCAP_L4_GRE **: + * * **BPF_F_ADJ_ROOM_ENCAP_L4_UDP **: + * Use with ENCAP_L3 flags to further specify the tunnel type. + * * A call to this helper is susceptible to change the underlaying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be @@ -2632,6 +2641,11 @@ enum bpf_func_id { /* BPF_FUNC_skb_adjust_room flags. */ #define BPF_F_ADJ_ROOM_FIXED_GSO (1ULL << 0) +#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 (1ULL << 1) +#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 (1ULL << 2) +#define BPF_F_ADJ_ROOM_ENCAP_L4_GRE (1ULL << 3) +#define BPF_F_ADJ_ROOM_ENCAP_L4_UDP (1ULL << 4) + /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, -- cgit v1.2.3 From 576fd2f7cac3daa36025f0039f9e7cb75b4b4ae0 Mon Sep 17 00:00:00 2001 From: Soheil Hassas Yeganeh Date: Fri, 22 Mar 2019 10:59:47 -0400 Subject: tcp: add documentation for tcp_ca_state Add documentation to the tcp_ca_state enum, since this enum is exposed in uapi. Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Cc: Sowmini Varadhan Acked-by: Sowmini Varadhan Signed-off-by: David S. Miller --- include/uapi/linux/tcp.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 8bb6cc5f3235..b521464ea962 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -160,15 +160,42 @@ enum { #define TCPI_OPT_ECN_SEEN 16 /* we received at least one packet with ECT */ #define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */ +/* + * Sender's congestion state indicating normal or abnormal situations + * in the last round of packets sent. The state is driven by the ACK + * information and timer events. + */ enum tcp_ca_state { + /* + * Nothing bad has been observed recently. + * No apparent reordering, packet loss, or ECN marks. + */ TCP_CA_Open = 0, #define TCPF_CA_Open (1< Date: Fri, 22 Mar 2019 16:01:55 +0100 Subject: net: sched: add empty status flag for NOLOCK qdisc The queue is marked not empty after acquiring the seqlock, and it's up to the NOLOCK qdisc clearing such flag on dequeue. Since the empty status lays on the same cache-line of the seqlock, it's always hot on cache during the updates. This makes the empty flag update a little bit loosy. Given the lack of synchronization between enqueue and dequeue, this is unavoidable. v2 -> v3: - qdisc_is_empty() has a const argument (Eric) v1 -> v2: - use really an 'empty' flag instead of 'not_empty', as suggested by Eric Signed-off-by: Paolo Abeni Reviewed-by: Eric Dumazet Reviewed-by: Ivan Vecera Signed-off-by: David S. Miller --- include/net/sch_generic.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 31284c078d06..e227475e78ca 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -113,6 +113,9 @@ struct Qdisc { spinlock_t busylock ____cacheline_aligned_in_smp; spinlock_t seqlock; + + /* for NOLOCK qdisc, true if there are no enqueued skbs */ + bool empty; struct rcu_head rcu; }; @@ -143,11 +146,19 @@ static inline bool qdisc_is_running(struct Qdisc *qdisc) return (raw_read_seqcount(&qdisc->running) & 1) ? true : false; } +static inline bool qdisc_is_empty(const struct Qdisc *qdisc) +{ + if (qdisc->flags & TCQ_F_NOLOCK) + return qdisc->empty; + return !qdisc->q.qlen; +} + static inline bool qdisc_run_begin(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_NOLOCK) { if (!spin_trylock(&qdisc->seqlock)) return false; + qdisc->empty = false; } else if (qdisc_is_running(qdisc)) { return false; } -- cgit v1.2.3 From dc05360fee660a9dbe59824b3f7896534210432b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 22 Mar 2019 08:56:38 -0700 Subject: net: convert rps_needed and rfs_needed to new static branch api We prefer static_branch_unlikely() over static_key_false() these days. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ++-- include/net/sock.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 823762291ebf..166fdc0a78b4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -194,8 +194,8 @@ struct net_device_stats { #ifdef CONFIG_RPS #include -extern struct static_key rps_needed; -extern struct static_key rfs_needed; +extern struct static_key_false rps_needed; +extern struct static_key_false rfs_needed; #endif struct neighbour; diff --git a/include/net/sock.h b/include/net/sock.h index 8de5ee258b93..fecdf639225c 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -966,7 +966,7 @@ static inline void sock_rps_record_flow_hash(__u32 hash) static inline void sock_rps_record_flow(const struct sock *sk) { #ifdef CONFIG_RPS - if (static_key_false(&rfs_needed)) { + if (static_branch_unlikely(&rfs_needed)) { /* Reading sk->sk_rxhash might incur an expensive cache line * miss. * -- cgit v1.2.3 From 472c2e07eef045145bc1493cc94a01c87140780a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 22 Mar 2019 08:56:39 -0700 Subject: tcp: add one skb cache for tx On hosts with a lot of cores, RPC workloads suffer from heavy contention on slab spinlocks. 20.69% [kernel] [k] queued_spin_lock_slowpath 5.64% [kernel] [k] _raw_spin_lock 3.83% [kernel] [k] syscall_return_via_sysret 3.48% [kernel] [k] __entry_text_start 1.76% [kernel] [k] __netif_receive_skb_core 1.64% [kernel] [k] __fget For each sendmsg(), we allocate one skb, and free it at the time ACK packet comes. In many cases, ACK packets are handled by another cpus, and this unfortunately incurs heavy costs for slab layer. This patch uses an extra pointer in socket structure, so that we try to reuse the same skb and avoid these expensive costs. We cache at most one skb per socket so this should be safe as far as memory pressure is concerned. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/sock.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index fecdf639225c..314c47a8f5d1 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -414,6 +414,7 @@ struct sock { struct sk_buff *sk_send_head; struct rb_root tcp_rtx_queue; }; + struct sk_buff *sk_tx_skb_cache; struct sk_buff_head sk_write_queue; __s32 sk_peek_off; int sk_write_pending; @@ -1463,6 +1464,10 @@ static inline void sk_mem_uncharge(struct sock *sk, int size) static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb) { + if (!sk->sk_tx_skb_cache) { + sk->sk_tx_skb_cache = skb; + return; + } sock_set_flag(sk, SOCK_QUEUE_SHRUNK); sk->sk_wmem_queued -= skb->truesize; sk_mem_uncharge(sk, skb->truesize); -- cgit v1.2.3 From 8b27dae5a2e89a61c46c6dbc76c040c0e6d0ed4c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 22 Mar 2019 08:56:40 -0700 Subject: tcp: add one skb cache for rx Often times, recvmsg() system calls and BH handling for a particular TCP socket are done on different cpus. This means the incoming skb had to be allocated on a cpu, but freed on another. This incurs a high spinlock contention in slab layer for small rpc, but also a high number of cache line ping pongs for larger packets. A full size GRO packet might use 45 page fragments, meaning that up to 45 put_page() can be involved. More over performing the __kfree_skb() in the recvmsg() context adds a latency for user applications, and increase probability of trapping them in backlog processing, since the BH handler might found the socket owned by the user. This patch, combined with the prior one increases the rpc performance by about 10 % on servers with large number of cores. (tcp_rr workload with 10,000 flows and 112 threads reach 9 Mpps instead of 8 Mpps) This also increases single bulk flow performance on 40Gbit+ links, since in this case there are often two cpus working in tandem : - CPU handling the NIC rx interrupts, feeding the receive queue, and (after this patch) freeing the skbs that were consumed. - CPU in recvmsg() system call, essentially 100 % busy copying out data to user space. Having at most one skb in a per-socket cache has very little risk of memory exhaustion, and since it is protected by socket lock, its management is essentially free. Note that if rps/rfs is used, we do not enable this feature, because there is high chance that the same cpu is handling both the recvmsg() system call and the TCP rx path, but that another cpu did the skb allocations in the device driver right before the RPS/RFS logic. To properly handle this case, it seems we would need to record on which cpu skb was allocated, and use a different channel to give skbs back to this cpu. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/net/sock.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 314c47a8f5d1..577d91fb5626 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -368,6 +368,7 @@ struct sock { atomic_t sk_drops; int sk_rcvlowat; struct sk_buff_head sk_error_queue; + struct sk_buff *sk_rx_skb_cache; struct sk_buff_head sk_receive_queue; /* * The backlog queue is special, it is always used with @@ -2438,6 +2439,15 @@ static inline void skb_setup_tx_timestamp(struct sk_buff *skb, __u16 tsflags) static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb) { __skb_unlink(skb, &sk->sk_receive_queue); + if ( +#ifdef CONFIG_RPS + !static_branch_unlikely(&rps_needed) && +#endif + !sk->sk_rx_skb_cache) { + sk->sk_rx_skb_cache = skb; + skb_orphan(skb); + return; + } __kfree_skb(skb); } -- cgit v1.2.3 From b8f975545cdbcc316cf20e827e7966d4410b5c5a Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sun, 24 Mar 2019 11:14:37 +0100 Subject: net: devlink: add port type spinlock Add spinlock to protect port type and type_dev pointer consistency. Without that, userspace may see inconsistent type and type_dev combinations. Signed-off-by: Jiri Pirko v1->v2: - rebased Signed-off-by: David S. Miller --- include/net/devlink.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 63de99e09f04..cb9b060033e1 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -53,6 +54,9 @@ struct devlink_port { struct devlink *devlink; unsigned index; bool registered; + spinlock_t type_lock; /* Protects type and type_dev + * pointer consistency. + */ enum devlink_port_type type; enum devlink_port_type desired_type; void *type_dev; -- cgit v1.2.3 From f6b19b354d50c5ae46ad66b5273f92e563fbc847 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sun, 24 Mar 2019 11:14:38 +0100 Subject: net: devlink: select NET_DEVLINK from drivers Some drivers are becoming more dependent on NET_DEVLINK being selected in configuration. With upcoming compat functions, the behavior would be wrong in case devlink was not compiled in. So make the drivers select NET_DEVLINK and rely on the functions being there, not just stubs. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 495 +------------------------------------------------- 1 file changed, 3 insertions(+), 492 deletions(-) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index cb9b060033e1..03fb16f4fb6c 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -549,17 +549,13 @@ static inline struct devlink *priv_to_devlink(void *priv) static inline struct devlink *netdev_to_devlink(struct net_device *dev) { -#if IS_ENABLED(CONFIG_NET_DEVLINK) if (dev->netdev_ops->ndo_get_devlink) return dev->netdev_ops->ndo_get_devlink(dev); -#endif return NULL; } struct ib_device; -#if IS_ENABLED(CONFIG_NET_DEVLINK) - struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size); int devlink_register(struct devlink *devlink, struct device *dev); void devlink_unregister(struct devlink *devlink); @@ -728,500 +724,14 @@ void devlink_health_reporter_state_update(struct devlink_health_reporter *reporter, enum devlink_health_reporter_state state); +#if IS_ENABLED(CONFIG_NET_DEVLINK) + void devlink_compat_running_version(struct net_device *dev, char *buf, size_t len); int devlink_compat_flash_update(struct net_device *dev, const char *file_name); #else -static inline struct devlink *devlink_alloc(const struct devlink_ops *ops, - size_t priv_size) -{ - return kzalloc(sizeof(struct devlink) + priv_size, GFP_KERNEL); -} - -static inline int devlink_register(struct devlink *devlink, struct device *dev) -{ - return 0; -} - -static inline void devlink_unregister(struct devlink *devlink) -{ -} - -static inline void devlink_params_publish(struct devlink *devlink) -{ -} - -static inline void devlink_params_unpublish(struct devlink *devlink) -{ -} - -static inline void devlink_free(struct devlink *devlink) -{ - kfree(devlink); -} - -static inline int devlink_port_register(struct devlink *devlink, - struct devlink_port *devlink_port, - unsigned int port_index) -{ - return 0; -} - -static inline void devlink_port_unregister(struct devlink_port *devlink_port) -{ -} - -static inline void devlink_port_type_eth_set(struct devlink_port *devlink_port, - struct net_device *netdev) -{ -} - -static inline void devlink_port_type_ib_set(struct devlink_port *devlink_port, - struct ib_device *ibdev) -{ -} - -static inline void devlink_port_type_clear(struct devlink_port *devlink_port) -{ -} - -static inline void devlink_port_attrs_set(struct devlink_port *devlink_port, - enum devlink_port_flavour flavour, - u32 port_number, bool split, - u32 split_subport_number) -{ -} - -static inline int -devlink_port_get_phys_port_name(struct devlink_port *devlink_port, - char *name, size_t len) -{ - return -EOPNOTSUPP; -} - -static inline int devlink_sb_register(struct devlink *devlink, - unsigned int sb_index, u32 size, - u16 ingress_pools_count, - u16 egress_pools_count, - u16 ingress_tc_count, - u16 egress_tc_count) -{ - return 0; -} - -static inline void devlink_sb_unregister(struct devlink *devlink, - unsigned int sb_index) -{ -} - -static inline int -devlink_dpipe_table_register(struct devlink *devlink, - const char *table_name, - struct devlink_dpipe_table_ops *table_ops, - void *priv, bool counter_control_extern) -{ - return 0; -} - -static inline void devlink_dpipe_table_unregister(struct devlink *devlink, - const char *table_name) -{ -} - -static inline int devlink_dpipe_headers_register(struct devlink *devlink, - struct devlink_dpipe_headers * - dpipe_headers) -{ - return 0; -} - -static inline void devlink_dpipe_headers_unregister(struct devlink *devlink) -{ -} - -static inline bool devlink_dpipe_table_counter_enabled(struct devlink *devlink, - const char *table_name) -{ - return false; -} - -static inline int -devlink_dpipe_entry_ctx_prepare(struct devlink_dpipe_dump_ctx *dump_ctx) -{ - return 0; -} - -static inline int -devlink_dpipe_entry_ctx_append(struct devlink_dpipe_dump_ctx *dump_ctx, - struct devlink_dpipe_entry *entry) -{ - return 0; -} - -static inline int -devlink_dpipe_entry_ctx_close(struct devlink_dpipe_dump_ctx *dump_ctx) -{ - return 0; -} - -static inline void -devlink_dpipe_entry_clear(struct devlink_dpipe_entry *entry) -{ -} - -static inline int -devlink_dpipe_action_put(struct sk_buff *skb, - struct devlink_dpipe_action *action) -{ - return 0; -} - -static inline int -devlink_dpipe_match_put(struct sk_buff *skb, - struct devlink_dpipe_match *match) -{ - return 0; -} - -static inline int -devlink_resource_register(struct devlink *devlink, - const char *resource_name, - u64 resource_size, - u64 resource_id, - u64 parent_resource_id, - const struct devlink_resource_size_params *size_params) -{ - return 0; -} - -static inline void -devlink_resources_unregister(struct devlink *devlink, - struct devlink_resource *resource) -{ -} - -static inline int -devlink_resource_size_get(struct devlink *devlink, u64 resource_id, - u64 *p_resource_size) -{ - return -EOPNOTSUPP; -} - -static inline int -devlink_dpipe_table_resource_set(struct devlink *devlink, - const char *table_name, u64 resource_id, - u64 resource_units) -{ - return -EOPNOTSUPP; -} - -static inline void -devlink_resource_occ_get_register(struct devlink *devlink, - u64 resource_id, - devlink_resource_occ_get_t *occ_get, - void *occ_get_priv) -{ -} - -static inline void -devlink_resource_occ_get_unregister(struct devlink *devlink, - u64 resource_id) -{ -} - -static inline int -devlink_params_register(struct devlink *devlink, - const struct devlink_param *params, - size_t params_count) -{ - return 0; -} - -static inline void -devlink_params_unregister(struct devlink *devlink, - const struct devlink_param *params, - size_t params_count) -{ - -} - -static inline int -devlink_port_params_register(struct devlink_port *devlink_port, - const struct devlink_param *params, - size_t params_count) -{ - return 0; -} - -static inline void -devlink_port_params_unregister(struct devlink_port *devlink_port, - const struct devlink_param *params, - size_t params_count) -{ -} - -static inline int -devlink_param_driverinit_value_get(struct devlink *devlink, u32 param_id, - union devlink_param_value *init_val) -{ - return -EOPNOTSUPP; -} - -static inline int -devlink_param_driverinit_value_set(struct devlink *devlink, u32 param_id, - union devlink_param_value init_val) -{ - return -EOPNOTSUPP; -} - -static inline int -devlink_port_param_driverinit_value_get(struct devlink_port *devlink_port, - u32 param_id, - union devlink_param_value *init_val) -{ - return -EOPNOTSUPP; -} - -static inline int -devlink_port_param_driverinit_value_set(struct devlink_port *devlink_port, - u32 param_id, - union devlink_param_value init_val) -{ - return -EOPNOTSUPP; -} - -static inline void -devlink_param_value_changed(struct devlink *devlink, u32 param_id) -{ -} - -static inline void -devlink_port_param_value_changed(struct devlink_port *devlink_port, - u32 param_id) -{ -} - -static inline void -devlink_param_value_str_fill(union devlink_param_value *dst_val, - const char *src) -{ -} - -static inline struct devlink_region * -devlink_region_create(struct devlink *devlink, - const char *region_name, - u32 region_max_snapshots, - u64 region_size) -{ - return NULL; -} - -static inline void -devlink_region_destroy(struct devlink_region *region) -{ -} - -static inline u32 -devlink_region_shapshot_id_get(struct devlink *devlink) -{ - return 0; -} - -static inline int -devlink_region_snapshot_create(struct devlink_region *region, u64 data_len, - u8 *data, u32 snapshot_id, - devlink_snapshot_data_dest_t *data_destructor) -{ - return 0; -} - -static inline int -devlink_info_driver_name_put(struct devlink_info_req *req, const char *name) -{ - return 0; -} - -static inline int -devlink_info_serial_number_put(struct devlink_info_req *req, const char *sn) -{ - return 0; -} - -static inline int -devlink_info_version_fixed_put(struct devlink_info_req *req, - const char *version_name, - const char *version_value) -{ - return 0; -} - -static inline int -devlink_info_version_stored_put(struct devlink_info_req *req, - const char *version_name, - const char *version_value) -{ - return 0; -} - -static inline int -devlink_info_version_running_put(struct devlink_info_req *req, - const char *version_name, - const char *version_value) -{ - return 0; -} - -static inline int -devlink_fmsg_obj_nest_start(struct devlink_fmsg *fmsg) -{ - return 0; -} - -static inline int -devlink_fmsg_obj_nest_end(struct devlink_fmsg *fmsg) -{ - return 0; -} - -static inline int -devlink_fmsg_pair_nest_start(struct devlink_fmsg *fmsg, const char *name) -{ - return 0; -} - -static inline int -devlink_fmsg_pair_nest_end(struct devlink_fmsg *fmsg) -{ - return 0; -} - -static inline int -devlink_fmsg_arr_pair_nest_start(struct devlink_fmsg *fmsg, - const char *name) -{ - return 0; -} - -static inline int -devlink_fmsg_arr_pair_nest_end(struct devlink_fmsg *fmsg) -{ - return 0; -} - -static inline int -devlink_fmsg_bool_put(struct devlink_fmsg *fmsg, bool value) -{ - return 0; -} - -static inline int -devlink_fmsg_u8_put(struct devlink_fmsg *fmsg, u8 value) -{ - return 0; -} - -static inline int -devlink_fmsg_u32_put(struct devlink_fmsg *fmsg, u32 value) -{ - return 0; -} - -static inline int -devlink_fmsg_u64_put(struct devlink_fmsg *fmsg, u64 value) -{ - return 0; -} - -static inline int -devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value) -{ - return 0; -} - -static inline int -devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value, - u16 value_len) -{ - return 0; -} - -static inline int -devlink_fmsg_bool_pair_put(struct devlink_fmsg *fmsg, const char *name, - bool value) -{ - return 0; -} - -static inline int -devlink_fmsg_u8_pair_put(struct devlink_fmsg *fmsg, const char *name, - u8 value) -{ - return 0; -} - -static inline int -devlink_fmsg_u32_pair_put(struct devlink_fmsg *fmsg, const char *name, - u32 value) -{ - return 0; -} - -static inline int -devlink_fmsg_u64_pair_put(struct devlink_fmsg *fmsg, const char *name, - u64 value) -{ - return 0; -} - -static inline int -devlink_fmsg_string_pair_put(struct devlink_fmsg *fmsg, const char *name, - const char *value) -{ - return 0; -} - -static inline int -devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name, - const void *value, u16 value_len) -{ - return 0; -} - -static inline struct devlink_health_reporter * -devlink_health_reporter_create(struct devlink *devlink, - const struct devlink_health_reporter_ops *ops, - u64 graceful_period, bool auto_recover, - void *priv) -{ - return NULL; -} - -static inline void -devlink_health_reporter_destroy(struct devlink_health_reporter *reporter) -{ -} - -static inline void * -devlink_health_reporter_priv(struct devlink_health_reporter *reporter) -{ - return NULL; -} - -static inline int -devlink_health_report(struct devlink_health_reporter *reporter, - const char *msg, void *priv_ctx) -{ - return 0; -} - -static inline void -devlink_health_reporter_state_update(struct devlink_health_reporter *reporter, - enum devlink_health_reporter_state state) -{ -} - static inline void devlink_compat_running_version(struct net_device *dev, char *buf, size_t len) { @@ -1232,6 +742,7 @@ devlink_compat_flash_update(struct net_device *dev, const char *file_name) { return -EOPNOTSUPP; } + #endif #endif /* _NET_DEVLINK_H_ */ -- cgit v1.2.3 From 0d5f20c42b24adffa1505ec3d4930d11dfaea82f Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 3 Mar 2019 15:52:07 +0100 Subject: batman-adv: Drop license boilerplate All files got a SPDX-License-Identifier with commit 7db7d9f369a4 ("batman-adv: Add SPDX license identifier above copyright header"). All the required information about the license conditions can be found in LICENSES/. Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- include/uapi/linux/batadv_packet.h | 12 ------------ include/uapi/linux/batman_adv.h | 18 ------------------ 2 files changed, 30 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/batadv_packet.h b/include/uapi/linux/batadv_packet.h index c99336f4eefe..4ebc2135e950 100644 --- a/include/uapi/linux/batadv_packet.h +++ b/include/uapi/linux/batadv_packet.h @@ -2,18 +2,6 @@ /* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #ifndef _UAPI_LINUX_BATADV_PACKET_H_ diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index 305bf316dd03..e53f2b5e7ee7 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -2,24 +2,6 @@ /* Copyright (C) 2016-2019 B.A.T.M.A.N. contributors: * * Matthias Schiffer - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS IN THE SOFTWARE. */ #ifndef _UAPI_LINUX_BATMAN_ADV_H_ -- cgit v1.2.3 From 32e727449c792b689c2a06a8b4cc9fef6270c5a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20L=C3=BCssing?= Date: Sat, 23 Mar 2019 05:47:41 +0100 Subject: batman-adv: Add multicast-to-unicast support for multiple targets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With this patch multicast packets with a limited number of destinations (current default: 16) will be split and transmitted by the originator as individual unicast transmissions. Wifi broadcasts with their low bitrate are still a costly undertaking. In a mesh network this cost multiplies with the overall size of the mesh network. Therefore using multiple unicast transmissions instead of broadcast flooding is almost always less burdensome for the mesh network. The maximum amount of unicast packets can be configured via the newly introduced multicast_fanout parameter. If this limit is exceeded distribution will fall back to classic broadcast flooding. The multicast-to-unicast conversion is performed on the initial multicast sender node and counts on a final destination node, mesh-wide basis (and not next hop, neighbor node basis). Signed-off-by: Linus Lüssing Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- include/uapi/linux/batman_adv.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index e53f2b5e7ee7..67f4636758af 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -473,6 +473,13 @@ enum batadv_nl_attrs { */ BATADV_ATTR_THROUGHPUT_OVERRIDE, + /** + * @BATADV_ATTR_MULTICAST_FANOUT: defines the maximum number of packet + * copies that may be generated for a multicast-to-unicast conversion. + * Once this limit is exceeded distribution will fall back to broadcast. + */ + BATADV_ATTR_MULTICAST_FANOUT, + /* add attributes above here, update the policy in netlink.c */ /** -- cgit v1.2.3 From 1713cb37bf671e5d98919536941a8b56337874fd Mon Sep 17 00:00:00 2001 From: Kristian Evensen Date: Wed, 27 Mar 2019 11:16:03 +0100 Subject: fou: Support binding FoU socket MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit An FoU socket is currently bound to the wildcard-address. While this works fine, there are several use-cases where the use of the wildcard-address is not desirable. For example, I use FoU on some multi-homed servers and would like to use FoU on only one of the interfaces. This commit adds support for binding FoU sockets to a given source address/interface, as well as connecting the socket to a given destination address/port. udp_tunnel already provides the required infrastructure, so most of the code added is for exposing and setting the different attributes (local address, peer address, etc.). The lookups performed when we add, delete or get an FoU-socket has also been updated to compare all the attributes a user can set. Since the comparison now involves several elements, I have added a separate comparison-function instead of open-coding. In order to test the code and ensure that the new comparison code works correctly, I started by creating a wildcard socket bound to port 1234 on my machine. I then tried to create a non-wildcarded socket bound to the same port, as well as fetching and deleting the socket (including source address, peer address or interface index in the netlink request). Both the create, fetch and delete request failed. Deleting/fetching the socket was only successful when my netlink request attributes matched those used to create the socket. I then repeated the tests, but with a socket bound to a local ip address, a socket bound to a local address + interface, and a bound socket that was also «connected» to a peer. Add only worked when no socket with the matching source address/interface (or wildcard) existed, while fetch/delete was only successful when all attributes matched. In addition to testing that the new code work, I also checked that the current behavior is kept. If none of the new attributes are provided, then an FoU-socket is configured as before (i.e., wildcarded). If any of the new attributes are provided, the FoU-socket is configured as expected. v1->v2: * Fixed building with IPv6 disabled (kbuild). * Fixed a return type warning and make the ugly comparison function more readable (kbuild). * Describe more in detail what has been tested (thanks David Miller). * Make peer port required if peer address is specified. Signed-off-by: Kristian Evensen Signed-off-by: David S. Miller --- include/uapi/linux/fou.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/fou.h b/include/uapi/linux/fou.h index f2ea833a2812..87c2c9f08803 100644 --- a/include/uapi/linux/fou.h +++ b/include/uapi/linux/fou.h @@ -16,6 +16,12 @@ enum { FOU_ATTR_IPPROTO, /* u8 */ FOU_ATTR_TYPE, /* u8 */ FOU_ATTR_REMCSUM_NOPARTIAL, /* flag */ + FOU_ATTR_LOCAL_V4, /* u32 */ + FOU_ATTR_LOCAL_V6, /* in6_addr */ + FOU_ATTR_PEER_V4, /* u32 */ + FOU_ATTR_PEER_V6, /* in6_addr */ + FOU_ATTR_PEER_PORT, /* u16 */ + FOU_ATTR_IFINDEX, /* s32 */ __FOU_ATTR_MAX, }; -- cgit v1.2.3 From 3aeb0803f7ea11ff2fc478f7d58f2b8e713af380 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 25 Mar 2019 19:34:58 +0100 Subject: ethtool: add PHY Fast Link Down support This adds support for Fast Link Down as new PHY tunable. Fast Link Down reduces the time until a link down event is reported for 1000BaseT. According to the standard it's 750ms what is too long for several use cases. v2: - add comment describing the constants Signed-off-by: Heiner Kallweit Reviewed-by: Florian Fainelli Reviewed-by: Michal Kubecek Signed-off-by: David S. Miller --- include/uapi/linux/ethtool.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 3652b239dad1..50c76f4fa402 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -252,9 +252,17 @@ struct ethtool_tunable { #define DOWNSHIFT_DEV_DEFAULT_COUNT 0xff #define DOWNSHIFT_DEV_DISABLE 0 +/* Time in msecs after which link is reported as down + * 0 = lowest time supported by the PHY + * 0xff = off, link down detection according to standard + */ +#define ETHTOOL_PHY_FAST_LINK_DOWN_ON 0 +#define ETHTOOL_PHY_FAST_LINK_DOWN_OFF 0xff + enum phy_tunable_id { ETHTOOL_PHY_ID_UNSPEC, ETHTOOL_PHY_DOWNSHIFT, + ETHTOOL_PHY_FAST_LINK_DOWN, /* * Add your fresh new phy tunable attribute above and remember to update * phy_tunable_strings[] in net/core/ethtool.c -- cgit v1.2.3 From 4d5ec89fc8d14dcdab7214a0c13a1c7321dc6ea9 Mon Sep 17 00:00:00 2001 From: Numan Siddique Date: Tue, 26 Mar 2019 06:13:46 +0530 Subject: net: openvswitch: Add a new action check_pkt_len This patch adds a new action - 'check_pkt_len' which checks the packet length and executes a set of actions if the packet length is greater than the specified length or executes another set of actions if the packet length is lesser or equal to. This action takes below nlattrs * OVS_CHECK_PKT_LEN_ATTR_PKT_LEN - 'pkt_len' to check for * OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER - Nested actions to apply if the packet length is greater than the specified 'pkt_len' * OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL - Nested actions to apply if the packet length is lesser or equal to the specified 'pkt_len'. The main use case for adding this action is to solve the packet drops because of MTU mismatch in OVN virtual networking solution. When a VM (which belongs to a logical switch of OVN) sends a packet destined to go via the gateway router and if the nic which provides external connectivity, has a lesser MTU, OVS drops the packet if the packet length is greater than this MTU. With the help of this action, OVN will check the packet length and if it is greater than the MTU size, it will generate an ICMP packet (type 3, code 4) and includes the next hop mtu in it so that the sender can fragment the packets. Reported-at: https://mail.openvswitch.org/pipermail/ovs-discuss/2018-July/047039.html Suggested-by: Ben Pfaff Signed-off-by: Numan Siddique CC: Gregory Rose CC: Pravin B Shelar Acked-by: Pravin B Shelar Tested-by: Greg Rose Reviewed-by: Greg Rose Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 42 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index dbe0cbe4f1b7..dfabacee6903 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -798,6 +798,44 @@ struct ovs_action_push_eth { struct ovs_key_ethernet addresses; }; +/* + * enum ovs_check_pkt_len_attr - Attributes for %OVS_ACTION_ATTR_CHECK_PKT_LEN. + * + * @OVS_CHECK_PKT_LEN_ATTR_PKT_LEN: u16 Packet length to check for. + * @OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER: Nested OVS_ACTION_ATTR_* + * actions to apply if the packer length is greater than the specified + * length in the attr - OVS_CHECK_PKT_LEN_ATTR_PKT_LEN. + * @OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL - Nested OVS_ACTION_ATTR_* + * actions to apply if the packer length is lesser or equal to the specified + * length in the attr - OVS_CHECK_PKT_LEN_ATTR_PKT_LEN. + */ +enum ovs_check_pkt_len_attr { + OVS_CHECK_PKT_LEN_ATTR_UNSPEC, + OVS_CHECK_PKT_LEN_ATTR_PKT_LEN, + OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER, + OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL, + __OVS_CHECK_PKT_LEN_ATTR_MAX, + +#ifdef __KERNEL__ + OVS_CHECK_PKT_LEN_ATTR_ARG /* struct check_pkt_len_arg */ +#endif +}; + +#define OVS_CHECK_PKT_LEN_ATTR_MAX (__OVS_CHECK_PKT_LEN_ATTR_MAX - 1) + +#ifdef __KERNEL__ +struct check_pkt_len_arg { + u16 pkt_len; /* Same value as OVS_CHECK_PKT_LEN_ATTR_PKT_LEN'. */ + bool exec_for_greater; /* When true, actions in IF_GREATER will + * not change flow keys. False otherwise. + */ + bool exec_for_lesser_equal; /* When true, actions in IF_LESS_EQUAL + * will not change flow keys. False + * otherwise. + */ +}; +#endif + /** * enum ovs_action_attr - Action types. * @@ -842,6 +880,9 @@ struct ovs_action_push_eth { * packet, or modify the packet (e.g., change the DSCP field). * @OVS_ACTION_ATTR_CLONE: make a copy of the packet and execute a list of * actions without affecting the original packet and key. + * @OVS_ACTION_ATTR_CHECK_PKT_LEN: Check the packet length and execute a set + * of actions if greater than the specified packet length, else execute + * another set of actions. * * Only a single header can be set with a single %OVS_ACTION_ATTR_SET. Not all * fields within a header are modifiable, e.g. the IPv4 protocol and fragment @@ -876,6 +917,7 @@ enum ovs_action_attr { OVS_ACTION_ATTR_POP_NSH, /* No argument. */ OVS_ACTION_ATTR_METER, /* u32 meter ID. */ OVS_ACTION_ATTR_CLONE, /* Nested OVS_CLONE_ATTR_*. */ + OVS_ACTION_ATTR_CHECK_PKT_LEN, /* Nested OVS_CHECK_PKT_LEN_ATTR_*. */ __OVS_ACTION_ATTR_MAX, /* Nothing past this will be accepted * from userspace. */ -- cgit v1.2.3 From 4f661542a40217713f2cee0bb6678fbb30d9d367 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Mar 2019 08:34:55 -0700 Subject: tcp: fix zerocopy and notsent_lowat issues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit My recent patch had at least three problems : 1) TX zerocopy wants notification when skb is acknowledged, thus we need to call skb_zcopy_clear() if the skb is cached into sk->sk_tx_skb_cache 2) Some applications might expect precise EPOLLOUT notifications, so we need to update sk->sk_wmem_queued and call sk_mem_uncharge() from sk_wmem_free_skb() in all cases. The SOCK_QUEUE_SHRUNK flag must also be set. 3) Reuse of saved skb should have used skb_cloned() instead of simply checking if the fast clone has been freed. Fixes: 472c2e07eef0 ("tcp: add one skb cache for tx") Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Cc: Soheil Hassas Yeganeh Acked-by: Soheil Hassas Yeganeh Tested-by: Holger Hoffstätte Signed-off-by: David S. Miller --- include/net/sock.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 577d91fb5626..7fa223278522 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1465,13 +1465,14 @@ static inline void sk_mem_uncharge(struct sock *sk, int size) static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb) { + sock_set_flag(sk, SOCK_QUEUE_SHRUNK); + sk->sk_wmem_queued -= skb->truesize; + sk_mem_uncharge(sk, skb->truesize); if (!sk->sk_tx_skb_cache) { + skb_zcopy_clear(skb, true); sk->sk_tx_skb_cache = skb; return; } - sock_set_flag(sk, SOCK_QUEUE_SHRUNK); - sk->sk_wmem_queued -= skb->truesize; - sk_mem_uncharge(sk, skb->truesize); __kfree_skb(skb); } -- cgit v1.2.3 From df453700e8d81b1bdafdf684365ee2b9431fb702 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 27 Mar 2019 12:40:33 -0700 Subject: inet: switch IP ID generator to siphash According to Amit Klein and Benny Pinkas, IP ID generation is too weak and might be used by attackers. Even with recent net_hash_mix() fix (netns: provide pure entropy for net_hash_mix()) having 64bit key and Jenkins hash is risky. It is time to switch to siphash and its 128bit keys. Signed-off-by: Eric Dumazet Reported-by: Amit Klein Reported-by: Benny Pinkas Signed-off-by: David S. Miller --- include/linux/siphash.h | 5 +++++ include/net/netns/ipv4.h | 2 ++ 2 files changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/siphash.h b/include/linux/siphash.h index fa7a6b9cedbf..bf21591a9e5e 100644 --- a/include/linux/siphash.h +++ b/include/linux/siphash.h @@ -21,6 +21,11 @@ typedef struct { u64 key[2]; } siphash_key_t; +static inline bool siphash_key_is_zero(const siphash_key_t *key) +{ + return !(key->key[0] | key->key[1]); +} + u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key); #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key); diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 104a6669e344..7698460a3dd1 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -9,6 +9,7 @@ #include #include #include +#include struct tcpm_hash_bucket; struct ctl_table_header; @@ -217,5 +218,6 @@ struct netns_ipv4 { unsigned int ipmr_seq; /* protected by rtnl_mutex */ atomic_t rt_genid; + siphash_key_t ip_id_key; }; #endif -- cgit v1.2.3 From 5dc37bb9b03586e8fdeb47d25e8d2a0399984936 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 28 Mar 2019 13:56:36 +0100 Subject: net: replace ndo_get_devlink with ndo_get_devlink_port Follow-up patch is going to need a devlink port instance according to a netdev. Devlink port instance should be always available when devlink is used. So change the recently introduced ndo_get_devlink to ndo_get_devlink_port. With that, adjust the wrapper for the only user to get devlink pointer. Signed-off-by: Jiri Pirko Reviewed-by: Michal Kubecek Reviewed-by: Florian Fainelli Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/netdevice.h | 6 +++--- include/net/devlink.h | 14 ++++++++++++-- 2 files changed, 15 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 166fdc0a78b4..78f5ec4ebf64 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1250,8 +1250,8 @@ struct devlink; * that got dropped are freed/returned via xdp_return_frame(). * Returns negative number, means general error invoking ndo, meaning * no frames were xmit'ed and core-caller will free all frames. - * struct devlink *(*ndo_get_devlink)(struct net_device *dev); - * Get devlink instance associated with a given netdev. + * struct devlink_port *(*ndo_get_devlink_port)(struct net_device *dev); + * Get devlink port instance associated with a given netdev. * Called with a reference on the netdevice and devlink locks only, * rtnl_lock is not held. */ @@ -1451,7 +1451,7 @@ struct net_device_ops { u32 flags); int (*ndo_xsk_async_xmit)(struct net_device *dev, u32 queue_id); - struct devlink * (*ndo_get_devlink)(struct net_device *dev); + struct devlink_port * (*ndo_get_devlink_port)(struct net_device *dev); }; /** diff --git a/include/net/devlink.h b/include/net/devlink.h index 03fb16f4fb6c..81b5ed04a341 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -547,10 +547,20 @@ static inline struct devlink *priv_to_devlink(void *priv) return container_of(priv, struct devlink, priv); } +static inline struct devlink_port * +netdev_to_devlink_port(struct net_device *dev) +{ + if (dev->netdev_ops->ndo_get_devlink_port) + return dev->netdev_ops->ndo_get_devlink_port(dev); + return NULL; +} + static inline struct devlink *netdev_to_devlink(struct net_device *dev) { - if (dev->netdev_ops->ndo_get_devlink) - return dev->netdev_ops->ndo_get_devlink(dev); + struct devlink_port *devlink_port = netdev_to_devlink_port(dev); + + if (devlink_port) + return devlink_port->devlink; return NULL; } -- cgit v1.2.3 From af3836df9a59e7339d60c9c46729a7d9094d0582 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 28 Mar 2019 13:56:37 +0100 Subject: net: devlink: introduce devlink_compat_phys_port_name_get() Introduce devlink_compat_phys_port_name_get() helper that gets the physical port name for specified netdevice according to devlink port attributes. Call this helper from dev_get_phys_port_name() in case ndo_get_phys_port_name is not defined. Signed-off-by: Jiri Pirko Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/devlink.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 81b5ed04a341..85e577d6ec3b 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -739,6 +739,8 @@ devlink_health_reporter_state_update(struct devlink_health_reporter *reporter, void devlink_compat_running_version(struct net_device *dev, char *buf, size_t len); int devlink_compat_flash_update(struct net_device *dev, const char *file_name); +int devlink_compat_phys_port_name_get(struct net_device *dev, + char *name, size_t len); #else @@ -753,6 +755,13 @@ devlink_compat_flash_update(struct net_device *dev, const char *file_name) return -EOPNOTSUPP; } +static inline int +devlink_compat_phys_port_name_get(struct net_device *dev, + char *name, size_t len) +{ + return -EOPNOTSUPP; +} + #endif #endif /* _NET_DEVLINK_H_ */ -- cgit v1.2.3 From 14c03ac4c100e4b81ec4747f5ec861701ff52de2 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 28 Mar 2019 13:56:40 +0100 Subject: net: devlink: remove unused devlink_port_get_phys_port_name() function Now it is unused, remove it. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 85e577d6ec3b..31d5cec4d06b 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -583,8 +583,6 @@ void devlink_port_attrs_set(struct devlink_port *devlink_port, enum devlink_port_flavour flavour, u32 port_number, bool split, u32 split_subport_number); -int devlink_port_get_phys_port_name(struct devlink_port *devlink_port, - char *name, size_t len); int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, u32 size, u16 ingress_pools_count, u16 egress_pools_count, u16 ingress_tc_count, -- cgit v1.2.3 From 717700d183d65bd2e6511566aa6d32395419d158 Mon Sep 17 00:00:00 2001 From: Yi-Hung Wei Date: Tue, 26 Mar 2019 11:31:13 -0700 Subject: netfilter: Export nf_ct_{set,destroy}_timeout() This patch exports nf_ct_set_timeout() and nf_ct_destroy_timeout(). The two functions are derived from xt_ct_destroy_timeout() and xt_ct_set_timeout() in xt_CT.c, and moved to nf_conntrack_timeout.c without any functional change. It would be useful for other users (i.e. OVS) that utilizes the finer-grain conntrack timeout feature. CC: Pablo Neira Ayuso CC: Pravin Shelar Signed-off-by: Yi-Hung Wei Signed-off-by: David S. Miller --- include/net/netfilter/nf_conntrack_timeout.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_timeout.h b/include/net/netfilter/nf_conntrack_timeout.h index 3394d75e1c80..00a8fbb2d735 100644 --- a/include/net/netfilter/nf_conntrack_timeout.h +++ b/include/net/netfilter/nf_conntrack_timeout.h @@ -88,6 +88,9 @@ static inline unsigned int *nf_ct_timeout_lookup(const struct nf_conn *ct) int nf_conntrack_timeout_init(void); void nf_conntrack_timeout_fini(void); void nf_ct_untimeout(struct net *net, struct nf_ct_timeout *timeout); +int nf_ct_set_timeout(struct net *net, struct nf_conn *ct, u8 l3num, u8 l4num, + const char *timeout_name); +void nf_ct_destroy_timeout(struct nf_conn *ct); #else static inline int nf_conntrack_timeout_init(void) { @@ -98,6 +101,18 @@ static inline void nf_conntrack_timeout_fini(void) { return; } + +static inline int nf_ct_set_timeout(struct net *net, struct nf_conn *ct, + u8 l3num, u8 l4num, + const char *timeout_name) +{ + return -EOPNOTSUPP; +} + +static inline void nf_ct_destroy_timeout(struct nf_conn *ct) +{ + return; +} #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ #ifdef CONFIG_NF_CONNTRACK_TIMEOUT -- cgit v1.2.3 From 06bd2bdf19d2f3d22731625e1a47fa1dff5ac407 Mon Sep 17 00:00:00 2001 From: Yi-Hung Wei Date: Tue, 26 Mar 2019 11:31:14 -0700 Subject: openvswitch: Add timeout support to ct action Add support for fine-grain timeout support to conntrack action. The new OVS_CT_ATTR_TIMEOUT attribute of the conntrack action specifies a timeout to be associated with this connection. If no timeout is specified, it acts as is, that is the default timeout for the connection will be automatically applied. Example usage: $ nfct timeout add timeout_1 inet tcp syn_sent 100 established 200 $ ovs-ofctl add-flow br0 in_port=1,ip,tcp,action=ct(commit,timeout=timeout_1) CC: Pravin Shelar CC: Pablo Neira Ayuso Signed-off-by: Yi-Hung Wei Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index dfabacee6903..0cac5d802c6a 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -734,6 +734,7 @@ struct ovs_action_hash { * be received on NFNLGRP_CONNTRACK_NEW and NFNLGRP_CONNTRACK_DESTROY groups, * respectively. Remaining bits control the changes for which an event is * delivered on the NFNLGRP_CONNTRACK_UPDATE group. + * @OVS_CT_ATTR_TIMEOUT: Variable length string defining conntrack timeout. */ enum ovs_ct_attr { OVS_CT_ATTR_UNSPEC, @@ -746,6 +747,8 @@ enum ovs_ct_attr { OVS_CT_ATTR_NAT, /* Nested OVS_NAT_ATTR_* */ OVS_CT_ATTR_FORCE_COMMIT, /* No argument */ OVS_CT_ATTR_EVENTMASK, /* u32 mask of IPCT_* events. */ + OVS_CT_ATTR_TIMEOUT, /* Associate timeout with this connection for + * fine-grain timeout tuning. */ __OVS_CT_ATTR_MAX }; -- cgit v1.2.3 From 331c7a402358de6206232f6aab7aa48ec6c1088a Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 27 Mar 2019 20:53:47 -0700 Subject: ipv4: Move IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN to helper in_dev lookup followed by IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN check is called in several places, some with the rcu lock and others with the rtnl held. Move the check to a helper similar to what IPv6 has. Since the helper can be invoked from either context use rcu_dereference_rtnl to dereference ip_ptr. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/linux/inetdevice.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index a64f21a97369..367dc2a0f84a 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -237,6 +237,20 @@ static inline struct in_device *__in_dev_get_rtnl(const struct net_device *dev) return rtnl_dereference(dev->ip_ptr); } +/* called with rcu_read_lock or rtnl held */ +static inline bool ip_ignore_linkdown(const struct net_device *dev) +{ + struct in_device *in_dev; + bool rc = false; + + in_dev = rcu_dereference_rtnl(dev->ip_ptr); + if (in_dev && + IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev)) + rc = true; + + return rc; +} + static inline struct neigh_parms *__in_dev_arp_parms_get_rcu(const struct net_device *dev) { struct in_device *in_dev = __in_dev_get_rcu(dev); -- cgit v1.2.3 From e4516ef65490ef29d48a98ad4d7c90dccf39068f Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 27 Mar 2019 20:53:48 -0700 Subject: ipv4: Create init helper for fib_nh Consolidate the fib_nh initialization which is duplicated between fib_create_info for single path and fib_get_nhs for multipath. Export the helper to allow for use with nexthop objects in the future. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/ip_fib.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 9c8214d2116d..1af1f552644a 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -416,6 +416,10 @@ void fib_select_multipath(struct fib_result *res, int hash); void fib_select_path(struct net *net, struct fib_result *res, struct flowi4 *fl4, const struct sk_buff *skb); +int fib_nh_init(struct net *net, struct fib_nh *fib_nh, + struct fib_config *cfg, int nh_weight, + struct netlink_ext_ack *extack); + /* Exported by fib_trie.c */ void fib_trie_init(void); struct fib_table *fib_trie_table(u32 id, struct fib_table *alias); -- cgit v1.2.3 From faa041a40b9fa64913789fcc0161c7c73161f57e Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 27 Mar 2019 20:53:49 -0700 Subject: ipv4: Create cleanup helper for fib_nh Move the fib_nh cleanup code from free_fib_info_rcu into a new helper, fib_nh_release. Move classid accounting into fib_nh_release which is called per fib_nh to make accounting symmetrical with fib_nh_init. Export the helper to allow for use with nexthop objects in the future. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/ip_fib.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 1af1f552644a..5a4df0ba175e 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -419,6 +419,7 @@ void fib_select_path(struct net *net, struct fib_result *res, int fib_nh_init(struct net *net, struct fib_nh *fib_nh, struct fib_config *cfg, int nh_weight, struct netlink_ext_ack *extack); +void fib_nh_release(struct net *net, struct fib_nh *fib_nh); /* Exported by fib_trie.c */ void fib_trie_init(void); -- cgit v1.2.3 From 83c442515917812d4ff643e90cd456c630b7e762 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 27 Mar 2019 20:53:50 -0700 Subject: ipv6: Create init helper for fib6_nh Similar to IPv4, consolidate the fib6_nh initialization into a helper. As a new standalone function, add a cleanup path to put lwtstate on error. To avoid modifying fib6_config flags, move the reject check to a helper that is invoked once by fib6_nh_init to reset the device and then again in ip6_route_info_create to set the fib6_flags. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 2acb78a762ee..ce1f81345c8e 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -444,6 +444,10 @@ static inline struct net_device *fib6_info_nh_dev(const struct fib6_info *f6i) return f6i->fib6_nh.nh_dev; } +int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, + struct fib6_config *cfg, gfp_t gfp_flags, + struct netlink_ext_ack *extack); + static inline struct lwtunnel_state *fib6_info_nh_lwt(const struct fib6_info *f6i) { -- cgit v1.2.3 From dac7d0f27075ce54017a7efdd6ae0a55352a0f80 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 27 Mar 2019 20:53:51 -0700 Subject: ipv6: Create cleanup helper for fib6_nh Move the fib6_nh cleanup code to a new helper, fib6_nh_release. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index ce1f81345c8e..2d2a468b3d6d 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -447,6 +447,7 @@ static inline struct net_device *fib6_info_nh_dev(const struct fib6_info *f6i) int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack); +void fib6_nh_release(struct fib6_nh *fib6_nh); static inline struct lwtunnel_state *fib6_info_nh_lwt(const struct fib6_info *f6i) -- cgit v1.2.3 From 2b2450ca4a2d9d772dc45e1220c04cb3ba761843 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 27 Mar 2019 20:53:52 -0700 Subject: ipv6: Move gateway checks to a fib6_nh setting The gateway setting is not per fib6_info entry but per-fib6_nh. Add a new fib_nh_has_gw flag to fib6_nh and convert references to RTF_GATEWAY to the new flag. For IPv6 address the flag is cheaper than checking that nh_gw is non-0 like IPv4 does. While this increases fib6_nh by 8-bytes, the effective allocation size of a fib6_info is unchanged. The 8 bytes is recovered later with a fib_nh_common change. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 1 + include/net/ip6_route.h | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 2d2a468b3d6d..3b04b318cf13 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -126,6 +126,7 @@ struct rt6_exception { struct fib6_nh { struct in6_addr nh_gw; + bool fib_nh_has_gw; struct net_device *nh_dev; struct lwtunnel_state *nh_lwtstate; diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 7ab119936e69..95cd8a2f6284 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -68,8 +68,8 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr) static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i) { - return (f6i->fib6_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) == - RTF_GATEWAY; + return !(f6i->fib6_flags & (RTF_ADDRCONF|RTF_DYNAMIC)) && + f6i->fib6_nh.fib_nh_has_gw; } void ip6_route_input(struct sk_buff *skb); -- cgit v1.2.3 From 6d3d07b45c86f984424ccbad110ca500397fd18c Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 27 Mar 2019 20:53:53 -0700 Subject: ipv6: Refactor fib6_ignore_linkdown fib6_ignore_linkdown takes a fib6_info but only looks at the net_device and its IPv6 config. Change it to take a net_device over a fib6_info as its input argument. In addition, move it to a header file to make the check inline and usable later with IPv4 code without going through the ipv6 stub, and rename to ip6_ignore_linkdown since it is only checking the setting based on the ipv6 struct on a device. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/addrconf.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 269ec27385e9..ec8e6784a6f7 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -425,6 +425,14 @@ static inline void in6_dev_hold(struct inet6_dev *idev) refcount_inc(&idev->refcnt); } +/* called with rcu_read_lock held */ +static inline bool ip6_ignore_linkdown(const struct net_device *dev) +{ + const struct inet6_dev *idev = __in6_dev_get(dev); + + return !!idev->cnf.ignore_routes_with_linkdown; +} + void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp); static inline void in6_ifa_put(struct inet6_ifaddr *ifp) -- cgit v1.2.3 From b75ed8b1aa9c3a99702159c3be8b0c1d54972ae5 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 27 Mar 2019 20:53:55 -0700 Subject: ipv4: Rename fib_nh entries Rename fib_nh entries that will be moved to a fib_nh_common struct. Specifically, the device, oif, gateway, flags, scope, lwtstate, nh_weight and nh_upper_bound are common with all nexthop definitions. In the process shorten fib_nh_lwtstate to fib_nh_lws to avoid really long lines. Rename only; no functional change intended. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip_fib.h | 24 ++++++++++++------------ include/trace/events/fib.h | 7 +++++-- 2 files changed, 17 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 5a4df0ba175e..029acd333d29 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -77,26 +77,26 @@ struct fnhe_hash_bucket { #define FNHE_RECLAIM_DEPTH 5 struct fib_nh { - struct net_device *nh_dev; + struct net_device *fib_nh_dev; struct hlist_node nh_hash; struct fib_info *nh_parent; - unsigned int nh_flags; - unsigned char nh_scope; + unsigned int fib_nh_flags; + unsigned char fib_nh_scope; #ifdef CONFIG_IP_ROUTE_MULTIPATH - int nh_weight; - atomic_t nh_upper_bound; + int fib_nh_weight; + atomic_t fib_nh_upper_bound; #endif #ifdef CONFIG_IP_ROUTE_CLASSID __u32 nh_tclassid; #endif - int nh_oif; - __be32 nh_gw; + int fib_nh_oif; + __be32 fib_nh_gw4; __be32 nh_saddr; int nh_saddr_genid; struct rtable __rcu * __percpu *nh_pcpu_rth_output; struct rtable __rcu *nh_rth_input; struct fnhe_hash_bucket __rcu *nh_exceptions; - struct lwtunnel_state *nh_lwtstate; + struct lwtunnel_state *fib_nh_lws; }; /* @@ -125,7 +125,7 @@ struct fib_info { int fib_nhs; struct rcu_head rcu; struct fib_nh fib_nh[0]; -#define fib_dev fib_nh[0].nh_dev +#define fib_dev fib_nh[0].fib_nh_dev }; @@ -180,9 +180,9 @@ __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh); atomic_read(&(net)->ipv4.dev_addr_genid)) ? \ FIB_RES_NH(res).nh_saddr : \ fib_info_update_nh_saddr((net), &FIB_RES_NH(res))) -#define FIB_RES_GW(res) (FIB_RES_NH(res).nh_gw) -#define FIB_RES_DEV(res) (FIB_RES_NH(res).nh_dev) -#define FIB_RES_OIF(res) (FIB_RES_NH(res).nh_oif) +#define FIB_RES_GW(res) (FIB_RES_NH(res).fib_nh_gw4) +#define FIB_RES_DEV(res) (FIB_RES_NH(res).fib_nh_dev) +#define FIB_RES_OIF(res) (FIB_RES_NH(res).fib_nh_oif) #define FIB_RES_PREFSRC(net, res) ((res).fi->fib_prefsrc ? : \ FIB_RES_SADDR(net, res)) diff --git a/include/trace/events/fib.h b/include/trace/events/fib.h index 6271bab63bfb..61ea7a24c8e5 100644 --- a/include/trace/events/fib.h +++ b/include/trace/events/fib.h @@ -63,13 +63,16 @@ TRACE_EVENT(fib_table_lookup, } if (nh) { + struct net_device *dev; + p32 = (__be32 *) __entry->saddr; *p32 = nh->nh_saddr; p32 = (__be32 *) __entry->gw; - *p32 = nh->nh_gw; + *p32 = nh->fib_nh_gw4; - __assign_str(name, nh->nh_dev ? nh->nh_dev->name : "-"); + dev = nh->fib_nh_dev; + __assign_str(name, dev ? dev->name : "-"); } else { p32 = (__be32 *) __entry->saddr; *p32 = 0; -- cgit v1.2.3 From ad1601ae0260551f85691ca1ac814773fdcec239 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 27 Mar 2019 20:53:56 -0700 Subject: ipv6: Rename fib6_nh entries Rename fib6_nh entries that will be moved to a fib_nh_common struct. Specifically, the device, gateway, flags, and lwtstate are common with all nexthop definitions. In some places new temporary variables are declared or local variables renamed to maintain line lengths. Rename only; no functional change intended. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 16 ++++++++-------- include/net/ip6_route.h | 8 +++++--- include/trace/events/fib6.h | 6 +++--- 3 files changed, 16 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 3b04b318cf13..aff8570725c8 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -125,14 +125,14 @@ struct rt6_exception { #define FIB6_MAX_DEPTH 5 struct fib6_nh { - struct in6_addr nh_gw; + struct in6_addr fib_nh_gw6; bool fib_nh_has_gw; - struct net_device *nh_dev; - struct lwtunnel_state *nh_lwtstate; + struct net_device *fib_nh_dev; + struct lwtunnel_state *fib_nh_lws; - unsigned int nh_flags; - atomic_t nh_upper_bound; - int nh_weight; + unsigned int fib_nh_flags; + atomic_t fib_nh_upper_bound; + int fib_nh_weight; }; struct fib6_info { @@ -442,7 +442,7 @@ void rt6_get_prefsrc(const struct rt6_info *rt, struct in6_addr *addr) static inline struct net_device *fib6_info_nh_dev(const struct fib6_info *f6i) { - return f6i->fib6_nh.nh_dev; + return f6i->fib6_nh.fib_nh_dev; } int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, @@ -453,7 +453,7 @@ void fib6_nh_release(struct fib6_nh *fib6_nh); static inline struct lwtunnel_state *fib6_info_nh_lwt(const struct fib6_info *f6i) { - return f6i->fib6_nh.nh_lwtstate; + return f6i->fib6_nh.fib_nh_lws; } void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 95cd8a2f6284..342180a7285c 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -274,9 +274,11 @@ static inline struct in6_addr *rt6_nexthop(struct rt6_info *rt, static inline bool rt6_duplicate_nexthop(struct fib6_info *a, struct fib6_info *b) { - return a->fib6_nh.nh_dev == b->fib6_nh.nh_dev && - ipv6_addr_equal(&a->fib6_nh.nh_gw, &b->fib6_nh.nh_gw) && - !lwtunnel_cmp_encap(a->fib6_nh.nh_lwtstate, b->fib6_nh.nh_lwtstate); + struct fib6_nh *nha = &a->fib6_nh, *nhb = &b->fib6_nh; + + return nha->fib_nh_dev == nhb->fib_nh_dev && + ipv6_addr_equal(&nha->fib_nh_gw6, &nhb->fib_nh_gw6) && + !lwtunnel_cmp_encap(nha->fib_nh_lws, nhb->fib_nh_lws); } static inline unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst) diff --git a/include/trace/events/fib6.h b/include/trace/events/fib6.h index b088b54d699c..6d05ebdd669c 100644 --- a/include/trace/events/fib6.h +++ b/include/trace/events/fib6.h @@ -62,8 +62,8 @@ TRACE_EVENT(fib6_table_lookup, __entry->dport = 0; } - if (f6i->fib6_nh.nh_dev) { - __assign_str(name, f6i->fib6_nh.nh_dev); + if (f6i->fib6_nh.fib_nh_dev) { + __assign_str(name, f6i->fib6_nh.fib_nh_dev); } else { __assign_str(name, "-"); } @@ -75,7 +75,7 @@ TRACE_EVENT(fib6_table_lookup, } else if (f6i) { in6 = (struct in6_addr *)__entry->gw; - *in6 = f6i->fib6_nh.nh_gw; + *in6 = f6i->fib6_nh.fib_nh_gw6; } ), -- cgit v1.2.3 From f1741730dd18828fe3ea5fa91c22f41cf001c625 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 27 Mar 2019 20:53:57 -0700 Subject: net: Add fib_nh_common and update fib_nh and fib6_nh Add fib_nh_common struct with common nexthop attributes. Convert fib_nh and fib6_nh to use it. Use macros to move existing fib_nh_* references to the new nh_common.nhc_*. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 10 ++-------- include/net/ip_fib.h | 41 +++++++++++++++++++++++++++++++---------- 2 files changed, 33 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index aff8570725c8..58dbb4e82908 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -125,14 +126,7 @@ struct rt6_exception { #define FIB6_MAX_DEPTH 5 struct fib6_nh { - struct in6_addr fib_nh_gw6; - bool fib_nh_has_gw; - struct net_device *fib_nh_dev; - struct lwtunnel_state *fib_nh_lws; - - unsigned int fib_nh_flags; - atomic_t fib_nh_upper_bound; - int fib_nh_weight; + struct fib_nh_common nh_common; }; struct fib6_info { diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 029acd333d29..70548b1a6322 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -76,27 +76,48 @@ struct fnhe_hash_bucket { #define FNHE_HASH_SIZE (1 << FNHE_HASH_SHIFT) #define FNHE_RECLAIM_DEPTH 5 +struct fib_nh_common { + struct net_device *nhc_dev; + int nhc_oif; + unsigned int nhc_flags; + struct lwtunnel_state *nhc_lwtstate; + unsigned char nhc_scope; + u8 nhc_family; + u8 nhc_has_gw:1, + unused:7; + union { + __be32 ipv4; + struct in6_addr ipv6; + } nhc_gw; + + int nhc_weight; + atomic_t nhc_upper_bound; +}; + struct fib_nh { - struct net_device *fib_nh_dev; + struct fib_nh_common nh_common; struct hlist_node nh_hash; struct fib_info *nh_parent; - unsigned int fib_nh_flags; - unsigned char fib_nh_scope; -#ifdef CONFIG_IP_ROUTE_MULTIPATH - int fib_nh_weight; - atomic_t fib_nh_upper_bound; -#endif #ifdef CONFIG_IP_ROUTE_CLASSID __u32 nh_tclassid; #endif - int fib_nh_oif; - __be32 fib_nh_gw4; __be32 nh_saddr; int nh_saddr_genid; struct rtable __rcu * __percpu *nh_pcpu_rth_output; struct rtable __rcu *nh_rth_input; struct fnhe_hash_bucket __rcu *nh_exceptions; - struct lwtunnel_state *fib_nh_lws; +#define fib_nh_family nh_common.nhc_family +#define fib_nh_dev nh_common.nhc_dev +#define fib_nh_oif nh_common.nhc_oif +#define fib_nh_flags nh_common.nhc_flags +#define fib_nh_lws nh_common.nhc_lwtstate +#define fib_nh_scope nh_common.nhc_scope +#define fib_nh_family nh_common.nhc_family +#define fib_nh_has_gw nh_common.nhc_has_gw +#define fib_nh_gw4 nh_common.nhc_gw.ipv4 +#define fib_nh_gw6 nh_common.nhc_gw.ipv6 +#define fib_nh_weight nh_common.nhc_weight +#define fib_nh_upper_bound nh_common.nhc_upper_bound }; /* -- cgit v1.2.3 From 979e276ebebd537782797c439c9cb42b6d3aba27 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 27 Mar 2019 20:53:58 -0700 Subject: net: Use common nexthop init and release helpers With fib_nh_common in place, move common initialization and release code into helpers used by both ipv4 and ipv6. For the moment, the init is just the lwt encap and the release is both the netdev reference and the the lwt state reference. More will be added later. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/net/ip_fib.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 70548b1a6322..12a6d759cf57 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -441,6 +441,10 @@ int fib_nh_init(struct net *net, struct fib_nh *fib_nh, struct fib_config *cfg, int nh_weight, struct netlink_ext_ack *extack); void fib_nh_release(struct net *net, struct fib_nh *fib_nh); +int fib_nh_common_init(struct fib_nh_common *nhc, struct nlattr *fc_encap, + u16 fc_encap_type, void *cfg, gfp_t gfp_flags, + struct netlink_ext_ack *extack); +void fib_nh_common_release(struct fib_nh_common *nhc); /* Exported by fib_trie.c */ void fib_trie_init(void); -- cgit v1.2.3 From 3616d08bcbb564c7765187cd45ad392e49bad73a Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 22 Mar 2019 06:06:09 -0700 Subject: ipv6: Move ipv6 stubs to a separate header file The number of stubs is growing and has nothing to do with addrconf. Move the definition of the stubs to a separate header file and update users. In the move, drop the vxlan specific comment before ipv6_stub. Code move only; no functional change intended. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/addrconf.h | 47 ------------------------------------ include/net/ipv6_stubs.h | 63 ++++++++++++++++++++++++++++++++++++++++++++++++ include/net/udp_tunnel.h | 2 +- 3 files changed, 64 insertions(+), 48 deletions(-) create mode 100644 include/net/ipv6_stubs.h (limited to 'include') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index ec8e6784a6f7..2f67ae854ff0 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -238,53 +238,6 @@ bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, void ipv6_mc_dad_complete(struct inet6_dev *idev); -/* A stub used by vxlan module. This is ugly, ideally these - * symbols should be built into the core kernel. - */ -struct ipv6_stub { - int (*ipv6_sock_mc_join)(struct sock *sk, int ifindex, - const struct in6_addr *addr); - int (*ipv6_sock_mc_drop)(struct sock *sk, int ifindex, - const struct in6_addr *addr); - int (*ipv6_dst_lookup)(struct net *net, struct sock *sk, - struct dst_entry **dst, struct flowi6 *fl6); - int (*ipv6_route_input)(struct sk_buff *skb); - - struct fib6_table *(*fib6_get_table)(struct net *net, u32 id); - struct fib6_info *(*fib6_lookup)(struct net *net, int oif, - struct flowi6 *fl6, int flags); - struct fib6_info *(*fib6_table_lookup)(struct net *net, - struct fib6_table *table, - int oif, struct flowi6 *fl6, - int flags); - struct fib6_info *(*fib6_multipath_select)(const struct net *net, - struct fib6_info *f6i, - struct flowi6 *fl6, int oif, - const struct sk_buff *skb, - int strict); - u32 (*ip6_mtu_from_fib6)(struct fib6_info *f6i, struct in6_addr *daddr, - struct in6_addr *saddr); - - void (*udpv6_encap_enable)(void); - void (*ndisc_send_na)(struct net_device *dev, const struct in6_addr *daddr, - const struct in6_addr *solicited_addr, - bool router, bool solicited, bool override, bool inc_opt); - struct neigh_table *nd_tbl; -}; -extern const struct ipv6_stub *ipv6_stub __read_mostly; - -/* A stub used by bpf helpers. Similarly ugly as ipv6_stub */ -struct ipv6_bpf_stub { - int (*inet6_bind)(struct sock *sk, struct sockaddr *uaddr, int addr_len, - bool force_bind_address_no_port, bool with_lock); - struct sock *(*udp6_lib_lookup)(struct net *net, - const struct in6_addr *saddr, __be16 sport, - const struct in6_addr *daddr, __be16 dport, - int dif, int sdif, struct udp_table *tbl, - struct sk_buff *skb); -}; -extern const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly; - /* * identify MLD packets for MLD filter exceptions */ diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h new file mode 100644 index 000000000000..d8d9c0b0e8c0 --- /dev/null +++ b/include/net/ipv6_stubs.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _IPV6_STUBS_H +#define _IPV6_STUBS_H + +#include +#include +#include +#include +#include +#include +#include + +/* structs from net/ip6_fib.h */ +struct fib6_info; + +/* This is ugly, ideally these symbols should be built + * into the core kernel. + */ +struct ipv6_stub { + int (*ipv6_sock_mc_join)(struct sock *sk, int ifindex, + const struct in6_addr *addr); + int (*ipv6_sock_mc_drop)(struct sock *sk, int ifindex, + const struct in6_addr *addr); + int (*ipv6_dst_lookup)(struct net *net, struct sock *sk, + struct dst_entry **dst, struct flowi6 *fl6); + int (*ipv6_route_input)(struct sk_buff *skb); + + struct fib6_table *(*fib6_get_table)(struct net *net, u32 id); + struct fib6_info *(*fib6_lookup)(struct net *net, int oif, + struct flowi6 *fl6, int flags); + struct fib6_info *(*fib6_table_lookup)(struct net *net, + struct fib6_table *table, + int oif, struct flowi6 *fl6, + int flags); + struct fib6_info *(*fib6_multipath_select)(const struct net *net, + struct fib6_info *f6i, + struct flowi6 *fl6, int oif, + const struct sk_buff *skb, + int strict); + u32 (*ip6_mtu_from_fib6)(struct fib6_info *f6i, struct in6_addr *daddr, + struct in6_addr *saddr); + + void (*udpv6_encap_enable)(void); + void (*ndisc_send_na)(struct net_device *dev, const struct in6_addr *daddr, + const struct in6_addr *solicited_addr, + bool router, bool solicited, bool override, bool inc_opt); + struct neigh_table *nd_tbl; +}; +extern const struct ipv6_stub *ipv6_stub __read_mostly; + +/* A stub used by bpf helpers. Similarly ugly as ipv6_stub */ +struct ipv6_bpf_stub { + int (*inet6_bind)(struct sock *sk, struct sockaddr *uaddr, int addr_len, + bool force_bind_address_no_port, bool with_lock); + struct sock *(*udp6_lib_lookup)(struct net *net, + const struct in6_addr *saddr, __be16 sport, + const struct in6_addr *daddr, __be16 dport, + int dif, int sdif, struct udp_table *tbl, + struct sk_buff *skb); +}; +extern const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly; + +#endif diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index b8137953fea3..4b1f95e08307 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -7,7 +7,7 @@ #if IS_ENABLED(CONFIG_IPV6) #include -#include +#include #endif struct udp_port_cfg { -- cgit v1.2.3 From 18b6f717483a835fb98de9f0df6c724df9324e78 Mon Sep 17 00:00:00 2001 From: wenxu Date: Thu, 28 Mar 2019 12:43:23 +0800 Subject: openvswitch: Make metadata_dst tunnel work in IP_TUNNEL_INFO_BRIDGE mode There is currently no support for the multicast/broadcast aspects of VXLAN in ovs. In the datapath flow the tun_dst must specific. But in the IP_TUNNEL_INFO_BRIDGE mode the tun_dst can not be specific. And the packet can forward through the fdb table of vxlan devcice. In this mode the broadcast/multicast packet can be sent through the following ways in ovs. ovs-vsctl add-port br0 vxlan -- set in vxlan type=vxlan \ options:key=1000 options:remote_ip=flow ovs-ofctl add-flow br0 in_port=LOCAL,dl_dst=ff:ff:ff:ff:ff:ff, \ action=output:vxlan bridge fdb append ff:ff:ff:ff:ff:ff dev vxlan_sys_4789 dst 172.168.0.1 \ src_vni 1000 vni 1000 self bridge fdb append ff:ff:ff:ff:ff:ff dev vxlan_sys_4789 dst 172.168.0.2 \ src_vni 1000 vni 1000 self Signed-off-by: wenxu Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 0cac5d802c6a..f271f1ec50ae 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -364,6 +364,7 @@ enum ovs_tunnel_key_attr { OVS_TUNNEL_KEY_ATTR_IPV6_DST, /* struct in6_addr dst IPv6 address. */ OVS_TUNNEL_KEY_ATTR_PAD, OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS, /* struct erspan_metadata */ + OVS_TUNNEL_KEY_ATTR_IPV4_INFO_BRIDGE, /* No argument. IPV4_INFO_BRIDGE mode.*/ __OVS_TUNNEL_KEY_ATTR_MAX }; -- cgit v1.2.3 From a4e76ba6b4994773fbe7a4eed8228e47862ac8a3 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sun, 31 Mar 2019 06:49:41 +0000 Subject: mlxsw: spectrum_acl: Rename rehash_dis trace The name of the trace is no longer correct, since there is no disable of rehash done. So name it "rehash_rollback_failed". Signed-off-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/trace/events/mlxsw.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/mlxsw.h b/include/trace/events/mlxsw.h index 6a4cfaef33a2..19a25ed323a5 100644 --- a/include/trace/events/mlxsw.h +++ b/include/trace/events/mlxsw.h @@ -93,7 +93,7 @@ TRACE_EVENT(mlxsw_sp_acl_tcam_vregion_migrate_end, __entry->mlxsw_sp, __entry->vregion) ); -TRACE_EVENT(mlxsw_sp_acl_tcam_vregion_rehash_dis, +TRACE_EVENT(mlxsw_sp_acl_tcam_vregion_rehash_rollback_failed, TP_PROTO(const struct mlxsw_sp *mlxsw_sp, const struct mlxsw_sp_acl_tcam_vregion *vregion), -- cgit v1.2.3 From a2c7023f7075ca9b80f944d3f20f60e6574538e2 Mon Sep 17 00:00:00 2001 From: Xiaofei Shen Date: Fri, 29 Mar 2019 11:04:58 +0530 Subject: net: dsa: read mac address from DT for slave device Before creating a slave netdevice, get the mac address from DTS and apply in case it is valid. Signed-off-by: Xiaofei Shen Signed-off-by: Vinod Koul Signed-off-by: David S. Miller --- include/net/dsa.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index ae480bba11f5..0cfc2f828b87 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -140,6 +140,7 @@ struct dsa_port { unsigned int index; const char *name; const struct dsa_port *cpu_dp; + const char *mac; struct device_node *dn; unsigned int ageing_time; u8 stp_state; -- cgit v1.2.3 From 97cdcf37b57e3f204be3000b9eab9686f38b4356 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 1 Apr 2019 16:42:13 +0200 Subject: net: place xmit recursion in softnet data This fills a hole in softnet data, so no change in structure size. Also prepares for xmit_more placement in the same spot; skb->xmit_more will be removed in followup patch. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/linux/netdevice.h | 40 ++++++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 78f5ec4ebf64..2b25824642fa 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2659,14 +2659,6 @@ void netdev_freemem(struct net_device *dev); void synchronize_net(void); int init_dummy_netdev(struct net_device *dev); -DECLARE_PER_CPU(int, xmit_recursion); -#define XMIT_RECURSION_LIMIT 10 - -static inline int dev_recursion_level(void) -{ - return this_cpu_read(xmit_recursion); -} - struct net_device *dev_get_by_index(struct net *net, int ifindex); struct net_device *__dev_get_by_index(struct net *net, int ifindex); struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex); @@ -3015,6 +3007,11 @@ struct softnet_data { #ifdef CONFIG_XFRM_OFFLOAD struct sk_buff_head xfrm_backlog; #endif + /* written and read only by owning cpu: */ + struct { + u16 recursion; + u8 more; + } xmit; #ifdef CONFIG_RPS /* input_queue_head should be written by cpu owning this struct, * and only read by other cpus. Worth using a cache line. @@ -3050,6 +3047,28 @@ static inline void input_queue_tail_incr_save(struct softnet_data *sd, DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); +static inline int dev_recursion_level(void) +{ + return __this_cpu_read(softnet_data.xmit.recursion); +} + +#define XMIT_RECURSION_LIMIT 10 +static inline bool dev_xmit_recursion(void) +{ + return unlikely(__this_cpu_read(softnet_data.xmit.recursion) > + XMIT_RECURSION_LIMIT); +} + +static inline void dev_xmit_recursion_inc(void) +{ + __this_cpu_inc(softnet_data.xmit.recursion); +} + +static inline void dev_xmit_recursion_dec(void) +{ + __this_cpu_dec(softnet_data.xmit.recursion); +} + void __netif_schedule(struct Qdisc *q); void netif_schedule_queue(struct netdev_queue *txq); @@ -4409,6 +4428,11 @@ static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops, return ops->ndo_start_xmit(skb, dev); } +static inline bool netdev_xmit_more(void) +{ + return __this_cpu_read(softnet_data.xmit.more); +} + static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, bool more) { -- cgit v1.2.3 From 6b16f9ee89b8d5709f24bc3ac89ae8b5452c0d7c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 1 Apr 2019 16:42:14 +0200 Subject: net: move skb->xmit_more hint to softnet data There are two reasons for this. First, the xmit_more flag conceptually doesn't fit into the skb, as xmit_more is not a property related to the skb. Its only a hint to the driver that the stack is about to transmit another packet immediately. Second, it was only done this way to not have to pass another argument to ndo_start_xmit(). We can place xmit_more in the softnet data, next to the device recursion. The recursion counter is already written to on each transmit. The "more" indicator is placed right next to it. Drivers can use the netdev_xmit_more() helper instead of skb->xmit_more to check the "more packets coming" hint. skb->xmit_more is retained (but always 0) to not cause build breakage. This change takes care of the simple s/skb->xmit_more/netdev_xmit_more()/ conversions. Remaining drivers are converted in the next patches. Suggested-by: Eric Dumazet Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2b25824642fa..eb9f05e0863d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4424,7 +4424,7 @@ static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops, struct sk_buff *skb, struct net_device *dev, bool more) { - skb->xmit_more = more ? 1 : 0; + __this_cpu_write(softnet_data.xmit.more, more); return ops->ndo_start_xmit(skb, dev); } -- cgit v1.2.3 From 4f296edeb9d4cf76b876869461a7ae627c307110 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 1 Apr 2019 16:42:17 +0200 Subject: drivers: net: aurora: use netdev_xmit_more helper This is the last driver using always-0 skb->xmit_more. Switch it to netdev_xmit_more and remove the now unused xmit_more flag from sk_buff. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9027a8c4219f..69b5538adcea 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -657,7 +657,6 @@ typedef unsigned char *sk_buff_data_t; * @tc_index: Traffic control index * @hash: the packet hash * @queue_mapping: Queue mapping for multiqueue devices - * @xmit_more: More SKBs are pending for this queue * @pfmemalloc: skbuff was allocated from PFMEMALLOC reserves * @active_extensions: active extensions (skb_ext_id types) * @ndisc_nodetype: router type (from link layer) @@ -764,7 +763,6 @@ struct sk_buff { fclone:2, peeked:1, head_frag:1, - xmit_more:1, pfmemalloc:1; #ifdef CONFIG_SKB_EXTENSIONS __u8 active_extensions; -- cgit v1.2.3 From 38702cce547a74493687fd8bb925fbb5c3898ce3 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Fri, 29 Mar 2019 15:37:51 -0700 Subject: net/mlx5: Remove unused MLX5_*_DOORBELL_LOCK macros MLX5_*_DOORBELL_LOCK macros provided a way to avoid locking for mlx5_write64 on 64-bit platforms where it's not necessary. Currently all calls to mlx5_write64 don't use a spinlock, so the macros became unused. Signed-off-by: Maxim Mikityanskiy Reviewed-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed --- include/linux/mlx5/doorbell.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/doorbell.h b/include/linux/mlx5/doorbell.h index 0787de28f2fc..9ef3f9d00154 100644 --- a/include/linux/mlx5/doorbell.h +++ b/include/linux/mlx5/doorbell.h @@ -42,10 +42,6 @@ * PCI so we won't worry about it. */ -#define MLX5_DECLARE_DOORBELL_LOCK(name) -#define MLX5_INIT_DOORBELL_LOCK(ptr) do { } while (0) -#define MLX5_GET_DOORBELL_LOCK(ptr) (NULL) - static inline void mlx5_write64(__be32 val[2], void __iomem *dest, spinlock_t *doorbell_lock) { @@ -59,10 +55,6 @@ static inline void mlx5_write64(__be32 val[2], void __iomem *dest, * MMIO writes. */ -#define MLX5_DECLARE_DOORBELL_LOCK(name) spinlock_t name; -#define MLX5_INIT_DOORBELL_LOCK(ptr) spin_lock_init(ptr) -#define MLX5_GET_DOORBELL_LOCK(ptr) (ptr) - static inline void mlx5_write64(__be32 val[2], void __iomem *dest, spinlock_t *doorbell_lock) { -- cgit v1.2.3 From bbf29f618e8c5bfd6efdad5fdc050a84bab795ab Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Fri, 29 Mar 2019 15:37:52 -0700 Subject: net/mlx5: Remove spinlock support from mlx5_write64 As there is no user of mlx5_write64 that passes a spinlock to mlx5_write64, remove this functionality and simplify the function. Signed-off-by: Maxim Mikityanskiy Reviewed-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed --- include/linux/mlx5/cq.h | 2 +- include/linux/mlx5/doorbell.h | 31 +++++++++---------------------- 2 files changed, 10 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h index 612c8c2f2466..769326ea1d9b 100644 --- a/include/linux/mlx5/cq.h +++ b/include/linux/mlx5/cq.h @@ -170,7 +170,7 @@ static inline void mlx5_cq_arm(struct mlx5_core_cq *cq, u32 cmd, doorbell[0] = cpu_to_be32(sn << 28 | cmd | ci); doorbell[1] = cpu_to_be32(cq->cqn); - mlx5_write64(doorbell, uar_page + MLX5_CQ_DOORBELL, NULL); + mlx5_write64(doorbell, uar_page + MLX5_CQ_DOORBELL); } static inline void mlx5_cq_hold(struct mlx5_core_cq *cq) diff --git a/include/linux/mlx5/doorbell.h b/include/linux/mlx5/doorbell.h index 9ef3f9d00154..5c267707e1df 100644 --- a/include/linux/mlx5/doorbell.h +++ b/include/linux/mlx5/doorbell.h @@ -36,38 +36,25 @@ #define MLX5_BF_OFFSET 0x800 #define MLX5_CQ_DOORBELL 0x20 -#if BITS_PER_LONG == 64 /* Assume that we can just write a 64-bit doorbell atomically. s390 * actually doesn't have writeq() but S/390 systems don't even have * PCI so we won't worry about it. + * + * Note that the write is not atomic on 32-bit systems! In contrast to 64-bit + * ones, it requires proper locking. mlx5_write64 doesn't do any locking, so use + * it at your own discretion, protected by some kind of lock on 32 bits. + * + * TODO: use write{q,l}_relaxed() */ -static inline void mlx5_write64(__be32 val[2], void __iomem *dest, - spinlock_t *doorbell_lock) +static inline void mlx5_write64(__be32 val[2], void __iomem *dest) { +#if BITS_PER_LONG == 64 __raw_writeq(*(u64 *)val, dest); -} - #else - -/* Just fall back to a spinlock to protect the doorbell if - * BITS_PER_LONG is 32 -- there's no portable way to do atomic 64-bit - * MMIO writes. - */ - -static inline void mlx5_write64(__be32 val[2], void __iomem *dest, - spinlock_t *doorbell_lock) -{ - unsigned long flags; - - if (doorbell_lock) - spin_lock_irqsave(doorbell_lock, flags); __raw_writel((__force u32) val[0], dest); __raw_writel((__force u32) val[1], dest + 4); - if (doorbell_lock) - spin_unlock_irqrestore(doorbell_lock, flags); -} - #endif +} #endif /* MLX5_DOORBELL_H */ -- cgit v1.2.3 From 52c368dc3da7beb7b283133024af1b6d07bf93b9 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Fri, 29 Mar 2019 15:37:55 -0700 Subject: net/mlx5: Move health and page alloc init to mdev_init Software structure initialization should be in mdev_init stage. This provides a better logical separation of mlx5 core device initialization flow and will help to seamlessly support creating different mlx5 device types such as PF, VF and SF mlx5 sub-function virtual device. This patch does not change any functionality. Signed-off-by: Vu Pham Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index c5454f985e1d..d7f5c0e8c47a 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -883,6 +883,7 @@ void mlx5_cmd_mbox_status(void *out, u8 *status, u32 *syndrome); int mlx5_core_get_caps(struct mlx5_core_dev *dev, enum mlx5_cap_type cap_type); int mlx5_cmd_alloc_uar(struct mlx5_core_dev *dev, u32 *uarn); int mlx5_cmd_free_uar(struct mlx5_core_dev *dev, u32 uarn); +void mlx5_health_flush(struct mlx5_core_dev *dev); void mlx5_health_cleanup(struct mlx5_core_dev *dev); int mlx5_health_init(struct mlx5_core_dev *dev); void mlx5_start_health_poll(struct mlx5_core_dev *dev); -- cgit v1.2.3 From aa8106f137b93628d531ef5ecbbcbecef99370d7 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Fri, 29 Mar 2019 15:38:01 -0700 Subject: net/mlx5: Add explicit bar address field Add bar_addr field to store bar-0 address to avoid calling pci_resource_start with hard-coded bar-0 as parameter. Also note that different mlx5 device types will have bar_addr on different bars. This patch does not change any functionality. Signed-off-by: Huy Nguyen Signed-off-by: Vu Pham Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index d7f5c0e8c47a..c0ee597f5457 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -658,6 +658,7 @@ struct mlx5_core_dev { u64 sys_image_guid; phys_addr_t iseg_base; struct mlx5_init_seg __iomem *iseg; + phys_addr_t bar_addr; enum mlx5_device_state state; /* sync interface state */ struct mutex intf_state_mutex; -- cgit v1.2.3 From 4039049b5c462d3bb9ee8a68c4375582f037d5f2 Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Fri, 29 Mar 2019 15:38:03 -0700 Subject: net/mlx5: Expose MPEIN (Management PCIE INfo) register layout Expose PRM layout for handling MPEIN (Management PCIE Info). It will be used in the downstream patch for querying MPEIN via the driver. Signed-off-by: Aya Levin Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 1 + include/linux/mlx5/mlx5_ifc.h | 51 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 51 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index c0ee597f5457..0bfb95e30e47 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -133,6 +133,7 @@ enum { MLX5_REG_MTRC_CONF = 0x9041, MLX5_REG_MTRC_STDB = 0x9042, MLX5_REG_MTRC_CTRL = 0x9043, + MLX5_REG_MPEIN = 0x9050, MLX5_REG_MPCNT = 0x9051, MLX5_REG_MTPPS = 0x9053, MLX5_REG_MTPPSE = 0x9054, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 5decffe565fb..d31712af5a7b 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -8025,6 +8025,52 @@ struct mlx5_ifc_ppcnt_reg_bits { union mlx5_ifc_eth_cntrs_grp_data_layout_auto_bits counter_set; }; +struct mlx5_ifc_mpein_reg_bits { + u8 reserved_at_0[0x2]; + u8 depth[0x6]; + u8 pcie_index[0x8]; + u8 node[0x8]; + u8 reserved_at_18[0x8]; + + u8 capability_mask[0x20]; + + u8 reserved_at_40[0x8]; + u8 link_width_enabled[0x8]; + u8 link_speed_enabled[0x10]; + + u8 lane0_physical_position[0x8]; + u8 link_width_active[0x8]; + u8 link_speed_active[0x10]; + + u8 num_of_pfs[0x10]; + u8 num_of_vfs[0x10]; + + u8 bdf0[0x10]; + u8 reserved_at_b0[0x10]; + + u8 max_read_request_size[0x4]; + u8 max_payload_size[0x4]; + u8 reserved_at_c8[0x5]; + u8 pwr_status[0x3]; + u8 port_type[0x4]; + u8 reserved_at_d4[0xb]; + u8 lane_reversal[0x1]; + + u8 reserved_at_e0[0x14]; + u8 pci_power[0xc]; + + u8 reserved_at_100[0x20]; + + u8 device_status[0x10]; + u8 port_state[0x8]; + u8 reserved_at_138[0x8]; + + u8 reserved_at_140[0x10]; + u8 receiver_detect_result[0x10]; + + u8 reserved_at_160[0x20]; +}; + struct mlx5_ifc_mpcnt_reg_bits { u8 reserved_at_0[0x8]; u8 pcie_index[0x8]; @@ -8344,7 +8390,9 @@ struct mlx5_ifc_pcam_reg_bits { }; struct mlx5_ifc_mcam_enhanced_features_bits { - u8 reserved_at_0[0x74]; + u8 reserved_at_0[0x6e]; + u8 pci_status_and_power[0x1]; + u8 reserved_at_6f[0x5]; u8 mark_tx_action_cnp[0x1]; u8 mark_tx_action_cqe[0x1]; u8 dynamic_tx_overflow[0x1]; @@ -8944,6 +8992,7 @@ union mlx5_ifc_ports_control_registers_document_bits { struct mlx5_ifc_pmtu_reg_bits pmtu_reg; struct mlx5_ifc_ppad_reg_bits ppad_reg; struct mlx5_ifc_ppcnt_reg_bits ppcnt_reg; + struct mlx5_ifc_mpein_reg_bits mpein_reg; struct mlx5_ifc_mpcnt_reg_bits mpcnt_reg; struct mlx5_ifc_pplm_reg_bits pplm_reg; struct mlx5_ifc_pplr_reg_bits pplr_reg; -- cgit v1.2.3 From 045925e3fe5b98e402337a176d154252c56cef2e Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 27 Mar 2019 21:58:44 +0100 Subject: net: phy: add genphy_read_abilities Similar to genphy_c45_pma_read_abilities() add a function to dynamically detect the abilities of a Clause 22 PHY. This is mainly copied from genphy_config_init(). Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- include/linux/phy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 34084892a466..ad88f063e50f 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1075,6 +1075,7 @@ void phy_attached_info(struct phy_device *phydev); /* Clause 22 PHY */ int genphy_config_init(struct phy_device *phydev); +int genphy_read_abilities(struct phy_device *phydev); int genphy_setup_forced(struct phy_device *phydev); int genphy_restart_aneg(struct phy_device *phydev); int genphy_config_eee_advert(struct phy_device *phydev); -- cgit v1.2.3 From 06ee7115b0d1742de745ad143fb5e06d77d27fba Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Apr 2019 21:27:40 -0700 Subject: bpf: add verifier stats and log_level bit 2 In order to understand the verifier bottlenecks add various stats and extend log_level: log_level 1 and 2 are kept as-is: bit 0 - level=1 - print every insn and verifier state at branch points bit 1 - level=2 - print every insn and verifier state at every insn bit 2 - level=4 - print verifier error and stats at the end of verification When verifier rejects the program the libbpf is trying to load the program twice. Once with log_level=0 (no messages, only error code is reported to user space) and second time with log_level=1 to tell the user why the verifier rejected it. With introduction of bit 2 - level=4 the libbpf can choose to always use that level and load programs once, since the verification speed is not affected and in case of error the verbose message will be available. Note that the verifier stats are not part of uapi just like all other verbose messages. They're expected to change in the future. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 7d8228d1c898..f7e15eeb60bb 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -248,6 +248,12 @@ static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log) return log->len_used >= log->len_total - 1; } +#define BPF_LOG_LEVEL1 1 +#define BPF_LOG_LEVEL2 2 +#define BPF_LOG_STATS 4 +#define BPF_LOG_LEVEL (BPF_LOG_LEVEL1 | BPF_LOG_LEVEL2) +#define BPF_LOG_MASK (BPF_LOG_LEVEL | BPF_LOG_STATS) + static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) { return log->level && log->ubuf && !bpf_verifier_log_full(log); @@ -284,6 +290,21 @@ struct bpf_verifier_env { struct bpf_verifier_log log; struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS + 1]; u32 subprog_cnt; + /* number of instructions analyzed by the verifier */ + u32 insn_processed; + /* total verification time */ + u64 verification_time; + /* maximum number of verifier states kept in 'branching' instructions */ + u32 max_states_per_insn; + /* total number of allocated verifier states */ + u32 total_states; + /* some states are freed during program analysis. + * this is peak number of states. this number dominates kernel + * memory consumption during verification + */ + u32 peak_states; + /* longest register parentage chain walked for liveness marking */ + u32 longest_mark_read_walk; }; __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log, -- cgit v1.2.3 From 9f4686c41bdff051f557accb531af79dd1773687 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Apr 2019 21:27:41 -0700 Subject: bpf: improve verification speed by droping states Branch instructions, branch targets and calls in a bpf program are the places where the verifier remembers states that led to successful verification of the program. These states are used to prune brute force program analysis. For unprivileged programs there is a limit of 64 states per such 'branching' instructions (maximum length is tracked by max_states_per_insn counter introduced in the previous patch). Simply reducing this threshold to 32 or lower increases insn_processed metric to the point that small valid programs get rejected. For root programs there is no limit and cilium programs can have max_states_per_insn to be 100 or higher. Walking 100+ states multiplied by number of 'branching' insns during verification consumes significant amount of cpu time. Turned out simple LRU-like mechanism can be used to remove states that unlikely will be helpful in future search pruning. This patch introduces hit_cnt and miss_cnt counters: hit_cnt - this many times this state successfully pruned the search miss_cnt - this many times this state was not equivalent to other states (and that other states were added to state list) The heuristic introduced in this patch is: if (sl->miss_cnt > sl->hit_cnt * 3 + 3) /* drop this state from future considerations */ Higher numbers increase max_states_per_insn (allow more states to be considered for pruning) and slow verification speed, but do not meaningfully reduce insn_processed metric. Lower numbers drop too many states and insn_processed increases too much. Many different formulas were considered. This one is simple and works well enough in practice. (the analysis was done on selftests/progs/* and on cilium programs) The end result is this heuristic improves verification speed by 10 times. Large synthetic programs that used to take a second more now take 1/10 of a second. In cases where max_states_per_insn used to be 100 or more, now it's ~10. There is a slight increase in insn_processed for cilium progs: before after bpf_lb-DLB_L3.o 1831 1838 bpf_lb-DLB_L4.o 3029 3218 bpf_lb-DUNKNOWN.o 1064 1064 bpf_lxc-DDROP_ALL.o 26309 26935 bpf_lxc-DUNKNOWN.o 33517 34439 bpf_netdev.o 9713 9721 bpf_overlay.o 6184 6184 bpf_lcx_jit.o 37335 39389 And 2-3 times improvement in the verification speed. Signed-off-by: Alexei Starovoitov Reviewed-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index f7e15eeb60bb..fc8254d6b569 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -207,6 +207,7 @@ struct bpf_verifier_state { struct bpf_verifier_state_list { struct bpf_verifier_state state; struct bpf_verifier_state_list *next; + int miss_cnt, hit_cnt; }; /* Possible states for alu_state member. */ @@ -280,6 +281,7 @@ struct bpf_verifier_env { bool strict_alignment; /* perform strict pointer alignment checks */ struct bpf_verifier_state *cur_state; /* current verifier state */ struct bpf_verifier_state_list **explored_states; /* search pruning optimization */ + struct bpf_verifier_state_list *free_list; struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ u32 used_map_cnt; /* number of used maps */ u32 id_gen; /* used to generate unique reg IDs */ -- cgit v1.2.3 From c04c0d2b968ac45d6ef020316808ef6c82325a82 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 1 Apr 2019 21:27:45 -0700 Subject: bpf: increase complexity limit and maximum program size Large verifier speed improvements allow to increase verifier complexity limit. Now regardless of the program composition and its size it takes little time for the verifier to hit insn_processed limit. On typical x86 machine non-debug kernel processes 1M instructions in 1/10 of a second. (before these speed improvements specially crafted programs could be hitting multi-second verification times) Full kasan kernel with debug takes ~1 second for the same 1M insns. Hence bump the BPF_COMPLEXITY_LIMIT_INSNS limit to 1M. Also increase the number of instructions per program from 4k to internal BPF_COMPLEXITY_LIMIT_INSNS limit. 4k limit was confusing to users, since small programs with hundreds of insns could be hitting BPF_COMPLEXITY_LIMIT_INSNS limit. Sometimes adding more insns and bpf_trace_printk debug statements would make the verifier accept the program while removing code would make the verifier reject it. Some user space application started to add #define MAX_FOO to their programs and do: MAX_FOO=100; again: compile with MAX_FOO; try to load; if (fails_to_load) { reduce MAX_FOO; goto again; } to be able to fit maximum amount of processing into single program. Other users artificially split their single program into a set of programs and use all 32 iterations of tail_calls to increase compute limits. And the most advanced folks used unlimited tc-bpf filter list to execute many bpf programs. Essentially the users managed to workaround 4k insn limit. This patch removes the limit for root programs from uapi. BPF_COMPLEXITY_LIMIT_INSNS is the kernel internal limit and success to load the program no longer depends on program size, but on 'smartness' of the verifier only. The verifier will continue to get smarter with every kernel release. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f62897198844..a445194b5fb6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -421,6 +421,7 @@ struct bpf_array { }; }; +#define BPF_COMPLEXITY_LIMIT_INSNS 1000000 /* yes. 1M insns */ #define MAX_TAIL_CALL_CNT 32 struct bpf_event_entry { -- cgit v1.2.3 From 4950c2ba49cc6f2b38dbedcfa0ff67acf761419a Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Tue, 2 Apr 2019 20:43:30 +0200 Subject: net: phy: fix autoneg mismatch case in genphy_read_status The original patch didn't consider the case that autoneg process finishes successfully but both link partners have no mode in common. In this case there's no link, nevertheless we may be interested in what the link partner advertised. Like phydev->link we set phydev->autoneg_complete in genphy_update_link() and use the stored value in genphy_read_status(). This way we don't have to read register BMSR again. Fixes: b6163f194c69 ("net: phy: improve genphy_read_status") Signed-off-by: Heiner Kallweit Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/phy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index ad88f063e50f..ab7439b3da2b 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -390,6 +390,7 @@ struct phy_device { unsigned autoneg:1; /* The most recently read link state */ unsigned link:1; + unsigned autoneg_complete:1; /* Interrupts are enabled */ unsigned interrupts:1; -- cgit v1.2.3 From 0af7e7c128eb33f2dc16ed088ced00675785d628 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 2 Apr 2019 14:11:54 -0700 Subject: ipv4: Update fib_table_lookup tracepoint to take common nexthop Update fib_table_lookup tracepoint to take a fib_nh_common struct and dump the v6 gateway address if the nexthop uses it. Over the years saddr has not proven useful and the output of the tracepoint produces very long lines. Since saddr is not part of fib_nh_common, drop it. If it needs to be added later, fib_nh which contains saddr can be obtained from a fib_nh_common via container_of. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/trace/events/fib.h | 45 ++++++++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/trace/events/fib.h b/include/trace/events/fib.h index 61ea7a24c8e5..7f83b6eafc5c 100644 --- a/include/trace/events/fib.h +++ b/include/trace/events/fib.h @@ -13,9 +13,9 @@ TRACE_EVENT(fib_table_lookup, TP_PROTO(u32 tb_id, const struct flowi4 *flp, - const struct fib_nh *nh, int err), + const struct fib_nh_common *nhc, int err), - TP_ARGS(tb_id, flp, nh, err), + TP_ARGS(tb_id, flp, nhc, err), TP_STRUCT__entry( __field( u32, tb_id ) @@ -28,14 +28,17 @@ TRACE_EVENT(fib_table_lookup, __field( __u8, flags ) __array( __u8, src, 4 ) __array( __u8, dst, 4 ) - __array( __u8, gw, 4 ) - __array( __u8, saddr, 4 ) + __array( __u8, gw4, 4 ) + __array( __u8, gw6, 16 ) __field( u16, sport ) __field( u16, dport ) __dynamic_array(char, name, IFNAMSIZ ) ), TP_fast_assign( + struct in6_addr in6_zero = {}; + struct net_device *dev; + struct in6_addr *in6; __be32 *p32; __entry->tb_id = tb_id; @@ -62,33 +65,37 @@ TRACE_EVENT(fib_table_lookup, __entry->dport = 0; } - if (nh) { - struct net_device *dev; + dev = nhc ? nhc->nhc_dev : NULL; + __assign_str(name, dev ? dev->name : "-"); - p32 = (__be32 *) __entry->saddr; - *p32 = nh->nh_saddr; + if (nhc) { + if (nhc->nhc_family == AF_INET) { + p32 = (__be32 *) __entry->gw4; + *p32 = nhc->nhc_gw.ipv4; - p32 = (__be32 *) __entry->gw; - *p32 = nh->fib_nh_gw4; + in6 = (struct in6_addr *)__entry->gw6; + *in6 = in6_zero; + } else if (nhc->nhc_family == AF_INET6) { + p32 = (__be32 *) __entry->gw4; + *p32 = 0; - dev = nh->fib_nh_dev; - __assign_str(name, dev ? dev->name : "-"); + in6 = (struct in6_addr *)__entry->gw6; + *in6 = nhc->nhc_gw.ipv6; + } } else { - p32 = (__be32 *) __entry->saddr; + p32 = (__be32 *) __entry->gw4; *p32 = 0; - p32 = (__be32 *) __entry->gw; - *p32 = 0; - - __assign_str(name, "-"); + in6 = (struct in6_addr *)__entry->gw6; + *in6 = in6_zero; } ), - TP_printk("table %u oif %d iif %d proto %u %pI4/%u -> %pI4/%u tos %d scope %d flags %x ==> dev %s gw %pI4 src %pI4 err %d", + TP_printk("table %u oif %d iif %d proto %u %pI4/%u -> %pI4/%u tos %d scope %d flags %x ==> dev %s gw %pI4/%pI6c err %d", __entry->tb_id, __entry->oif, __entry->iif, __entry->proto, __entry->src, __entry->sport, __entry->dst, __entry->dport, __entry->tos, __entry->scope, __entry->flags, - __get_str(name), __entry->gw, __entry->saddr, __entry->err) + __get_str(name), __entry->gw4, __entry->gw6, __entry->err) ); #endif /* _TRACE_FIB_H */ -- cgit v1.2.3 From eba618abacade71669eb67c3360eecfee810cc88 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 2 Apr 2019 14:11:55 -0700 Subject: ipv4: Add fib_nh_common to fib_result Most of the ipv4 code only needs data from fib_nh_common. Add fib_nh_common selection to fib_result and update users to use it. Right now, fib_nh_common in fib_result will point to a fib_nh struct that is embedded within a fib_info: fib_info --> fib_nh fib_nh ... fib_nh ^ fib_result->nhc ----+ Later, nhc can point to a fib_nh within a nexthop struct: fib_info --> nexthop --> fib_nh ^ fib_result->nhc ---------------+ or for a nexthop group: fib_info --> nexthop --> nexthop --> fib_nh nexthop --> fib_nh ... nexthop --> fib_nh ^ fib_result->nhc ---------------------------+ In all cases nhsel within fib_result will point to which leg in the multipath route is used. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip_fib.h | 47 +++++++++++++++++++++-------------------------- 1 file changed, 21 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 12a6d759cf57..1f4a3b8bf584 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -156,15 +156,16 @@ struct fib_rule; struct fib_table; struct fib_result { - __be32 prefix; - unsigned char prefixlen; - unsigned char nh_sel; - unsigned char type; - unsigned char scope; - u32 tclassid; - struct fib_info *fi; - struct fib_table *table; - struct hlist_head *fa_head; + __be32 prefix; + unsigned char prefixlen; + unsigned char nh_sel; + unsigned char type; + unsigned char scope; + u32 tclassid; + struct fib_nh_common *nhc; + struct fib_info *fi; + struct fib_table *table; + struct hlist_head *fa_head; }; struct fib_result_nl { @@ -182,11 +183,10 @@ struct fib_result_nl { int err; }; -#ifdef CONFIG_IP_ROUTE_MULTIPATH -#define FIB_RES_NH(res) ((res).fi->fib_nh[(res).nh_sel]) -#else /* CONFIG_IP_ROUTE_MULTIPATH */ -#define FIB_RES_NH(res) ((res).fi->fib_nh[0]) -#endif /* CONFIG_IP_ROUTE_MULTIPATH */ +static inline struct fib_nh_common *fib_info_nhc(struct fib_info *fi, int nhsel) +{ + return &fi->fib_nh[nhsel].nh_common; +} #ifdef CONFIG_IP_MULTIPLE_TABLES #define FIB_TABLE_HASHSZ 256 @@ -195,18 +195,11 @@ struct fib_result_nl { #endif __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh); +__be32 fib_result_prefsrc(struct net *net, struct fib_result *res); -#define FIB_RES_SADDR(net, res) \ - ((FIB_RES_NH(res).nh_saddr_genid == \ - atomic_read(&(net)->ipv4.dev_addr_genid)) ? \ - FIB_RES_NH(res).nh_saddr : \ - fib_info_update_nh_saddr((net), &FIB_RES_NH(res))) -#define FIB_RES_GW(res) (FIB_RES_NH(res).fib_nh_gw4) -#define FIB_RES_DEV(res) (FIB_RES_NH(res).fib_nh_dev) -#define FIB_RES_OIF(res) (FIB_RES_NH(res).fib_nh_oif) - -#define FIB_RES_PREFSRC(net, res) ((res).fi->fib_prefsrc ? : \ - FIB_RES_SADDR(net, res)) +#define FIB_RES_NHC(res) ((res).nhc) +#define FIB_RES_DEV(res) (FIB_RES_NHC(res)->nhc_dev) +#define FIB_RES_OIF(res) (FIB_RES_NHC(res)->nhc_oif) struct fib_entry_notifier_info { struct fib_notifier_info info; /* must be first */ @@ -453,10 +446,12 @@ struct fib_table *fib_trie_table(u32 id, struct fib_table *alias); static inline void fib_combine_itag(u32 *itag, const struct fib_result *res) { #ifdef CONFIG_IP_ROUTE_CLASSID + struct fib_nh_common *nhc = res->nhc; + struct fib_nh *nh = container_of(nhc, struct fib_nh, nh_common); #ifdef CONFIG_IP_MULTIPLE_TABLES u32 rtag; #endif - *itag = FIB_RES_NH(*res).nh_tclassid<<16; + *itag = nh->nh_tclassid << 16; #ifdef CONFIG_IP_MULTIPLE_TABLES rtag = res->tclassid; if (*itag == 0) -- cgit v1.2.3 From c0a720770c01e67374b15f348f17a52409f6545c Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 2 Apr 2019 14:11:58 -0700 Subject: ipv6: Flip to fib_nexthop_info Export fib_nexthop_info and fib_add_nexthop for use by IPv6 code. Remove rt6_nexthop_info and rt6_add_nexthop in favor of the IPv4 versions. Update fib_nexthop_info for IPv6 linkdown check and RTA_GATEWAY for AF_INET6. Signed-off-by: David Ahern Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/net/ip_fib.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 1f4a3b8bf584..3ce07841dc3b 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -492,4 +492,9 @@ u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr); int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, struct fib_dump_filter *filter, struct netlink_callback *cb); + +int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nh, + unsigned int *flags, bool skip_oif); +int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nh, + int nh_weight); #endif /* _NET_FIB_H */ -- cgit v1.2.3 From 28b05b92886871bdd8e6a9df73e3a15845fe8ef4 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 3 Apr 2019 08:28:35 +0200 Subject: net: use correct this_cpu primitive in dev_recursion_level syzbot reports: BUG: using __this_cpu_read() in preemptible code: caller is dev_recursion_level include/linux/netdevice.h:3052 [inline] __this_cpu_preempt_check+0x246/0x270 lib/smp_processor_id.c:47 dev_recursion_level include/linux/netdevice.h:3052 [inline] ip6_skb_dst_mtu include/net/ip6_route.h:245 [inline] I erronously downgraded a this_cpu_read to __this_cpu_read when moving dev_recursion_level() around. Reported-by: syzbot+51471b4aae195285a4a3@syzkaller.appspotmail.com Fixes: 97cdcf37b57e ("net: place xmit recursion in softnet data") Signed-off-by: Florian Westphal Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index eb9f05e0863d..521eb869555e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3049,7 +3049,7 @@ DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); static inline int dev_recursion_level(void) { - return __this_cpu_read(softnet_data.xmit.recursion); + return this_cpu_read(softnet_data.xmit.recursion); } #define XMIT_RECURSION_LIMIT 10 -- cgit v1.2.3 From 407dd706fb5245c138f3a972f8aaa1c8a09a574c Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 3 Apr 2019 14:24:15 +0200 Subject: net: devlink: convert devlink_port_attrs bools to bits In order to save space in the struct, convert bools to bits. Signed-off-by: Jiri Pirko Reviewed-by: Florian Fainelli Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/devlink.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 31d5cec4d06b..4a1e3452a4ce 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -41,10 +41,10 @@ struct devlink { }; struct devlink_port_attrs { - bool set; + u8 set:1, + split:1; enum devlink_port_flavour flavour; u32 port_number; /* same value as "split group" */ - bool split; u32 split_subport_number; }; -- cgit v1.2.3 From bec5267cded268acdf679b651778c300d204e9f2 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 3 Apr 2019 14:24:16 +0200 Subject: net: devlink: extend port attrs for switch ID Extend devlink_port_attrs_set() to pass switch ID for ports which are part of switch and store it in port attrs. For other ports, this is NULL. Note that this allows the driver to group devlink ports into one or more switches according to the actual topology. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 4a1e3452a4ce..0f7968761204 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -42,10 +42,12 @@ struct devlink { struct devlink_port_attrs { u8 set:1, - split:1; + split:1, + switch_port:1; enum devlink_port_flavour flavour; u32 port_number; /* same value as "split group" */ u32 split_subport_number; + struct netdev_phys_item_id switch_id; }; struct devlink_port { @@ -582,7 +584,9 @@ void devlink_port_type_clear(struct devlink_port *devlink_port); void devlink_port_attrs_set(struct devlink_port *devlink_port, enum devlink_port_flavour flavour, u32 port_number, bool split, - u32 split_subport_number); + u32 split_subport_number, + const unsigned char *switch_id, + unsigned char switch_id_len); int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, u32 size, u16 ingress_pools_count, u16 egress_pools_count, u16 ingress_tc_count, -- cgit v1.2.3 From 7e1146e8c10c00f859843817da8ecc5d902ea409 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 3 Apr 2019 14:24:17 +0200 Subject: net: devlink: introduce devlink_compat_switch_id_get() helper Introduce devlink_compat_switch_id_get() helper which fills up switch_id according to passed netdev pointer. Call it directly from dev_get_port_parent_id() as a fallback when ndo_get_port_parent_id is not defined for given netdev. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 0f7968761204..70c7d1ac8344 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -743,6 +743,8 @@ void devlink_compat_running_version(struct net_device *dev, int devlink_compat_flash_update(struct net_device *dev, const char *file_name); int devlink_compat_phys_port_name_get(struct net_device *dev, char *name, size_t len); +int devlink_compat_switch_id_get(struct net_device *dev, + struct netdev_phys_item_id *ppid); #else @@ -764,6 +766,13 @@ devlink_compat_phys_port_name_get(struct net_device *dev, return -EOPNOTSUPP; } +static inline int +devlink_compat_switch_id_get(struct net_device *dev, + struct netdev_phys_item_id *ppid) +{ + return -EOPNOTSUPP; +} + #endif #endif /* _NET_DEVLINK_H_ */ -- cgit v1.2.3 From 5d3c537f907036c1f18bd325ffc356e24cde664c Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Sun, 24 Mar 2019 09:21:40 +0200 Subject: net/mlx5: Handle event of power detection in the PCIE slot Handle event of power state change in the PCIE slot. When the event occurs, check if query power state and PCI power fields is supported. If so, read these fields from MPEIN (management PCIE info) register and issue a corresponding message. Signed-off-by: Aya Levin Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index f93a5598b942..db7dca75d726 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -361,6 +361,7 @@ enum { enum { MLX5_GENERAL_SUBTYPE_DELAY_DROP_TIMEOUT = 0x1, + MLX5_GENERAL_SUBTYPE_PCI_POWER_CHANGE_EVENT = 0x5, }; enum { -- cgit v1.2.3 From ff302db965b57c141297911ea647d36d11fedfbe Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 2 Apr 2019 10:07:45 +1100 Subject: rhashtable: allow rht_bucket_var to return NULL. Rather than returning a pointer to a static nulls, rht_bucket_var() now returns NULL if the bucket doesn't exist. This will make the next patch, which stores a bitlock in the bucket pointer, somewhat cleaner. This change involves introducing __rht_bucket_nested() which is like rht_bucket_nested(), but doesn't provide the static nulls, and changing rht_bucket_nested() to call this and possible provide a static nulls - as is still needed for the non-var case. Signed-off-by: NeilBrown Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 86dfa417848d..0c9175aeab8a 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -265,6 +265,8 @@ void rhashtable_destroy(struct rhashtable *ht); struct rhash_head __rcu **rht_bucket_nested(const struct bucket_table *tbl, unsigned int hash); +struct rhash_head __rcu **__rht_bucket_nested(const struct bucket_table *tbl, + unsigned int hash); struct rhash_head __rcu **rht_bucket_nested_insert(struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash); @@ -294,7 +296,7 @@ static inline struct rhash_head __rcu *const *rht_bucket( static inline struct rhash_head __rcu **rht_bucket_var( struct bucket_table *tbl, unsigned int hash) { - return unlikely(tbl->nest) ? rht_bucket_nested(tbl, hash) : + return unlikely(tbl->nest) ? __rht_bucket_nested(tbl, hash) : &tbl->buckets[hash]; } @@ -890,6 +892,8 @@ static inline int __rhashtable_remove_fast_one( spin_lock_bh(lock); pprev = rht_bucket_var(tbl, hash); + if (!pprev) + goto out; rht_for_each_from(he, *pprev, tbl, hash) { struct rhlist_head *list; @@ -934,6 +938,7 @@ static inline int __rhashtable_remove_fast_one( break; } +out: spin_unlock_bh(lock); if (err > 0) { @@ -1042,6 +1047,8 @@ static inline int __rhashtable_replace_fast( spin_lock_bh(lock); pprev = rht_bucket_var(tbl, hash); + if (!pprev) + goto out; rht_for_each_from(he, *pprev, tbl, hash) { if (he != obj_old) { pprev = &he->next; @@ -1053,7 +1060,7 @@ static inline int __rhashtable_replace_fast( err = 0; break; } - +out: spin_unlock_bh(lock); return err; -- cgit v1.2.3 From 8f0db018006a421956965e1149234c4e8db718ee Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 2 Apr 2019 10:07:45 +1100 Subject: rhashtable: use bit_spin_locks to protect hash bucket. This patch changes rhashtables to use a bit_spin_lock on BIT(1) of the bucket pointer to lock the hash chain for that bucket. The benefits of a bit spin_lock are: - no need to allocate a separate array of locks. - no need to have a configuration option to guide the choice of the size of this array - locking cost is often a single test-and-set in a cache line that will have to be loaded anyway. When inserting at, or removing from, the head of the chain, the unlock is free - writing the new address in the bucket head implicitly clears the lock bit. For __rhashtable_insert_fast() we ensure this always happens when adding a new key. - even when lockings costs 2 updates (lock and unlock), they are in a cacheline that needs to be read anyway. The cost of using a bit spin_lock is a little bit of code complexity, which I think is quite manageable. Bit spin_locks are sometimes inappropriate because they are not fair - if multiple CPUs repeatedly contend of the same lock, one CPU can easily be starved. This is not a credible situation with rhashtable. Multiple CPUs may want to repeatedly add or remove objects, but they will typically do so at different buckets, so they will attempt to acquire different locks. As we have more bit-locks than we previously had spinlocks (by at least a factor of two) we can expect slightly less contention to go with the slightly better cache behavior and reduced memory consumption. To enhance type checking, a new struct is introduced to represent the pointer plus lock-bit that is stored in the bucket-table. This is "struct rhash_lock_head" and is empty. A pointer to this needs to be cast to either an unsigned lock, or a "struct rhash_head *" to be useful. Variables of this type are most often called "bkt". Previously "pprev" would sometimes point to a bucket, and sometimes a ->next pointer in an rhash_head. As these are now different types, pprev is NULL when it would have pointed to the bucket. In that case, 'blk' is used, together with correct locking protocol. Signed-off-by: NeilBrown Signed-off-by: David S. Miller --- include/linux/rhashtable-types.h | 2 - include/linux/rhashtable.h | 261 +++++++++++++++++++++++++-------------- 2 files changed, 165 insertions(+), 98 deletions(-) (limited to 'include') diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h index 763d613ce2c2..57467cbf4c5b 100644 --- a/include/linux/rhashtable-types.h +++ b/include/linux/rhashtable-types.h @@ -48,7 +48,6 @@ typedef int (*rht_obj_cmpfn_t)(struct rhashtable_compare_arg *arg, * @head_offset: Offset of rhash_head in struct to be hashed * @max_size: Maximum size while expanding * @min_size: Minimum size while shrinking - * @locks_mul: Number of bucket locks to allocate per cpu (default: 32) * @automatic_shrinking: Enable automatic shrinking of tables * @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash) * @obj_hashfn: Function to hash object @@ -62,7 +61,6 @@ struct rhashtable_params { unsigned int max_size; u16 min_size; bool automatic_shrinking; - u8 locks_mul; rht_hashfn_t hashfn; rht_obj_hashfn_t obj_hashfn; rht_obj_cmpfn_t obj_cmpfn; diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 0c9175aeab8a..ccbbafdf5547 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -24,12 +24,27 @@ #include #include #include +#include #include /* + * Objects in an rhashtable have an embedded struct rhash_head + * which is linked into as hash chain from the hash table - or one + * of two or more hash tables when the rhashtable is being resized. * The end of the chain is marked with a special nulls marks which has - * the least significant bit set. + * the least significant bit set but otherwise stores the address of + * the hash bucket. This allows us to be be sure we've found the end + * of the right list. + * The value stored in the hash bucket has BIT(2) used as a lock bit. + * This bit must be atomically set before any changes are made to + * the chain. To avoid dereferencing this pointer without clearing + * the bit first, we use an opaque 'struct rhash_lock_head *' for the + * pointer stored in the bucket. This struct needs to be defined so + * that rcu_derefernce() works on it, but it has no content so a + * cast is needed for it to be useful. This ensures it isn't + * used by mistake with clearing the lock bit first. */ +struct rhash_lock_head {}; /* Maximum chain length before rehash * @@ -52,8 +67,6 @@ * @nest: Number of bits of first-level nested table. * @rehash: Current bucket being rehashed * @hash_rnd: Random seed to fold into hash - * @locks_mask: Mask to apply before accessing locks[] - * @locks: Array of spinlocks protecting individual buckets * @walkers: List of active walkers * @rcu: RCU structure for freeing the table * @future_tbl: Table under construction during rehashing @@ -64,16 +77,70 @@ struct bucket_table { unsigned int size; unsigned int nest; u32 hash_rnd; - unsigned int locks_mask; - spinlock_t *locks; struct list_head walkers; struct rcu_head rcu; struct bucket_table __rcu *future_tbl; - struct rhash_head __rcu *buckets[] ____cacheline_aligned_in_smp; + struct rhash_lock_head __rcu *buckets[] ____cacheline_aligned_in_smp; }; +/* + * We lock a bucket by setting BIT(1) in the pointer - this is always + * zero in real pointers and in the nulls marker. + * bit_spin_locks do not handle contention well, but the whole point + * of the hashtable design is to achieve minimum per-bucket contention. + * A nested hash table might not have a bucket pointer. In that case + * we cannot get a lock. For remove and replace the bucket cannot be + * interesting and doesn't need locking. + * For insert we allocate the bucket if this is the last bucket_table, + * and then take the lock. + * Sometimes we unlock a bucket by writing a new pointer there. In that + * case we don't need to unlock, but we do need to reset state such as + * local_bh. For that we have rht_assign_unlock(). As rcu_assign_pointer() + * provides the same release semantics that bit_spin_unlock() provides, + * this is safe. + */ + +static inline void rht_lock(struct rhash_lock_head **bkt) +{ + local_bh_disable(); + bit_spin_lock(1, (unsigned long *)bkt); +} + +static inline void rht_unlock(struct rhash_lock_head **bkt) +{ + bit_spin_unlock(1, (unsigned long *)bkt); + local_bh_enable(); +} + +static inline void rht_assign_unlock(struct rhash_lock_head **bkt, + struct rhash_head *obj) +{ + struct rhash_head **p = (struct rhash_head **)bkt; + + rcu_assign_pointer(*p, obj); + preempt_enable(); + __release(bitlock); + local_bh_enable(); +} + +/* + * If 'p' is a bucket head and might be locked: + * rht_ptr() returns the address without the lock bit. + * rht_ptr_locked() returns the address WITH the lock bit. + */ +static inline struct rhash_head __rcu *rht_ptr(const struct rhash_lock_head *p) +{ + return (void *)(((unsigned long)p) & ~BIT(1)); +} + +static inline struct rhash_lock_head __rcu *rht_ptr_locked(const + struct rhash_head *p) +{ + return (void *)(((unsigned long)p) | BIT(1)); +} + /* * NULLS_MARKER() expects a hash value with the low * bits mostly likely to be significant, and it discards @@ -206,25 +273,6 @@ static inline bool rht_grow_above_max(const struct rhashtable *ht, return atomic_read(&ht->nelems) >= ht->max_elems; } -/* The bucket lock is selected based on the hash and protects mutations - * on a group of hash buckets. - * - * A maximum of tbl->size/2 bucket locks is allocated. This ensures that - * a single lock always covers both buckets which may both contains - * entries which link to the same bucket of the old table during resizing. - * This allows to simplify the locking as locking the bucket in both - * tables during resize always guarantee protection. - * - * IMPORTANT: When holding the bucket lock of both the old and new table - * during expansions and shrinking, the old bucket lock must always be - * acquired first. - */ -static inline spinlock_t *rht_bucket_lock(const struct bucket_table *tbl, - unsigned int hash) -{ - return &tbl->locks[hash & tbl->locks_mask]; -} - #ifdef CONFIG_PROVE_LOCKING int lockdep_rht_mutex_is_held(struct rhashtable *ht); int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash); @@ -263,13 +311,13 @@ void rhashtable_free_and_destroy(struct rhashtable *ht, void *arg); void rhashtable_destroy(struct rhashtable *ht); -struct rhash_head __rcu **rht_bucket_nested(const struct bucket_table *tbl, - unsigned int hash); -struct rhash_head __rcu **__rht_bucket_nested(const struct bucket_table *tbl, - unsigned int hash); -struct rhash_head __rcu **rht_bucket_nested_insert(struct rhashtable *ht, - struct bucket_table *tbl, +struct rhash_lock_head __rcu **rht_bucket_nested(const struct bucket_table *tbl, + unsigned int hash); +struct rhash_lock_head __rcu **__rht_bucket_nested(const struct bucket_table *tbl, unsigned int hash); +struct rhash_lock_head __rcu **rht_bucket_nested_insert(struct rhashtable *ht, + struct bucket_table *tbl, + unsigned int hash); #define rht_dereference(p, ht) \ rcu_dereference_protected(p, lockdep_rht_mutex_is_held(ht)) @@ -286,21 +334,21 @@ struct rhash_head __rcu **rht_bucket_nested_insert(struct rhashtable *ht, #define rht_entry(tpos, pos, member) \ ({ tpos = container_of(pos, typeof(*tpos), member); 1; }) -static inline struct rhash_head __rcu *const *rht_bucket( +static inline struct rhash_lock_head __rcu *const *rht_bucket( const struct bucket_table *tbl, unsigned int hash) { return unlikely(tbl->nest) ? rht_bucket_nested(tbl, hash) : &tbl->buckets[hash]; } -static inline struct rhash_head __rcu **rht_bucket_var( +static inline struct rhash_lock_head __rcu **rht_bucket_var( struct bucket_table *tbl, unsigned int hash) { return unlikely(tbl->nest) ? __rht_bucket_nested(tbl, hash) : &tbl->buckets[hash]; } -static inline struct rhash_head __rcu **rht_bucket_insert( +static inline struct rhash_lock_head __rcu **rht_bucket_insert( struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash) { return unlikely(tbl->nest) ? rht_bucket_nested_insert(ht, tbl, hash) : @@ -326,7 +374,7 @@ static inline struct rhash_head __rcu **rht_bucket_insert( * @hash: the hash value / bucket index */ #define rht_for_each(pos, tbl, hash) \ - rht_for_each_from(pos, *rht_bucket(tbl, hash), tbl, hash) + rht_for_each_from(pos, rht_ptr(*rht_bucket(tbl, hash)), tbl, hash) /** * rht_for_each_entry_from - iterate over hash chain from given head @@ -351,7 +399,7 @@ static inline struct rhash_head __rcu **rht_bucket_insert( * @member: name of the &struct rhash_head within the hashable struct. */ #define rht_for_each_entry(tpos, pos, tbl, hash, member) \ - rht_for_each_entry_from(tpos, pos, *rht_bucket(tbl, hash), \ + rht_for_each_entry_from(tpos, pos, rht_ptr(*rht_bucket(tbl, hash)), \ tbl, hash, member) /** @@ -367,7 +415,8 @@ static inline struct rhash_head __rcu **rht_bucket_insert( * remove the loop cursor from the list. */ #define rht_for_each_entry_safe(tpos, pos, next, tbl, hash, member) \ - for (pos = rht_dereference_bucket(*rht_bucket(tbl, hash), tbl, hash), \ + for (pos = rht_dereference_bucket(rht_ptr(*rht_bucket(tbl, hash)), \ + tbl, hash), \ next = !rht_is_a_nulls(pos) ? \ rht_dereference_bucket(pos->next, tbl, hash) : NULL; \ (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ @@ -402,8 +451,12 @@ static inline struct rhash_head __rcu **rht_bucket_insert( * the _rcu mutation primitives such as rhashtable_insert() as long as the * traversal is guarded by rcu_read_lock(). */ -#define rht_for_each_rcu(pos, tbl, hash) \ - rht_for_each_rcu_from(pos, *rht_bucket(tbl, hash), tbl, hash) +#define rht_for_each_rcu(pos, tbl, hash) \ + for (({barrier(); }), \ + pos = rht_ptr(rht_dereference_bucket_rcu( \ + *rht_bucket(tbl, hash), tbl, hash)); \ + !rht_is_a_nulls(pos); \ + pos = rcu_dereference_raw(pos->next)) /** * rht_for_each_entry_rcu_from - iterated over rcu hash chain from given head @@ -437,7 +490,8 @@ static inline struct rhash_head __rcu **rht_bucket_insert( * traversal is guarded by rcu_read_lock(). */ #define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member) \ - rht_for_each_entry_rcu_from(tpos, pos, *rht_bucket(tbl, hash), \ + rht_for_each_entry_rcu_from(tpos, pos, \ + rht_ptr(*rht_bucket(tbl, hash)), \ tbl, hash, member) /** @@ -483,7 +537,7 @@ static inline struct rhash_head *__rhashtable_lookup( .ht = ht, .key = key, }; - struct rhash_head __rcu * const *head; + struct rhash_lock_head __rcu * const *bkt; struct bucket_table *tbl; struct rhash_head *he; unsigned int hash; @@ -491,9 +545,10 @@ static inline struct rhash_head *__rhashtable_lookup( tbl = rht_dereference_rcu(ht->tbl, ht); restart: hash = rht_key_hashfn(ht, tbl, key, params); - head = rht_bucket(tbl, hash); + bkt = rht_bucket(tbl, hash); do { - rht_for_each_rcu_from(he, *head, tbl, hash) { + he = rht_ptr(rht_dereference_bucket_rcu(*bkt, tbl, hash)); + rht_for_each_rcu_from(he, he, tbl, hash) { if (params.obj_cmpfn ? params.obj_cmpfn(&arg, rht_obj(ht, he)) : rhashtable_compare(&arg, rht_obj(ht, he))) @@ -503,7 +558,7 @@ restart: /* An object might have been moved to a different hash chain, * while we walk along it - better check and retry. */ - } while (he != RHT_NULLS_MARKER(head)); + } while (he != RHT_NULLS_MARKER(bkt)); /* Ensure we see any new tables. */ smp_rmb(); @@ -599,10 +654,10 @@ static inline void *__rhashtable_insert_fast( .ht = ht, .key = key, }; + struct rhash_lock_head __rcu **bkt; struct rhash_head __rcu **pprev; struct bucket_table *tbl; struct rhash_head *head; - spinlock_t *lock; unsigned int hash; int elasticity; void *data; @@ -611,23 +666,22 @@ static inline void *__rhashtable_insert_fast( tbl = rht_dereference_rcu(ht->tbl, ht); hash = rht_head_hashfn(ht, tbl, obj, params); - lock = rht_bucket_lock(tbl, hash); - spin_lock_bh(lock); + elasticity = RHT_ELASTICITY; + bkt = rht_bucket_insert(ht, tbl, hash); + data = ERR_PTR(-ENOMEM); + if (!bkt) + goto out; + pprev = NULL; + rht_lock(bkt); if (unlikely(rcu_access_pointer(tbl->future_tbl))) { slow_path: - spin_unlock_bh(lock); + rht_unlock(bkt); rcu_read_unlock(); return rhashtable_insert_slow(ht, key, obj); } - elasticity = RHT_ELASTICITY; - pprev = rht_bucket_insert(ht, tbl, hash); - data = ERR_PTR(-ENOMEM); - if (!pprev) - goto out; - - rht_for_each_from(head, *pprev, tbl, hash) { + rht_for_each_from(head, rht_ptr(*bkt), tbl, hash) { struct rhlist_head *plist; struct rhlist_head *list; @@ -643,7 +697,7 @@ slow_path: data = rht_obj(ht, head); if (!rhlist) - goto out; + goto out_unlock; list = container_of(obj, struct rhlist_head, rhead); @@ -652,9 +706,13 @@ slow_path: RCU_INIT_POINTER(list->next, plist); head = rht_dereference_bucket(head->next, tbl, hash); RCU_INIT_POINTER(list->rhead.next, head); - rcu_assign_pointer(*pprev, obj); - - goto good; + if (pprev) { + rcu_assign_pointer(*pprev, obj); + rht_unlock(bkt); + } else + rht_assign_unlock(bkt, obj); + data = NULL; + goto out; } if (elasticity <= 0) @@ -662,12 +720,13 @@ slow_path: data = ERR_PTR(-E2BIG); if (unlikely(rht_grow_above_max(ht, tbl))) - goto out; + goto out_unlock; if (unlikely(rht_grow_above_100(ht, tbl))) goto slow_path; - head = rht_dereference_bucket(*pprev, tbl, hash); + /* Inserting at head of list makes unlocking free. */ + head = rht_ptr(rht_dereference_bucket(*bkt, tbl, hash)); RCU_INIT_POINTER(obj->next, head); if (rhlist) { @@ -677,20 +736,21 @@ slow_path: RCU_INIT_POINTER(list->next, NULL); } - rcu_assign_pointer(*pprev, obj); - atomic_inc(&ht->nelems); + rht_assign_unlock(bkt, obj); + if (rht_grow_above_75(ht, tbl)) schedule_work(&ht->run_work); -good: data = NULL; - out: - spin_unlock_bh(lock); rcu_read_unlock(); return data; + +out_unlock: + rht_unlock(bkt); + goto out; } /** @@ -699,9 +759,9 @@ out: * @obj: pointer to hash head inside object * @params: hash table parameters * - * Will take a per bucket spinlock to protect against mutual mutations + * Will take the per bucket bitlock to protect against mutual mutations * on the same bucket. Multiple insertions may occur in parallel unless - * they map to the same bucket lock. + * they map to the same bucket. * * It is safe to call this function from atomic context. * @@ -728,9 +788,9 @@ static inline int rhashtable_insert_fast( * @list: pointer to hash list head inside object * @params: hash table parameters * - * Will take a per bucket spinlock to protect against mutual mutations + * Will take the per bucket bitlock to protect against mutual mutations * on the same bucket. Multiple insertions may occur in parallel unless - * they map to the same bucket lock. + * they map to the same bucket. * * It is safe to call this function from atomic context. * @@ -751,9 +811,9 @@ static inline int rhltable_insert_key( * @list: pointer to hash list head inside object * @params: hash table parameters * - * Will take a per bucket spinlock to protect against mutual mutations + * Will take the per bucket bitlock to protect against mutual mutations * on the same bucket. Multiple insertions may occur in parallel unless - * they map to the same bucket lock. + * they map to the same bucket. * * It is safe to call this function from atomic context. * @@ -880,21 +940,20 @@ static inline int __rhashtable_remove_fast_one( struct rhash_head *obj, const struct rhashtable_params params, bool rhlist) { + struct rhash_lock_head __rcu **bkt; struct rhash_head __rcu **pprev; struct rhash_head *he; - spinlock_t * lock; unsigned int hash; int err = -ENOENT; hash = rht_head_hashfn(ht, tbl, obj, params); - lock = rht_bucket_lock(tbl, hash); + bkt = rht_bucket_var(tbl, hash); + if (!bkt) + return -ENOENT; + pprev = NULL; + rht_lock(bkt); - spin_lock_bh(lock); - - pprev = rht_bucket_var(tbl, hash); - if (!pprev) - goto out; - rht_for_each_from(he, *pprev, tbl, hash) { + rht_for_each_from(he, rht_ptr(*bkt), tbl, hash) { struct rhlist_head *list; list = container_of(he, struct rhlist_head, rhead); @@ -934,13 +993,17 @@ static inline int __rhashtable_remove_fast_one( } } - rcu_assign_pointer(*pprev, obj); - break; + if (pprev) { + rcu_assign_pointer(*pprev, obj); + rht_unlock(bkt); + } else { + rht_assign_unlock(bkt, obj); + } + goto unlocked; } -out: - spin_unlock_bh(lock); - + rht_unlock(bkt); +unlocked: if (err > 0) { atomic_dec(&ht->nelems); if (unlikely(ht->p.automatic_shrinking && @@ -1029,9 +1092,9 @@ static inline int __rhashtable_replace_fast( struct rhash_head *obj_old, struct rhash_head *obj_new, const struct rhashtable_params params) { + struct rhash_lock_head __rcu **bkt; struct rhash_head __rcu **pprev; struct rhash_head *he; - spinlock_t *lock; unsigned int hash; int err = -ENOENT; @@ -1042,27 +1105,33 @@ static inline int __rhashtable_replace_fast( if (hash != rht_head_hashfn(ht, tbl, obj_new, params)) return -EINVAL; - lock = rht_bucket_lock(tbl, hash); + bkt = rht_bucket_var(tbl, hash); + if (!bkt) + return -ENOENT; - spin_lock_bh(lock); + pprev = NULL; + rht_lock(bkt); - pprev = rht_bucket_var(tbl, hash); - if (!pprev) - goto out; - rht_for_each_from(he, *pprev, tbl, hash) { + rht_for_each_from(he, rht_ptr(*bkt), tbl, hash) { if (he != obj_old) { pprev = &he->next; continue; } rcu_assign_pointer(obj_new->next, obj_old->next); - rcu_assign_pointer(*pprev, obj_new); + if (pprev) { + rcu_assign_pointer(*pprev, obj_new); + rht_unlock(bkt); + } else { + rht_assign_unlock(bkt, obj_new); + } err = 0; - break; + goto unlocked; } -out: - spin_unlock_bh(lock); + rht_unlock(bkt); + +unlocked: return err; } -- cgit v1.2.3 From 149212f07856b25a9d342bfd6d736519b2ef66dc Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 2 Apr 2019 10:07:45 +1100 Subject: rhashtable: add lockdep tracking to bucket bit-spin-locks. Native bit_spin_locks are not tracked by lockdep. The bit_spin_locks used for rhashtable buckets are local to the rhashtable implementation, so there is little opportunity for the sort of misuse that lockdep might detect. However locks are held while a hash function or compare function is called, and if one of these took a lock, a misbehaviour is possible. As it is quite easy to add lockdep support this unlikely possibility seems to be enough justification. So create a lockdep class for bucket bit_spin_lock and attach through a lockdep_map in each bucket_table. Without the 'nested' annotation in rhashtable_rehash_one(), lockdep correctly reports a possible problem as this lock is taken while another bucket lock (in another table) is held. This confirms that the added support works. With the correct nested annotation in place, lockdep reports no problems. Signed-off-by: NeilBrown Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 51 ++++++++++++++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index ccbbafdf5547..460c0eaf6b96 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -82,6 +82,8 @@ struct bucket_table { struct bucket_table __rcu *future_tbl; + struct lockdep_map dep_map; + struct rhash_lock_head __rcu *buckets[] ____cacheline_aligned_in_smp; }; @@ -102,23 +104,38 @@ struct bucket_table { * this is safe. */ -static inline void rht_lock(struct rhash_lock_head **bkt) +static inline void rht_lock(struct bucket_table *tbl, + struct rhash_lock_head **bkt) { local_bh_disable(); bit_spin_lock(1, (unsigned long *)bkt); + lock_map_acquire(&tbl->dep_map); +} + +static inline void rht_lock_nested(struct bucket_table *tbl, + struct rhash_lock_head **bucket, + unsigned int subclass) +{ + local_bh_disable(); + bit_spin_lock(1, (unsigned long *)bucket); + lock_acquire_exclusive(&tbl->dep_map, subclass, 0, NULL, _THIS_IP_); } -static inline void rht_unlock(struct rhash_lock_head **bkt) +static inline void rht_unlock(struct bucket_table *tbl, + struct rhash_lock_head **bkt) { + lock_map_release(&tbl->dep_map); bit_spin_unlock(1, (unsigned long *)bkt); local_bh_enable(); } -static inline void rht_assign_unlock(struct rhash_lock_head **bkt, +static inline void rht_assign_unlock(struct bucket_table *tbl, + struct rhash_lock_head **bkt, struct rhash_head *obj) { struct rhash_head **p = (struct rhash_head **)bkt; + lock_map_release(&tbl->dep_map); rcu_assign_pointer(*p, obj); preempt_enable(); __release(bitlock); @@ -672,11 +689,11 @@ static inline void *__rhashtable_insert_fast( if (!bkt) goto out; pprev = NULL; - rht_lock(bkt); + rht_lock(tbl, bkt); if (unlikely(rcu_access_pointer(tbl->future_tbl))) { slow_path: - rht_unlock(bkt); + rht_unlock(tbl, bkt); rcu_read_unlock(); return rhashtable_insert_slow(ht, key, obj); } @@ -708,9 +725,9 @@ slow_path: RCU_INIT_POINTER(list->rhead.next, head); if (pprev) { rcu_assign_pointer(*pprev, obj); - rht_unlock(bkt); + rht_unlock(tbl, bkt); } else - rht_assign_unlock(bkt, obj); + rht_assign_unlock(tbl, bkt, obj); data = NULL; goto out; } @@ -737,7 +754,7 @@ slow_path: } atomic_inc(&ht->nelems); - rht_assign_unlock(bkt, obj); + rht_assign_unlock(tbl, bkt, obj); if (rht_grow_above_75(ht, tbl)) schedule_work(&ht->run_work); @@ -749,7 +766,7 @@ out: return data; out_unlock: - rht_unlock(bkt); + rht_unlock(tbl, bkt); goto out; } @@ -951,7 +968,7 @@ static inline int __rhashtable_remove_fast_one( if (!bkt) return -ENOENT; pprev = NULL; - rht_lock(bkt); + rht_lock(tbl, bkt); rht_for_each_from(he, rht_ptr(*bkt), tbl, hash) { struct rhlist_head *list; @@ -995,14 +1012,14 @@ static inline int __rhashtable_remove_fast_one( if (pprev) { rcu_assign_pointer(*pprev, obj); - rht_unlock(bkt); + rht_unlock(tbl, bkt); } else { - rht_assign_unlock(bkt, obj); + rht_assign_unlock(tbl, bkt, obj); } goto unlocked; } - rht_unlock(bkt); + rht_unlock(tbl, bkt); unlocked: if (err > 0) { atomic_dec(&ht->nelems); @@ -1110,7 +1127,7 @@ static inline int __rhashtable_replace_fast( return -ENOENT; pprev = NULL; - rht_lock(bkt); + rht_lock(tbl, bkt); rht_for_each_from(he, rht_ptr(*bkt), tbl, hash) { if (he != obj_old) { @@ -1121,15 +1138,15 @@ static inline int __rhashtable_replace_fast( rcu_assign_pointer(obj_new->next, obj_old->next); if (pprev) { rcu_assign_pointer(*pprev, obj_new); - rht_unlock(bkt); + rht_unlock(tbl, bkt); } else { - rht_assign_unlock(bkt, obj_new); + rht_assign_unlock(tbl, bkt, obj_new); } err = 0; goto unlocked; } - rht_unlock(bkt); + rht_unlock(tbl, bkt); unlocked: return err; -- cgit v1.2.3 From b262a69582a4676c7378a73077b7bb186c7c5b2a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Mar 2019 21:16:22 +0100 Subject: xfrm: place af number into xfrm_mode struct This will be useful to know if we're supposed to decode ipv4 or ipv6. While at it, make the unregister function return void, all module_exit functions did just BUG(); there is never a point in doing error checks if there is no way to handle such error. Signed-off-by: Florian Westphal Reviewed-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 85386becbaea..9a155063c25f 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -482,8 +482,9 @@ struct xfrm_mode { struct xfrm_state_afinfo *afinfo; struct module *owner; - unsigned int encap; - int flags; + u8 encap; + u8 family; + u8 flags; }; /* Flags for xfrm_mode. */ @@ -491,8 +492,8 @@ enum { XFRM_MODE_FLAG_TUNNEL = 1, }; -int xfrm_register_mode(struct xfrm_mode *mode, int family); -int xfrm_unregister_mode(struct xfrm_mode *mode, int family); +int xfrm_register_mode(struct xfrm_mode *mode); +void xfrm_unregister_mode(struct xfrm_mode *mode); static inline int xfrm_af2proto(unsigned int family) { -- cgit v1.2.3 From c2d305e51038167dd9de8d476c72f667d84cad8b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Mar 2019 21:16:24 +0100 Subject: xfrm: remove input indirection from xfrm_mode No need for any indirection or abstraction here, both functions are pretty much the same and quite small, they also have no external dependencies. xfrm_prepare_input can then be made static. With allmodconfig build, size increase of vmlinux is 25 byte: Before: text data bss dec filename 15730207 6936924 4046908 26714039 vmlinux After: 15730208 6936948 4046908 26714064 vmlinux v2: Fix INET_XFRM_MODE_TRANSPORT name in is-enabled test (Sabrina Dubroca) change copied comment to refer to transport and network header, not skb->{h,nh}, which don't exist anymore. (Sabrina) make xfrm_prepare_input static (Eyal Birger) Signed-off-by: Florian Westphal Reviewed-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 9a155063c25f..2c5fc9cc367d 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -436,16 +436,6 @@ struct xfrm_mode { */ int (*input2)(struct xfrm_state *x, struct sk_buff *skb); - /* - * This is the actual input entry point. - * - * For transport mode and equivalent this would be identical to - * input2 (which does not need to be set). While tunnel mode - * and equivalent would set this to the tunnel encapsulation function - * xfrm4_prepare_input that would in turn call input2. - */ - int (*input)(struct xfrm_state *x, struct sk_buff *skb); - /* * Add encapsulation header. * @@ -1606,7 +1596,6 @@ int xfrm_init_replay(struct xfrm_state *x); int xfrm_state_mtu(struct xfrm_state *x, int mtu); int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload); int xfrm_init_state(struct xfrm_state *x); -int xfrm_prepare_input(struct xfrm_state *x, struct sk_buff *skb); int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type); int xfrm_input_resume(struct sk_buff *skb, int nexthdr); int xfrm_trans_queue(struct sk_buff *skb, -- cgit v1.2.3 From 0c620e97b3490890facbbe06d5deed9b024de255 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Mar 2019 21:16:25 +0100 Subject: xfrm: remove output indirection from xfrm_mode Same is input indirection. Only exception: we need to export xfrm_outer_mode_output for pktgen. Increases size of vmlinux by about 163 byte: Before: text data bss dec filename 15730208 6936948 4046908 26714064 vmlinux After: 15730311 6937008 4046908 26714227 vmlinux xfrm_inner_extract_output has no more external callers, make it static. v2: add IS_ENABLED(IPV6) guard in xfrm6_prepare_output add two missing breaks in xfrm_outer_mode_output (Sabrina Dubroca) add WARN_ON_ONCE for 'call AF_INET6 related output function, but CONFIG_IPV6=n' case. make xfrm_inner_extract_output static Signed-off-by: Florian Westphal Reviewed-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 2c5fc9cc367d..01e7e9c0e8a9 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -449,17 +449,6 @@ struct xfrm_mode { */ int (*output2)(struct xfrm_state *x,struct sk_buff *skb); - /* - * This is the actual output entry point. - * - * For transport mode and equivalent this would be identical to - * output2 (which does not need to be set). While tunnel mode - * and equivalent would set this to a tunnel encapsulation function - * (xfrm4_prepare_output or xfrm6_prepare_output) that would in turn - * call output2. - */ - int (*output)(struct xfrm_state *x, struct sk_buff *skb); - /* * Adjust pointers into the packet and do GSO segmentation. */ @@ -1603,7 +1592,11 @@ int xfrm_trans_queue(struct sk_buff *skb, struct sk_buff *)); int xfrm_output_resume(struct sk_buff *skb, int err); int xfrm_output(struct sock *sk, struct sk_buff *skb); -int xfrm_inner_extract_output(struct xfrm_state *x, struct sk_buff *skb); + +#if IS_ENABLED(CONFIG_NET_PKTGEN) +int pktgen_xfrm_outer_mode_output(struct xfrm_state *x, struct sk_buff *skb); +#endif + void xfrm_local_error(struct sk_buff *skb, int mtu); int xfrm4_extract_header(struct sk_buff *skb); int xfrm4_extract_input(struct xfrm_state *x, struct sk_buff *skb); @@ -1622,7 +1615,6 @@ static inline int xfrm4_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi) } int xfrm4_extract_output(struct xfrm_state *x, struct sk_buff *skb); -int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb); int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb); int xfrm4_output_finish(struct sock *sk, struct sk_buff *skb); int xfrm4_rcv_cb(struct sk_buff *skb, u8 protocol, int err); @@ -1649,7 +1641,6 @@ int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family) __be32 xfrm6_tunnel_alloc_spi(struct net *net, xfrm_address_t *saddr); __be32 xfrm6_tunnel_spi_lookup(struct net *net, const xfrm_address_t *saddr); int xfrm6_extract_output(struct xfrm_state *x, struct sk_buff *skb); -int xfrm6_prepare_output(struct xfrm_state *x, struct sk_buff *skb); int xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb); int xfrm6_output_finish(struct sock *sk, struct sk_buff *skb); int xfrm6_find_1stfragopt(struct xfrm_state *x, struct sk_buff *skb, -- cgit v1.2.3 From 303c5fab1272888b22088fbdd08cb770205ccb7a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Mar 2019 21:16:26 +0100 Subject: xfrm: remove xmit indirection from xfrm_mode There are only two versions (tunnel and transport). The ip/ipv6 versions are only differ in sizeof(iphdr) vs ipv6hdr. Place this in the core and use x->outer_mode->encap type to call the correct adjustment helper. Before: text data bss dec filename 15730311 6937008 4046908 26714227 vmlinux After: 15730428 6937008 4046908 26714344 vmlinux (about 117 byte increase) v2: use family from x->outer_mode, not inner Signed-off-by: Florian Westphal Reviewed-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 01e7e9c0e8a9..07966a27e4a4 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -454,11 +454,6 @@ struct xfrm_mode { */ struct sk_buff *(*gso_segment)(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features); - /* - * Adjust pointers into the packet when IPsec is done at layer2. - */ - void (*xmit)(struct xfrm_state *x, struct sk_buff *skb); - struct xfrm_state_afinfo *afinfo; struct module *owner; u8 encap; -- cgit v1.2.3 From 7613b92b1ae37141704948b77e8762c5de896510 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Mar 2019 21:16:27 +0100 Subject: xfrm: remove gso_segment indirection from xfrm_mode These functions are small and we only have versions for tunnel and transport mode for ipv4 and ipv6 respectively. Just place the 'transport or tunnel' conditional in the protocol specific function instead of using an indirection. Before: 3226 12 0 3238 net/ipv4/esp4_offload.o 7004 492 0 7496 net/ipv4/ip_vti.o 3339 12 0 3351 net/ipv6/esp6_offload.o 11294 460 0 11754 net/ipv6/ip6_vti.o 1180 72 0 1252 net/ipv4/xfrm4_mode_beet.o 428 48 0 476 net/ipv4/xfrm4_mode_transport.o 1271 48 0 1319 net/ipv4/xfrm4_mode_tunnel.o 1083 60 0 1143 net/ipv6/xfrm6_mode_beet.o 172 48 0 220 net/ipv6/xfrm6_mode_ro.o 429 48 0 477 net/ipv6/xfrm6_mode_transport.o 1164 48 0 1212 net/ipv6/xfrm6_mode_tunnel.o 15730428 6937008 4046908 26714344 vmlinux After: 3461 12 0 3473 net/ipv4/esp4_offload.o 7000 492 0 7492 net/ipv4/ip_vti.o 3574 12 0 3586 net/ipv6/esp6_offload.o 11295 460 0 11755 net/ipv6/ip6_vti.o 1180 64 0 1244 net/ipv4/xfrm4_mode_beet.o 171 40 0 211 net/ipv4/xfrm4_mode_transport.o 1163 40 0 1203 net/ipv4/xfrm4_mode_tunnel.o 1083 52 0 1135 net/ipv6/xfrm6_mode_beet.o 172 40 0 212 net/ipv6/xfrm6_mode_ro.o 172 40 0 212 net/ipv6/xfrm6_mode_transport.o 1056 40 0 1096 net/ipv6/xfrm6_mode_tunnel.o 15730424 6937008 4046908 26714340 vmlinux Signed-off-by: Florian Westphal Reviewed-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 07966a27e4a4..de103a6d1ef8 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -449,11 +449,6 @@ struct xfrm_mode { */ int (*output2)(struct xfrm_state *x,struct sk_buff *skb); - /* - * Adjust pointers into the packet and do GSO segmentation. - */ - struct sk_buff *(*gso_segment)(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features); - struct xfrm_state_afinfo *afinfo; struct module *owner; u8 encap; -- cgit v1.2.3 From b3284df1c86f7ac078dcb8fb250fe3d6437e740c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Mar 2019 21:16:28 +0100 Subject: xfrm: remove input2 indirection from xfrm_mode No external dependencies on any module, place this in the core. Increase is about 1800 byte for xfrm_input.o. The beet helpers get added to internal header, as they can be reused from xfrm_output.c in the next patch (kernel contains several copies of them in the xfrm{4,6}_mode_beet.c files). Before: text data bss dec filename 5578 176 2364 8118 net/xfrm/xfrm_input.o 1180 64 0 1244 net/ipv4/xfrm4_mode_beet.o 171 40 0 211 net/ipv4/xfrm4_mode_transport.o 1163 40 0 1203 net/ipv4/xfrm4_mode_tunnel.o 1083 52 0 1135 net/ipv6/xfrm6_mode_beet.o 172 40 0 212 net/ipv6/xfrm6_mode_ro.o 172 40 0 212 net/ipv6/xfrm6_mode_transport.o 1056 40 0 1096 net/ipv6/xfrm6_mode_tunnel.o After: text data bss dec filename 7373 200 2364 9937 net/xfrm/xfrm_input.o 587 44 0 631 net/ipv4/xfrm4_mode_beet.o 171 32 0 203 net/ipv4/xfrm4_mode_transport.o 649 32 0 681 net/ipv4/xfrm4_mode_tunnel.o 625 44 0 669 net/ipv6/xfrm6_mode_beet.o 172 32 0 204 net/ipv6/xfrm6_mode_ro.o 172 32 0 204 net/ipv6/xfrm6_mode_transport.o 599 32 0 631 net/ipv6/xfrm6_mode_tunnel.o v2: pass inner_mode to xfrm_inner_mode_encap_remove to fix AF_UNSPEC selector breakage (bisected by Benedict Wong) Signed-off-by: Florian Westphal Reviewed-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index de103a6d1ef8..bdda545cf740 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -423,19 +423,6 @@ int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned sh int xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family); struct xfrm_mode { - /* - * Remove encapsulation header. - * - * The IP header will be moved over the top of the encapsulation - * header. - * - * On entry, the transport header shall point to where the IP header - * should be and the network header shall be set to where the IP - * header currently is. skb->data shall point to the start of the - * payload. - */ - int (*input2)(struct xfrm_state *x, struct sk_buff *skb); - /* * Add encapsulation header. * -- cgit v1.2.3 From 1de70830066b72b6a8e259e5363f6c0bc4ba7bbc Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Mar 2019 21:16:29 +0100 Subject: xfrm: remove output2 indirection from xfrm_mode similar to previous patch: no external module dependencies, so we can avoid the indirection by placing this in the core. This change removes the last indirection from xfrm_mode and the xfrm4|6_mode_{beet,tunnel}.c modules contain (almost) no code anymore. Before: text data bss dec hex filename 3957 136 0 4093 ffd net/xfrm/xfrm_output.o 587 44 0 631 277 net/ipv4/xfrm4_mode_beet.o 649 32 0 681 2a9 net/ipv4/xfrm4_mode_tunnel.o 625 44 0 669 29d net/ipv6/xfrm6_mode_beet.o 599 32 0 631 277 net/ipv6/xfrm6_mode_tunnel.o After: text data bss dec hex filename 5359 184 0 5543 15a7 net/xfrm/xfrm_output.o 171 24 0 195 c3 net/ipv4/xfrm4_mode_beet.o 171 24 0 195 c3 net/ipv4/xfrm4_mode_tunnel.o 172 24 0 196 c4 net/ipv6/xfrm6_mode_beet.o 172 24 0 196 c4 net/ipv6/xfrm6_mode_tunnel.o v2: fold the *encap_add functions into xfrm*_prepare_output preserve (move) output2 comment (Sabrina) use x->outer_mode->encap, not inner fix a build breakage on ppc (kbuild robot) Signed-off-by: Florian Westphal Reviewed-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index bdda545cf740..4351444c10fc 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -423,19 +423,6 @@ int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned sh int xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family); struct xfrm_mode { - /* - * Add encapsulation header. - * - * On exit, the transport header will be set to the start of the - * encapsulation header to be filled in by x->type->output and - * the mac header will be set to the nextheader (protocol for - * IPv4) field of the extension header directly preceding the - * encapsulation header, or in its absence, that of the top IP - * header. The value of the network header will always point - * to the top IP header while skb->data will point to the payload. - */ - int (*output2)(struct xfrm_state *x,struct sk_buff *skb); - struct xfrm_state_afinfo *afinfo; struct module *owner; u8 encap; -- cgit v1.2.3 From 733a5fac2f15b55b9059230d098ed04341d2d884 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Mar 2019 21:16:30 +0100 Subject: xfrm: remove afinfo pointer from xfrm_mode Adds an EXPORT_SYMBOL for afinfo_get_rcu, as it will now be called from ipv6 in case of CONFIG_IPV6=m. This change has virtually no effect on vmlinux size, but it reduces afinfo size and allows followup patch to make xfrm modes const. v2: mark if (afinfo) tests as likely (Sabrina) re-fetch afinfo according to inner_mode in xfrm_prepare_input(). Signed-off-by: Florian Westphal Reviewed-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 4351444c10fc..8d1c9506bcf6 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -423,7 +423,6 @@ int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned sh int xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family); struct xfrm_mode { - struct xfrm_state_afinfo *afinfo; struct module *owner; u8 encap; u8 family; -- cgit v1.2.3 From 4c145dce26013763490df88f2473714f5bc7857d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Mar 2019 21:16:31 +0100 Subject: xfrm: make xfrm modes builtin after previous changes, xfrm_mode contains no function pointers anymore and all modules defining such struct contain no code except an init/exit functions to register the xfrm_mode struct with the xfrm core. Just place the xfrm modes core and remove the modules, the run-time xfrm_mode register/unregister functionality is removed. Before: text data bss dec filename 7523 200 2364 10087 net/xfrm/xfrm_input.o 40003 628 440 41071 net/xfrm/xfrm_state.o 15730338 6937080 4046908 26714326 vmlinux 7389 200 2364 9953 net/xfrm/xfrm_input.o 40574 656 440 41670 net/xfrm/xfrm_state.o 15730084 6937068 4046908 26714060 vmlinux The xfrm*_mode_{transport,tunnel,beet} modules are gone. v2: replace CONFIG_INET6_XFRM_MODE_* IS_ENABLED guards with CONFIG_IPV6 ones rather than removing them. Signed-off-by: Florian Westphal Reviewed-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 8d1c9506bcf6..4ca79cdc3460 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -234,9 +234,9 @@ struct xfrm_state { /* Reference to data common to all the instances of this * transformer. */ const struct xfrm_type *type; - struct xfrm_mode *inner_mode; - struct xfrm_mode *inner_mode_iaf; - struct xfrm_mode *outer_mode; + const struct xfrm_mode *inner_mode; + const struct xfrm_mode *inner_mode_iaf; + const struct xfrm_mode *outer_mode; const struct xfrm_type_offload *type_offload; @@ -347,7 +347,6 @@ struct xfrm_state_afinfo { struct module *owner; const struct xfrm_type *type_map[IPPROTO_MAX]; const struct xfrm_type_offload *type_offload_map[IPPROTO_MAX]; - struct xfrm_mode *mode_map[XFRM_MODE_MAX]; int (*init_flags)(struct xfrm_state *x); void (*init_tempsel)(struct xfrm_selector *sel, @@ -423,7 +422,6 @@ int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned sh int xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family); struct xfrm_mode { - struct module *owner; u8 encap; u8 family; u8 flags; @@ -434,9 +432,6 @@ enum { XFRM_MODE_FLAG_TUNNEL = 1, }; -int xfrm_register_mode(struct xfrm_mode *mode); -void xfrm_unregister_mode(struct xfrm_mode *mode); - static inline int xfrm_af2proto(unsigned int family) { switch(family) { @@ -449,7 +444,7 @@ static inline int xfrm_af2proto(unsigned int family) } } -static inline struct xfrm_mode *xfrm_ip2inner_mode(struct xfrm_state *x, int ipproto) +static inline const struct xfrm_mode *xfrm_ip2inner_mode(struct xfrm_state *x, int ipproto) { if ((ipproto == IPPROTO_IPIP && x->props.family == AF_INET) || (ipproto == IPPROTO_IPV6 && x->props.family == AF_INET6)) -- cgit v1.2.3 From c9500d7b7de8ff6ac88ee3e38b782889f1616593 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 29 Mar 2019 21:16:32 +0100 Subject: xfrm: store xfrm_mode directly, not its address This structure is now only 4 bytes, so its more efficient to cache a copy rather than its address. No significant size difference in allmodconfig vmlinux. With non-modular kernel that has all XFRM options enabled, this series reduces vmlinux image size by ~11kb. All xfrm_mode indirections are gone and all modes are built-in. before (ipsec-next master): text data bss dec filename 21071494 7233140 11104324 39408958 vmlinux.master after this series: 21066448 7226772 11104324 39397544 vmlinux.patched With allmodconfig kernel, the size increase is only 362 bytes, even all the xfrm config options removed in this series are modular. before: text data bss dec filename 15731286 6936912 4046908 26715106 vmlinux.master after this series: 15731492 6937068 4046908 26715468 vmlinux Signed-off-by: Florian Westphal Reviewed-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 4ca79cdc3460..77eb578a0384 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -132,6 +132,17 @@ struct xfrm_state_offload { u8 flags; }; +struct xfrm_mode { + u8 encap; + u8 family; + u8 flags; +}; + +/* Flags for xfrm_mode. */ +enum { + XFRM_MODE_FLAG_TUNNEL = 1, +}; + /* Full description of state of transformer. */ struct xfrm_state { possible_net_t xs_net; @@ -234,9 +245,9 @@ struct xfrm_state { /* Reference to data common to all the instances of this * transformer. */ const struct xfrm_type *type; - const struct xfrm_mode *inner_mode; - const struct xfrm_mode *inner_mode_iaf; - const struct xfrm_mode *outer_mode; + struct xfrm_mode inner_mode; + struct xfrm_mode inner_mode_iaf; + struct xfrm_mode outer_mode; const struct xfrm_type_offload *type_offload; @@ -421,17 +432,6 @@ struct xfrm_type_offload { int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned short family); int xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family); -struct xfrm_mode { - u8 encap; - u8 family; - u8 flags; -}; - -/* Flags for xfrm_mode. */ -enum { - XFRM_MODE_FLAG_TUNNEL = 1, -}; - static inline int xfrm_af2proto(unsigned int family) { switch(family) { @@ -448,9 +448,9 @@ static inline const struct xfrm_mode *xfrm_ip2inner_mode(struct xfrm_state *x, i { if ((ipproto == IPPROTO_IPIP && x->props.family == AF_INET) || (ipproto == IPPROTO_IPV6 && x->props.family == AF_INET6)) - return x->inner_mode; + return &x->inner_mode; else - return x->inner_mode_iaf; + return &x->inner_mode_iaf; } struct xfrm_tmpl { @@ -1990,7 +1990,7 @@ static inline int xfrm_tunnel_check(struct sk_buff *skb, struct xfrm_state *x, tunnel = true; break; } - if (tunnel && !(x->outer_mode->flags & XFRM_MODE_FLAG_TUNNEL)) + if (tunnel && !(x->outer_mode.flags & XFRM_MODE_FLAG_TUNNEL)) return -EINVAL; return 0; -- cgit v1.2.3 From 1e1b11b6a1111cd9e8af1fd6ccda270a9fa3eacf Mon Sep 17 00:00:00 2001 From: vamsi krishna Date: Fri, 1 Feb 2019 18:34:51 +0530 Subject: nl80211/cfg80211: Specify band specific min RSSI thresholds with sched scan This commit adds the support to specify the RSSI thresholds per band for each match set. This enhances the current behavior which specifies a single rssi_threshold across all the bands by introducing the rssi_threshold_per_band. These per band rssi thresholds are referred through NL80211_BAND_* (enum nl80211_band) variables as attribute types. Such attributes/values per each band are nested through NL80211_ATTR_SCHED_SCAN_MIN_RSSI. These band specific rssi thresholds shall take precedence over the current rssi_thold per match set. Drivers indicate this support through %NL80211_EXT_FEATURE_SCHED_SCAN_BAND_SPECIFIC_RSSI_THOLD. These per band rssi attributes/values does not specify "default RSSI filter" as done by NL80211_SCHED_SCAN_MATCH_ATTR_RSSI to stay backward compatible. That said, these per band rssi values have to be specified for the corresponding matchset. Signed-off-by: vamsi krishna Signed-off-by: Srinivas Dasari [rebase on refactoring, add policy] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 8 ++++++++ include/uapi/linux/nl80211.h | 13 +++++++++++++ 2 files changed, 21 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index bb307a11ee63..b13234a486e7 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1832,11 +1832,19 @@ static inline void get_random_mask_addr(u8 *buf, const u8 *addr, const u8 *mask) * @bssid: BSSID to be matched; may be all-zero BSSID in case of SSID match * or no match (RSSI only) * @rssi_thold: don't report scan results below this threshold (in s32 dBm) + * @per_band_rssi_thold: Minimum rssi threshold for each band to be applied + * for filtering out scan results received. Drivers advertize this support + * of band specific rssi based filtering through the feature capability + * %NL80211_EXT_FEATURE_SCHED_SCAN_BAND_SPECIFIC_RSSI_THOLD. These band + * specific rssi thresholds take precedence over rssi_thold, if specified. + * If not specified for any band, it will be assigned with rssi_thold of + * corresponding matchset. */ struct cfg80211_match_set { struct cfg80211_ssid ssid; u8 bssid[ETH_ALEN]; s32 rssi_thold; + s32 per_band_rssi_thold[NUM_NL80211_BANDS]; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index dd4f86ee286e..4a9404958fbe 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3638,6 +3638,14 @@ enum nl80211_reg_rule_attr { * value as specified by &struct nl80211_bss_select_rssi_adjust. * @NL80211_SCHED_SCAN_MATCH_ATTR_BSSID: BSSID to be used for matching * (this cannot be used together with SSID). + * @NL80211_SCHED_SCAN_MATCH_PER_BAND_RSSI: Nested attribute that carries the + * band specific minimum rssi thresholds for the bands defined in + * enum nl80211_band. The minimum rssi threshold value(s32) specific to a + * band shall be encapsulated in attribute with type value equals to one + * of the NL80211_BAND_* defined in enum nl80211_band. For example, the + * minimum rssi threshold value for 2.4GHZ band shall be encapsulated + * within an attribute of type NL80211_BAND_2GHZ. And one or more of such + * attributes will be nested within this attribute. * @NL80211_SCHED_SCAN_MATCH_ATTR_MAX: highest scheduled scan filter * attribute number currently defined * @__NL80211_SCHED_SCAN_MATCH_ATTR_AFTER_LAST: internal use @@ -3650,6 +3658,7 @@ enum nl80211_sched_scan_match_attr { NL80211_SCHED_SCAN_MATCH_ATTR_RELATIVE_RSSI, NL80211_SCHED_SCAN_MATCH_ATTR_RSSI_ADJUST, NL80211_SCHED_SCAN_MATCH_ATTR_BSSID, + NL80211_SCHED_SCAN_MATCH_PER_BAND_RSSI, /* keep last */ __NL80211_SCHED_SCAN_MATCH_ATTR_AFTER_LAST, @@ -5343,6 +5352,9 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_AP_PMKSA_CACHING: Driver/device supports PMKSA caching * (set/del PMKSA operations) in AP mode. * + * @NL80211_EXT_FEATURE_SCHED_SCAN_BAND_SPECIFIC_RSSI_THOLD: Driver supports + * filtering of sched scan results using band specific RSSI thresholds. + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -5384,6 +5396,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_ENABLE_FTM_RESPONDER, NL80211_EXT_FEATURE_AIRTIME_FAIRNESS, NL80211_EXT_FEATURE_AP_PMKSA_CACHING, + NL80211_EXT_FEATURE_SCHED_SCAN_BAND_SPECIFIC_RSSI_THOLD, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, -- cgit v1.2.3 From ab60633c7136c300f15a390f3469d7c4be15a055 Mon Sep 17 00:00:00 2001 From: Narayanraddi Masti Date: Thu, 7 Feb 2019 12:16:05 -0800 Subject: mac80211: Add support for NL80211_STA_INFO_AIRTIME_LINK_METRIC Add support for mesh airtime link metric attribute NL80211_STA_INFO_AIRTIME_LINK_METRIC. Signed-off-by: Narayanraddi Masti Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 3 +++ include/uapi/linux/nl80211.h | 2 ++ 2 files changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index b13234a486e7..5859a5e02454 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1327,6 +1327,7 @@ struct cfg80211_tid_stats { * @fcs_err_count: number of packets (MPDUs) received from this station with * an FCS error. This counter should be incremented only when TA of the * received packet with an FCS error matches the peer MAC address. + * @airtime_link_metric: mesh airtime link metric. */ struct station_info { u64 filled; @@ -1381,6 +1382,8 @@ struct station_info { u32 rx_mpdu_count; u32 fcs_err_count; + + u32 airtime_link_metric; }; #if IS_ENABLED(CONFIG_CFG80211) diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 4a9404958fbe..07457f4aea00 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3139,6 +3139,7 @@ enum nl80211_sta_bss_param { * @NL80211_STA_INFO_TX_DURATION: aggregate PPDU duration for all frames * sent to the station (u64, usec) * @NL80211_STA_INFO_AIRTIME_WEIGHT: current airtime weight for station (u16) + * @NL80211_STA_INFO_AIRTIME_LINK_METRIC: airtime link metric for mesh station * @__NL80211_STA_INFO_AFTER_LAST: internal * @NL80211_STA_INFO_MAX: highest possible station info attribute */ @@ -3184,6 +3185,7 @@ enum nl80211_sta_info { NL80211_STA_INFO_CONNECTED_TO_GATE, NL80211_STA_INFO_TX_DURATION, NL80211_STA_INFO_AIRTIME_WEIGHT, + NL80211_STA_INFO_AIRTIME_LINK_METRIC, /* keep last */ __NL80211_STA_INFO_AFTER_LAST, -- cgit v1.2.3 From cb74e9775871f8c82a1297cf76209f10ab5bbe3d Mon Sep 17 00:00:00 2001 From: Sunil Dutt Date: Wed, 20 Feb 2019 16:18:07 +0530 Subject: cfg80211/nl80211: Offload OWE processing to user space in AP mode This interface allows the host driver to offload OWE processing to user space. This intends to support OWE (Opportunistic Wireless Encryption) AKM by the drivers that implement SME but rely on the user space for the cryptographic/OWE processing in AP mode. Such drivers are not capable of processing/deriving the DH IE. A new NL80211 command - NL80211_CMD_UPDATE_OWE_INFO is introduced to send the request/event between the host driver and user space. Driver shall provide the OWE info (MAC address and DH IE) of the peer to user space for cryptographic processing of the DH IE through the event. Accordingly, the user space shall update the OWE info/DH IE to the driver. Following is the sequence in AP mode for OWE authentication. Driver passes the OWE info obtained from the peer in the Association Request to the user space through the event cfg80211_update_owe_info_event. User space shall process the OWE info received and generate new OWE info. This OWE info is passed to the driver through NL80211_CMD_UPDATE_OWE_INFO request. Driver eventually uses this OWE info to send the Association Response to the peer. This OWE info in the command interface carries the IEs that include PMKID of the peer if the PMKSA is still valid or an updated DH IE for generating a new PMKSA with the peer. Signed-off-by: Liangwei Dong Signed-off-by: Sunil Dutt Signed-off-by: Srinivas Dasari [remove policy initialization - no longer exists] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 42 ++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/nl80211.h | 7 +++++++ 2 files changed, 49 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 5859a5e02454..70432fd638af 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3110,6 +3110,32 @@ struct cfg80211_pmsr_request { struct cfg80211_pmsr_request_peer peers[]; }; +/** + * struct cfg80211_update_owe_info - OWE Information + * + * This structure provides information needed for the drivers to offload OWE + * (Opportunistic Wireless Encryption) processing to the user space. + * + * Commonly used across update_owe_info request and event interfaces. + * + * @peer: MAC address of the peer device for which the OWE processing + * has to be done. + * @status: status code, %WLAN_STATUS_SUCCESS for successful OWE info + * processing, use %WLAN_STATUS_UNSPECIFIED_FAILURE if user space + * cannot give you the real status code for failures. Used only for + * OWE update request command interface (user space to driver). + * @ie: IEs obtained from the peer or constructed by the user space. These are + * the IEs of the remote peer in the event from the host driver and + * the constructed IEs by the user space in the request interface. + * @ie_len: Length of IEs in octets. + */ +struct cfg80211_update_owe_info { + u8 peer[ETH_ALEN] __aligned(2); + u16 status; + const u8 *ie; + size_t ie_len; +}; + /** * struct cfg80211_ops - backend description for wireless configuration * @@ -3447,6 +3473,10 @@ struct cfg80211_pmsr_request { * Statistics should be cumulative, currently no way to reset is provided. * @start_pmsr: start peer measurement (e.g. FTM) * @abort_pmsr: abort peer measurement + * + * @update_owe_info: Provide updated OWE info to driver. Driver implementing SME + * but offloading OWE processing to the user space will get the updated + * DH IE through this interface. */ struct cfg80211_ops { int (*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow); @@ -3761,6 +3791,8 @@ struct cfg80211_ops { struct cfg80211_pmsr_request *request); void (*abort_pmsr)(struct wiphy *wiphy, struct wireless_dev *wdev, struct cfg80211_pmsr_request *request); + int (*update_owe_info)(struct wiphy *wiphy, struct net_device *dev, + struct cfg80211_update_owe_info *owe_info); }; /* @@ -7219,4 +7251,14 @@ void cfg80211_pmsr_complete(struct wireless_dev *wdev, #define wiphy_WARN(wiphy, format, args...) \ WARN(1, "wiphy: %s\n" format, wiphy_name(wiphy), ##args); +/** + * cfg80211_update_owe_info_event - Notify the peer's OWE info to user space + * @netdev: network device + * @owe_info: peer's owe info + * @gfp: allocation flags + */ +void cfg80211_update_owe_info_event(struct net_device *netdev, + struct cfg80211_update_owe_info *owe_info, + gfp_t gfp); + #endif /* __NET_CFG80211_H */ diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 07457f4aea00..a99d75bef598 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1065,6 +1065,11 @@ * indicated by %NL80211_ATTR_WIPHY_FREQ and other attributes * determining the width and type. * + * @NL80211_CMD_UPDATE_OWE_INFO: This interface allows the host driver to + * offload OWE processing to user space. This intends to support + * OWE AKM by the host drivers that implement SME but rely + * on the user space for the cryptographic/DH IE processing in AP mode. + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -1285,6 +1290,8 @@ enum nl80211_commands { NL80211_CMD_NOTIFY_RADAR, + NL80211_CMD_UPDATE_OWE_INFO, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ -- cgit v1.2.3 From fd69c399c7d6262086b6b820757c6aeaa71feeba Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 8 Apr 2019 10:15:59 +0200 Subject: datagram: remove rendundant 'peeked' argument After commit a297569fe00a ("net/udp: do not touch skb->peeked unless really needed") the 'peeked' argument of __skb_try_recv_datagram() and friends is always equal to !!'flags & MSG_PEEK'. Since such argument is really a boolean info, and the callers have already 'flags & MSG_PEEK' handy, we can remove it and clean-up the code a bit. Signed-off-by: Paolo Abeni Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/skbuff.h | 6 +++--- include/net/udp.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 69b5538adcea..a06275a618f0 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3370,17 +3370,17 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, unsigned int flags, void (*destructor)(struct sock *sk, struct sk_buff *skb), - int *peeked, int *off, int *err, + int *off, int *err, struct sk_buff **last); struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned flags, void (*destructor)(struct sock *sk, struct sk_buff *skb), - int *peeked, int *off, int *err, + int *off, int *err, struct sk_buff **last); struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned flags, void (*destructor)(struct sock *sk, struct sk_buff *skb), - int *peeked, int *off, int *err); + int *off, int *err); struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock, int *err); __poll_t datagram_poll(struct file *file, struct socket *sock, diff --git a/include/net/udp.h b/include/net/udp.h index fd6d948755c8..d8ce937bc395 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -269,13 +269,13 @@ void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len); int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb); void udp_skb_destructor(struct sock *sk, struct sk_buff *skb); struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, - int noblock, int *peeked, int *off, int *err); + int noblock, int *off, int *err); static inline struct sk_buff *skb_recv_udp(struct sock *sk, unsigned int flags, int noblock, int *err) { - int peeked, off = 0; + int off = 0; - return __skb_recv_udp(sk, flags, noblock, &peeked, &off, err); + return __skb_recv_udp(sk, flags, noblock, &off, err); } int udp_v4_early_demux(struct sk_buff *skb); -- cgit v1.2.3 From 3b15d09f7e6db44065aaba5fd16dc7420035c5ad Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Thu, 28 Feb 2019 13:13:26 +0800 Subject: time: Introduce jiffies64_to_msecs() there is a similar helper in net/netfilter/nf_tables_api.c, this maybe become a common request someday, so move it to time.c Signed-off-by: Zhang Yu Signed-off-by: Li RongQing Acked-by: John Stultz Signed-off-by: Pablo Neira Ayuso --- include/linux/jiffies.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index fa928242567d..1b6d31da7cbc 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -297,6 +297,7 @@ static inline u64 jiffies_to_nsecs(const unsigned long j) } extern u64 jiffies64_to_nsecs(u64 j); +extern u64 jiffies64_to_msecs(u64 j); extern unsigned long __msecs_to_jiffies(const unsigned int m); #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) -- cgit v1.2.3 From 84c0d5e96f3ae20344fb3a79161eab18905dae56 Mon Sep 17 00:00:00 2001 From: Jacky Hu Date: Tue, 26 Mar 2019 18:31:21 +0800 Subject: ipvs: allow tunneling with gue encapsulation ipip packets are blocked in some public cloud environments, this patch allows gue encapsulation with the tunneling method, which would make tunneling working in those environments. Signed-off-by: Jacky Hu Acked-by: Julian Anastasov Signed-off-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 5 +++++ include/uapi/linux/ip_vs.h | 11 +++++++++++ 2 files changed, 16 insertions(+) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 047f9a5ccaad..2ac40135b576 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -600,6 +600,9 @@ struct ip_vs_dest_user_kern { /* Address family of addr */ u16 af; + + u16 tun_type; /* tunnel type */ + __be16 tun_port; /* tunnel port */ }; @@ -660,6 +663,8 @@ struct ip_vs_dest { atomic_t conn_flags; /* flags to copy to conn */ atomic_t weight; /* server weight */ atomic_t last_weight; /* server latest weight */ + __u16 tun_type; /* tunnel type */ + __be16 tun_port; /* tunnel port */ refcount_t refcnt; /* reference counter */ struct ip_vs_stats stats; /* statistics */ diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h index 1c916b2f89dc..e34f436fc79d 100644 --- a/include/uapi/linux/ip_vs.h +++ b/include/uapi/linux/ip_vs.h @@ -124,6 +124,13 @@ #define IP_VS_PEDATA_MAXLEN 255 +/* Tunnel types */ +enum { + IP_VS_CONN_F_TUNNEL_TYPE_IPIP = 0, /* IPIP */ + IP_VS_CONN_F_TUNNEL_TYPE_GUE, /* GUE */ + IP_VS_CONN_F_TUNNEL_TYPE_MAX, +}; + /* * The struct ip_vs_service_user and struct ip_vs_dest_user are * used to set IPVS rules through setsockopt. @@ -392,6 +399,10 @@ enum { IPVS_DEST_ATTR_STATS64, /* nested attribute for dest stats */ + IPVS_DEST_ATTR_TUN_TYPE, /* tunnel type */ + + IPVS_DEST_ATTR_TUN_PORT, /* tunnel port */ + __IPVS_DEST_ATTR_MAX, }; -- cgit v1.2.3 From 01902f8c85bfde343a4c2b7428d18762442f3a25 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Tue, 26 Mar 2019 20:06:20 +0800 Subject: netfilter: optimize nf_inet_addr_cmp optimize nf_inet_addr_cmp by 64bit xor computation similar to ipv6_addr_equal() Signed-off-by: Yuan Linsi Signed-off-by: Li RongQing Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 72cb19c3db6a..4e0145ea033e 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -24,10 +24,17 @@ static inline int NF_DROP_GETERR(int verdict) static inline int nf_inet_addr_cmp(const union nf_inet_addr *a1, const union nf_inet_addr *a2) { +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 + const unsigned long *ul1 = (const unsigned long *)a1; + const unsigned long *ul2 = (const unsigned long *)a2; + + return ((ul1[0] ^ ul2[0]) | (ul1[1] ^ ul2[1])) == 0UL; +#else return a1->all[0] == a2->all[0] && a1->all[1] == a2->all[1] && a1->all[2] == a2->all[2] && a1->all[3] == a2->all[3]; +#endif } static inline void nf_inet_addr_mask(const union nf_inet_addr *a1, -- cgit v1.2.3 From d164385ec572cbe3335a635ac308760e126d4ec0 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 27 Mar 2019 09:22:24 +0100 Subject: netfilter: nat: add inet family nat support We need minimal support from the nat core for this, as we do not want to register additional base hooks. When an inet hook is registered, interally register ipv4 and ipv6 hooks for them and unregister those when inet hooks are removed. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_nat.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h index cf332c4e0b32..423cda2c6542 100644 --- a/include/net/netfilter/nf_nat.h +++ b/include/net/netfilter/nf_nat.h @@ -69,9 +69,9 @@ static inline bool nf_nat_oif_changed(unsigned int hooknum, #endif } -int nf_nat_register_fn(struct net *net, const struct nf_hook_ops *ops, +int nf_nat_register_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops, const struct nf_hook_ops *nat_ops, unsigned int ops_count); -void nf_nat_unregister_fn(struct net *net, const struct nf_hook_ops *ops, +void nf_nat_unregister_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops, unsigned int ops_count); unsigned int nf_nat_packet(struct nf_conn *ct, enum ip_conntrack_info ctinfo, @@ -98,6 +98,9 @@ void nf_nat_ipv4_unregister_fn(struct net *net, const struct nf_hook_ops *ops); int nf_nat_ipv6_register_fn(struct net *net, const struct nf_hook_ops *ops); void nf_nat_ipv6_unregister_fn(struct net *net, const struct nf_hook_ops *ops); +int nf_nat_inet_register_fn(struct net *net, const struct nf_hook_ops *ops); +void nf_nat_inet_unregister_fn(struct net *net, const struct nf_hook_ops *ops); + unsigned int nf_nat_inet_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state); -- cgit v1.2.3 From c1deb065cf3b5bcd483e3f03479f930edb151b99 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 27 Mar 2019 09:22:25 +0100 Subject: netfilter: nf_tables: merge route type into core very little code, so it really doesn't make sense to have extra modules or even a kconfig knob for this. Merge them and make functionality available unconditionally. The merge makes inet family route support trivial, so add it as well here. Before: text data bss dec hex filename 835 832 0 1667 683 nft_chain_route_ipv4.ko 870 832 0 1702 6a6 nft_chain_route_ipv6.ko 111568 2556 529 114653 1bfdd nf_tables.ko After: text data bss dec hex filename 113133 2556 529 116218 1c5fa nf_tables.ko Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_ipv6.h | 15 +++++++++++++++ include/net/netfilter/nf_tables.h | 2 ++ 2 files changed, 17 insertions(+) (limited to 'include') diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h index 471e9467105b..12113e502656 100644 --- a/include/linux/netfilter_ipv6.h +++ b/include/linux/netfilter_ipv6.h @@ -87,6 +87,21 @@ static inline int nf_ip6_route(struct net *net, struct dst_entry **dst, } int ip6_route_me_harder(struct net *net, struct sk_buff *skb); + +static inline int nf_ip6_route_me_harder(struct net *net, struct sk_buff *skb) +{ +#if IS_MODULE(CONFIG_IPV6) + const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops(); + + if (!v6_ops) + return -EHOSTUNREACH; + + return v6_ops->route_me_harder(net, skb); +#else + return ip6_route_me_harder(net, skb); +#endif +} + __sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, u_int8_t protocol); diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 3e9ab643eedf..55dff3ab44c7 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1411,4 +1411,6 @@ struct nft_trans_flowtable { int __init nft_chain_filter_init(void); void nft_chain_filter_fini(void); +void __init nft_chain_route_init(void); +void nft_chain_route_fini(void); #endif /* _NET_NF_TABLES_H */ -- cgit v1.2.3 From 4806e975729f99c7908d1688a143f1e16d464e6c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 27 Mar 2019 09:22:26 +0100 Subject: netfilter: replace NF_NAT_NEEDED with IS_ENABLED(CONFIG_NF_NAT) NF_NAT_NEEDED is true whenever nat support for either ipv4 or ipv6 is enabled. Now that the af-specific nat configuration switches have been removed, IS_ENABLED(CONFIG_NF_NAT) has the same effect. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 2 +- include/net/netfilter/nf_conntrack_expect.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 4e0145ea033e..a7252f3baeb0 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -367,7 +367,7 @@ extern struct nf_nat_hook __rcu *nf_nat_hook; static inline void nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family) { -#ifdef CONFIG_NF_NAT_NEEDED +#if IS_ENABLED(CONFIG_NF_NAT) struct nf_nat_hook *nat_hook; rcu_read_lock(); diff --git a/include/net/netfilter/nf_conntrack_expect.h b/include/net/netfilter/nf_conntrack_expect.h index 006e430d1cdf..93ce6b0daaba 100644 --- a/include/net/netfilter/nf_conntrack_expect.h +++ b/include/net/netfilter/nf_conntrack_expect.h @@ -48,7 +48,7 @@ struct nf_conntrack_expect { /* Expectation class */ unsigned int class; -#ifdef CONFIG_NF_NAT_NEEDED +#if IS_ENABLED(CONFIG_NF_NAT) union nf_inet_addr saved_addr; /* This is the original per-proto part, used to map the * expected connection the way the recipient expects. */ -- cgit v1.2.3 From 22c7652cdaa8cd33ce78bacceb4e826a3f795873 Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Wed, 27 Mar 2019 11:36:26 +0100 Subject: netfilter: nft_osf: Add version option support Add version option support to the nftables "osf" expression. Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nfnetlink_osf.h | 11 ++++++++--- include/uapi/linux/netfilter/nf_tables.h | 6 ++++++ 2 files changed, 14 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/nfnetlink_osf.h b/include/linux/netfilter/nfnetlink_osf.h index c6000046c966..788613f36935 100644 --- a/include/linux/netfilter/nfnetlink_osf.h +++ b/include/linux/netfilter/nfnetlink_osf.h @@ -21,13 +21,18 @@ struct nf_osf_finger { struct nf_osf_user_finger finger; }; +struct nf_osf_data { + const char *genre; + const char *version; +}; + bool nf_osf_match(const struct sk_buff *skb, u_int8_t family, int hooknum, struct net_device *in, struct net_device *out, const struct nf_osf_info *info, struct net *net, const struct list_head *nf_osf_fingers); -const char *nf_osf_find(const struct sk_buff *skb, - const struct list_head *nf_osf_fingers, - const int ttl_check); +bool nf_osf_find(const struct sk_buff *skb, + const struct list_head *nf_osf_fingers, + const int ttl_check, struct nf_osf_data *data); #endif /* _NFOSF_H */ diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index a66c8de006cc..061bb3eb20c3 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1522,15 +1522,21 @@ enum nft_flowtable_hook_attributes { * * @NFTA_OSF_DREG: destination register (NLA_U32: nft_registers) * @NFTA_OSF_TTL: Value of the TTL osf option (NLA_U8) + * @NFTA_OSF_FLAGS: flags (NLA_U32) */ enum nft_osf_attributes { NFTA_OSF_UNSPEC, NFTA_OSF_DREG, NFTA_OSF_TTL, + NFTA_OSF_FLAGS, __NFTA_OSF_MAX, }; #define NFTA_OSF_MAX (__NFTA_OSF_MAX - 1) +enum nft_osf_flags { + NFT_OSF_F_VERSION = (1 << 0), +}; + /** * enum nft_device_attributes - nf_tables device netlink attributes * -- cgit v1.2.3 From 3b0a081db1f730373993c7a27936778402a3322c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 4 Apr 2019 10:58:20 +0200 Subject: netfilter: make two functions static They have no external callers anymore. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 1 - include/net/netfilter/nf_tables.h | 2 -- 2 files changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index bf384b3eedb8..1f852ef7b098 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -317,7 +317,6 @@ struct xt_table_info *xt_replace_table(struct xt_table *table, int *error); struct xt_match *xt_find_match(u8 af, const char *name, u8 revision); -struct xt_target *xt_find_target(u8 af, const char *name, u8 revision); struct xt_match *xt_request_find_match(u8 af, const char *name, u8 revision); struct xt_target *xt_request_find_target(u8 af, const char *name, u8 revision); int xt_find_revision(u8 af, const char *name, u8 revision, int target, diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 55dff3ab44c7..2d5a0a1a87b8 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -475,8 +475,6 @@ void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set, enum nft_trans_phase phase); int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *binding); -void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set, - struct nft_set_binding *binding, bool commit); void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set); /** -- cgit v1.2.3 From 3b8b11f96616c2e763ebcc093b778b309fc07a92 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 5 Apr 2019 21:23:13 +0200 Subject: net: phy: improve link partner capability detection genphy_read_status() so far checks phydev->supported, not the actual PHY capabilities. This can make a difference if the supported speeds have been limited by of_set_phy_supported() or phy_set_max_speed(). It seems that this issue only affects the link partner advertisements as displayed by ethtool. Also this patch wouldn't apply to older kernels because linkmode bitmaps have been introduced recently. Therefore net-next. Signed-off-by: Heiner Kallweit Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/phy.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index ab7439b3da2b..0f9552b17ee7 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -345,6 +345,7 @@ struct phy_c45_device_ids { * is_c45: Set to true if this phy uses clause 45 addressing. * is_internal: Set to true if this phy is internal to a MAC. * is_pseudo_fixed_link: Set to true if this phy is an Ethernet switch, etc. + * is_gigabit_capable: Set to true if PHY supports 1000Mbps * has_fixups: Set to true if this phy has fixups/quirks. * suspended: Set to true if this phy has been suspended successfully. * sysfs_links: Internal boolean tracking sysfs symbolic links setup/removal. @@ -382,6 +383,7 @@ struct phy_device { unsigned is_c45:1; unsigned is_internal:1; unsigned is_pseudo_fixed_link:1; + unsigned is_gigabit_capable:1; unsigned has_fixups:1; unsigned suspended:1; unsigned sysfs_links:1; -- cgit v1.2.3 From 1aefd3de7bc667115bb77cb0bc21e874c7e190fc Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 5 Apr 2019 16:30:24 -0700 Subject: ipv6: Add fib6_nh_init and release to stubs Add fib6_nh_init and fib6_nh_release to ipv6_stubs. If fib6_nh_init fails, callers should not invoke fib6_nh_release, so there is no reason to have a dummy stub for the IPv6 is not enabled case. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/ipv6_stubs.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h index d8d9c0b0e8c0..453b55bf6723 100644 --- a/include/net/ipv6_stubs.h +++ b/include/net/ipv6_stubs.h @@ -12,6 +12,8 @@ /* structs from net/ip6_fib.h */ struct fib6_info; +struct fib6_nh; +struct fib6_config; /* This is ugly, ideally these symbols should be built * into the core kernel. @@ -40,6 +42,10 @@ struct ipv6_stub { u32 (*ip6_mtu_from_fib6)(struct fib6_info *f6i, struct in6_addr *daddr, struct in6_addr *saddr); + int (*fib6_nh_init)(struct net *net, struct fib6_nh *fib6_nh, + struct fib6_config *cfg, gfp_t gfp_flags, + struct netlink_ext_ack *extack); + void (*fib6_nh_release)(struct fib6_nh *fib6_nh); void (*udpv6_encap_enable)(void); void (*ndisc_send_na)(struct net_device *dev, const struct in6_addr *daddr, const struct in6_addr *solicited_addr, -- cgit v1.2.3 From 71df5777aaaeff673c242a49b945b1b96fe81718 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 5 Apr 2019 16:30:25 -0700 Subject: ipv6: Add neighbor helpers that use the ipv6 stub Add ipv6 helpers to handle ndisc references via the stub. Update bpf_ipv6_fib_lookup to use __ipv6_neigh_lookup_noref_stub instead of the open code ___neigh_lookup_noref with the stub. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/ndisc.h | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'include') diff --git a/include/net/ndisc.h b/include/net/ndisc.h index ddfbb591e2c5..366150053043 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -2,6 +2,8 @@ #ifndef _NDISC_H #define _NDISC_H +#include + /* * ICMP codes for neighbour discovery messages */ @@ -379,6 +381,14 @@ static inline struct neighbour *__ipv6_neigh_lookup_noref(struct net_device *dev return ___neigh_lookup_noref(&nd_tbl, neigh_key_eq128, ndisc_hashfn, pkey, dev); } +static inline +struct neighbour *__ipv6_neigh_lookup_noref_stub(struct net_device *dev, + const void *pkey) +{ + return ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128, + ndisc_hashfn, pkey, dev); +} + static inline struct neighbour *__ipv6_neigh_lookup(struct net_device *dev, const void *pkey) { struct neighbour *n; @@ -409,6 +419,36 @@ static inline void __ipv6_confirm_neigh(struct net_device *dev, rcu_read_unlock_bh(); } +static inline void __ipv6_confirm_neigh_stub(struct net_device *dev, + const void *pkey) +{ + struct neighbour *n; + + rcu_read_lock_bh(); + n = __ipv6_neigh_lookup_noref_stub(dev, pkey); + if (n) { + unsigned long now = jiffies; + + /* avoid dirtying neighbour */ + if (n->confirmed != now) + n->confirmed = now; + } + rcu_read_unlock_bh(); +} + +/* uses ipv6_stub and is meant for use outside of IPv6 core */ +static inline struct neighbour *ip_neigh_gw6(struct net_device *dev, + const void *addr) +{ + struct neighbour *neigh; + + neigh = __ipv6_neigh_lookup_noref_stub(dev, addr); + if (unlikely(!neigh)) + neigh = __neigh_create(ipv6_stub->nd_tbl, addr, dev, false); + + return neigh; +} + int ndisc_init(void); int ndisc_late_init(void); -- cgit v1.2.3 From bdf004677107e3b847c5db09c9fbf8edefa24996 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 5 Apr 2019 16:30:26 -0700 Subject: net: Replace nhc_has_gw with nhc_gw_family Allow the gateway in a fib_nh_common to be from a different address family than the outer fib{6}_nh. To that end, replace nhc_has_gw with nhc_gw_family and update users of nhc_has_gw to check nhc_gw_family. Now nhc_family is used to know if the nh_common is part of a fib_nh or fib6_nh (used for container_of to get to route family specific data), and nhc_gw_family represents the address family for the gateway. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/ip6_route.h | 2 +- include/net/ip_fib.h | 7 +++---- include/trace/events/fib.h | 4 ++-- 3 files changed, 6 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 342180a7285c..5909fc421305 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -69,7 +69,7 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr) static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i) { return !(f6i->fib6_flags & (RTF_ADDRCONF|RTF_DYNAMIC)) && - f6i->fib6_nh.fib_nh_has_gw; + f6i->fib6_nh.fib_nh_gw_family; } void ip6_route_input(struct sk_buff *skb); diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 3ce07841dc3b..c68a40435ee0 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -83,8 +83,8 @@ struct fib_nh_common { struct lwtunnel_state *nhc_lwtstate; unsigned char nhc_scope; u8 nhc_family; - u8 nhc_has_gw:1, - unused:7; + u8 nhc_gw_family; + union { __be32 ipv4; struct in6_addr ipv6; @@ -112,8 +112,7 @@ struct fib_nh { #define fib_nh_flags nh_common.nhc_flags #define fib_nh_lws nh_common.nhc_lwtstate #define fib_nh_scope nh_common.nhc_scope -#define fib_nh_family nh_common.nhc_family -#define fib_nh_has_gw nh_common.nhc_has_gw +#define fib_nh_gw_family nh_common.nhc_gw_family #define fib_nh_gw4 nh_common.nhc_gw.ipv4 #define fib_nh_gw6 nh_common.nhc_gw.ipv6 #define fib_nh_weight nh_common.nhc_weight diff --git a/include/trace/events/fib.h b/include/trace/events/fib.h index 7f83b6eafc5c..6f2a4dc35e37 100644 --- a/include/trace/events/fib.h +++ b/include/trace/events/fib.h @@ -69,13 +69,13 @@ TRACE_EVENT(fib_table_lookup, __assign_str(name, dev ? dev->name : "-"); if (nhc) { - if (nhc->nhc_family == AF_INET) { + if (nhc->nhc_gw_family == AF_INET) { p32 = (__be32 *) __entry->gw4; *p32 = nhc->nhc_gw.ipv4; in6 = (struct in6_addr *)__entry->gw6; *in6 = in6_zero; - } else if (nhc->nhc_family == AF_INET6) { + } else if (nhc->nhc_gw_family == AF_INET6) { p32 = (__be32 *) __entry->gw4; *p32 = 0; -- cgit v1.2.3 From 1550c171935d264f522581fd037db5e64a716bb6 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 5 Apr 2019 16:30:27 -0700 Subject: ipv4: Prepare rtable for IPv6 gateway To allow the gateway to be either an IPv4 or IPv6 address, remove rt_uses_gateway from rtable and replace with rt_gw_family. If rt_gw_family is set it implies rt_uses_gateway. Rename rt_gateway to rt_gw4 to represent the IPv4 version. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/route.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/route.h b/include/net/route.h index 9883dc82f723..96912b099c08 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -55,12 +55,12 @@ struct rtable { unsigned int rt_flags; __u16 rt_type; __u8 rt_is_input; - __u8 rt_uses_gateway; + u8 rt_gw_family; int rt_iif; /* Info on neighbour */ - __be32 rt_gateway; + __be32 rt_gw4; /* Miscellaneous cached information */ u32 rt_mtu_locked:1, @@ -82,8 +82,8 @@ static inline bool rt_is_output_route(const struct rtable *rt) static inline __be32 rt_nexthop(const struct rtable *rt, __be32 daddr) { - if (rt->rt_gateway) - return rt->rt_gateway; + if (rt->rt_gw_family == AF_INET) + return rt->rt_gw4; return daddr; } -- cgit v1.2.3 From f35b794b3b405e2478654ea875bc0b29fe1a1bc5 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 5 Apr 2019 16:30:28 -0700 Subject: ipv4: Prepare fib_config for IPv6 gateway Similar to rtable, fib_config needs to allow the gateway to be either an IPv4 or an IPv6 address. To that end, rename fc_gw to fc_gw4 to mean an IPv4 address and add fc_gw_family. Checks on 'is a gateway set' are changed to see if fc_gw_family is set. In the process prepare the code for a fc_gw_family == AF_INET6. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/ip_fib.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index c68a40435ee0..1f72ad553c31 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -32,10 +32,11 @@ struct fib_config { u8 fc_protocol; u8 fc_scope; u8 fc_type; - /* 3 bytes unused */ + u8 fc_gw_family; + /* 2 bytes unused */ u32 fc_table; __be32 fc_dst; - __be32 fc_gw; + __be32 fc_gw4; int fc_oif; u32 fc_flags; u32 fc_priority; -- cgit v1.2.3 From 0f5f7d7bf6e6bda4dffe7b42812a16ada6ea9816 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 5 Apr 2019 16:30:29 -0700 Subject: ipv4: Add support to rtable for ipv6 gateway Add support for an IPv6 gateway to rtable. Since a gateway is either IPv4 or IPv6, make it a union with rt_gw4 where rt_gw_family decides which address is in use. When dumping the route data, encode an ipv6 nexthop using RTA_VIA. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/route.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/route.h b/include/net/route.h index 96912b099c08..5d28a2509b58 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -60,7 +60,10 @@ struct rtable { int rt_iif; /* Info on neighbour */ - __be32 rt_gw4; + union { + __be32 rt_gw4; + struct in6_addr rt_gw6; + }; /* Miscellaneous cached information */ u32 rt_mtu_locked:1, -- cgit v1.2.3 From a4ea5d43c807be28545625c1e0641905022fa0d1 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 5 Apr 2019 16:30:30 -0700 Subject: ipv4: Add support to fib_config for IPv6 gateway Add support for an IPv6 gateway to fib_config. Since a gateway is either IPv4 or IPv6, make it a union with fc_gw4 where fc_gw_family decides which address is in use. Update current checks on family and gw4 to handle ipv6 as well. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/ip_fib.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 1f72ad553c31..f1c452f618a9 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -36,7 +36,10 @@ struct fib_config { /* 2 bytes unused */ u32 fc_table; __be32 fc_dst; - __be32 fc_gw4; + union { + __be32 fc_gw4; + struct in6_addr fc_gw6; + }; int fc_oif; u32 fc_flags; u32 fc_priority; -- cgit v1.2.3 From 0353f28231c79416191326810e7fe656b69c63b7 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 5 Apr 2019 16:30:33 -0700 Subject: neighbor: Add skip_cache argument to neigh_output A later patch allows an IPv6 gateway with an IPv4 route. The neighbor entry will exist in the v6 ndisc table and the cached header will contain the ipv6 protocol which is wrong for an IPv4 packet. For an IPv4 packet to use the v6 neighbor entry, neigh_output needs to skip the cached header and just use the output callback for the neigh entry. A future patchset can look at expanding the hh_cache to handle 2 protocols. For now, IPv6 gateways with an IPv4 route will take the extra overhead of generating the header. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/neighbour.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 7c1ab9edba03..3e5438bd0101 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -498,11 +498,12 @@ static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb return dev_queue_xmit(skb); } -static inline int neigh_output(struct neighbour *n, struct sk_buff *skb) +static inline int neigh_output(struct neighbour *n, struct sk_buff *skb, + bool skip_cache) { const struct hh_cache *hh = &n->hh; - if ((n->nud_state & NUD_CONNECTED) && hh->hh_len) + if ((n->nud_state & NUD_CONNECTED) && hh->hh_len && !skip_cache) return neigh_hh_output(hh, skb); else return n->output(n, skb); -- cgit v1.2.3 From 5c9f7c1dfc2e0776551ef1ceb335187c6698d1ff Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 5 Apr 2019 16:30:34 -0700 Subject: ipv4: Add helpers for neigh lookup for nexthop A common theme in the output path is looking up a neigh entry for a nexthop, either the gateway in an rtable or a fallback to the daddr in the skb: nexthop = (__force u32)rt_nexthop(rt, ip_hdr(skb)->daddr); neigh = __ipv4_neigh_lookup_noref(dev, nexthop); if (unlikely(!neigh)) neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); To allow the nexthop to be an IPv6 address we need to consider the family of the nexthop and then call __ipv{4,6}_neigh_lookup_noref based on it. To make this simpler, add a ip_neigh_gw4 helper similar to ip_neigh_gw6 added in an earlier patch which handles: neigh = __ipv4_neigh_lookup_noref(dev, nexthop); if (unlikely(!neigh)) neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); And then add a second one, ip_neigh_for_gw, that calls either ip_neigh_gw4 or ip_neigh_gw6 based on the address family of the gateway. Update the output paths in the VRF driver and core v4 code to use ip_neigh_for_gw simplifying the family based lookup and making both ready for a v6 nexthop. ipv4_neigh_lookup has a different need - the potential to resolve a passed in address in addition to any gateway in the rtable or skb. Since this is a one-off, add ip_neigh_gw4 and ip_neigh_gw6 diectly. The difference between __neigh_create used by the helpers and neigh_create called by ipv4_neigh_lookup is taking a refcount, so add rcu_read_lock_bh and bump the refcnt on the neigh entry. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/route.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include') diff --git a/include/net/route.h b/include/net/route.h index 5d28a2509b58..96f6c9ae33c2 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -29,6 +29,8 @@ #include #include #include +#include +#include #include #include #include @@ -350,4 +352,34 @@ static inline int ip4_dst_hoplimit(const struct dst_entry *dst) return hoplimit; } +static inline struct neighbour *ip_neigh_gw4(struct net_device *dev, + __be32 daddr) +{ + struct neighbour *neigh; + + neigh = __ipv4_neigh_lookup_noref(dev, daddr); + if (unlikely(!neigh)) + neigh = __neigh_create(&arp_tbl, &daddr, dev, false); + + return neigh; +} + +static inline struct neighbour *ip_neigh_for_gw(struct rtable *rt, + struct sk_buff *skb, + bool *is_v6gw) +{ + struct net_device *dev = rt->dst.dev; + struct neighbour *neigh; + + if (likely(rt->rt_gw_family == AF_INET)) { + neigh = ip_neigh_gw4(dev, rt->rt_gw4); + } else if (rt->rt_gw_family == AF_INET6) { + neigh = ip_neigh_gw6(dev, &rt->rt_gw6); + *is_v6gw = true; + } else { + neigh = ip_neigh_gw4(dev, ip_hdr(skb)->daddr); + } + return neigh; +} + #endif /* _ROUTE_H */ -- cgit v1.2.3 From 19a9d136f198cd7c4e26ea6897a0cf067d3f7ecb Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 5 Apr 2019 16:30:39 -0700 Subject: ipv4: Flag fib_info with a fib_nh using IPv6 gateway Until support is added to the offload drivers, they need to be able to reject routes with an IPv6 gateway. To that end add a flag to fib_info that indicates if any fib_nh has a v6 gateway. The flag allows the drivers to efficiently know the use of a v6 gateway without walking all fib_nh tied to a fib_info each time a route is added. Update mlxsw and rocker to reject the routes with extack message as to why. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip_fib.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index f1c452f618a9..337106469ec5 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -147,6 +147,7 @@ struct fib_info { #define fib_rtt fib_metrics->metrics[RTAX_RTT-1] #define fib_advmss fib_metrics->metrics[RTAX_ADVMSS-1] int fib_nhs; + bool fib_nh_is_v6; struct rcu_head rcu; struct fib_nh fib_nh[0]; #define fib_dev fib_nh[0].fib_nh_dev -- cgit v1.2.3 From d15662682db232da77136cd348f4c9df312ca6f9 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 5 Apr 2019 16:30:40 -0700 Subject: ipv4: Allow ipv6 gateway with ipv4 routes Add support for RTA_VIA and allow an IPv6 nexthop for v4 routes: $ ip ro add 172.16.1.0/24 via inet6 2001:db8::1 dev eth0 $ ip ro ls ... 172.16.1.0/24 via inet6 2001:db8::1 dev eth0 For convenience and simplicity, userspace can use RTA_VIA to specify AF_INET or AF_INET6 gateway. The common fib_nexthop_info dump function compares the gateway address family to the nh_common family to know if the gateway should be encoded as RTA_VIA or RTA_GATEWAY. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip_fib.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 337106469ec5..d8195c77e247 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -401,6 +401,8 @@ static inline bool fib4_rules_early_flow_dissect(struct net *net, /* Exported by fib_frontend.c */ extern const struct nla_policy rtm_ipv4_policy[]; void ip_fib_init(void); +int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla, + struct netlink_ext_ack *extack); __be32 fib_compute_spec_dst(struct sk_buff *skb); bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev); int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, -- cgit v1.2.3 From d8eca5bbb2be9bc7546f9e733786fa2f1a594c67 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Apr 2019 23:20:03 +0200 Subject: bpf: implement lookup-free direct value access for maps This generic extension to BPF maps allows for directly loading an address residing inside a BPF map value as a single BPF ldimm64 instruction! The idea is similar to what BPF_PSEUDO_MAP_FD does today, which is a special src_reg flag for ldimm64 instruction that indicates that inside the first part of the double insns's imm field is a file descriptor which the verifier then replaces as a full 64bit address of the map into both imm parts. For the newly added BPF_PSEUDO_MAP_VALUE src_reg flag, the idea is the following: the first part of the double insns's imm field is again a file descriptor corresponding to the map, and the second part of the imm field is an offset into the value. The verifier will then replace both imm parts with an address that points into the BPF map value at the given value offset for maps that support this operation. Currently supported is array map with single entry. It is possible to support more than just single map element by reusing both 16bit off fields of the insns as a map index, so full array map lookup could be expressed that way. It hasn't been implemented here due to lack of concrete use case, but could easily be done so in future in a compatible way, since both off fields right now have to be 0 and would correctly denote a map index 0. The BPF_PSEUDO_MAP_VALUE is a distinct flag as otherwise with BPF_PSEUDO_MAP_FD we could not differ offset 0 between load of map pointer versus load of map's value at offset 0, and changing BPF_PSEUDO_MAP_FD's encoding into off by one to differ between regular map pointer and map value pointer would add unnecessary complexity and increases barrier for debugability thus less suitable. Using the second part of the imm field as an offset into the value does /not/ come with limitations since maximum possible value size is in u32 universe anyway. This optimization allows for efficiently retrieving an address to a map value memory area without having to issue a helper call which needs to prepare registers according to calling convention, etc, without needing the extra NULL test, and without having to add the offset in an additional instruction to the value base pointer. The verifier then treats the destination register as PTR_TO_MAP_VALUE with constant reg->off from the user passed offset from the second imm field, and guarantees that this is within bounds of the map value. Any subsequent operations are normally treated as typical map value handling without anything extra needed from verification side. The two map operations for direct value access have been added to array map for now. In future other types could be supported as well depending on the use case. The main use case for this commit is to allow for BPF loader support for global variables that reside in .data/.rodata/.bss sections such that we can directly load the address of them with minimal additional infrastructure required. Loader support has been added in subsequent commits for libbpf library. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 ++++++ include/linux/bpf_verifier.h | 4 ++++ include/uapi/linux/bpf.h | 13 ++++++++++++- 3 files changed, 22 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a445194b5fb6..bd93a592dd29 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -57,6 +57,12 @@ struct bpf_map_ops { const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type); + + /* Direct value access helpers. */ + int (*map_direct_value_addr)(const struct bpf_map *map, + u64 *imm, u32 off); + int (*map_direct_value_meta)(const struct bpf_map *map, + u64 imm, u32 *off); }; struct bpf_map { diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index fc8254d6b569..b3ab61fe1932 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -224,6 +224,10 @@ struct bpf_insn_aux_data { unsigned long map_state; /* pointer/poison value for maps */ s32 call_imm; /* saved imm field of call insn */ u32 alu_limit; /* limit for add/sub register with pointer */ + struct { + u32 map_index; /* index into used_maps[] */ + u32 map_off; /* offset from value base address */ + }; }; int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ int sanitize_stack_off; /* stack slot to be cleared */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 837024512baf..26cfb5b2c964 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -255,8 +255,19 @@ enum bpf_attach_type { */ #define BPF_F_ANY_ALIGNMENT (1U << 1) -/* when bpf_ldimm64->src_reg == BPF_PSEUDO_MAP_FD, bpf_ldimm64->imm == fd */ +/* When BPF ldimm64's insn[0].src_reg != 0 then this can have + * two extensions: + * + * insn[0].src_reg: BPF_PSEUDO_MAP_FD BPF_PSEUDO_MAP_VALUE + * insn[0].imm: map fd map fd + * insn[1].imm: 0 offset into value + * insn[0].off: 0 0 + * insn[1].off: 0 0 + * ldimm64 rewrite: address of map address of map[0]+offset + * verifier type: CONST_PTR_TO_MAP PTR_TO_MAP_VALUE + */ #define BPF_PSEUDO_MAP_FD 1 +#define BPF_PSEUDO_MAP_VALUE 2 /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative * offset to another bpf function -- cgit v1.2.3 From 591fe9888d7809d9ee5c828020b6c6ae27c37229 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Apr 2019 23:20:05 +0200 Subject: bpf: add program side {rd, wr}only support for maps This work adds two new map creation flags BPF_F_RDONLY_PROG and BPF_F_WRONLY_PROG in order to allow for read-only or write-only BPF maps from a BPF program side. Today we have BPF_F_RDONLY and BPF_F_WRONLY, but this only applies to system call side, meaning the BPF program has full read/write access to the map as usual while bpf(2) calls with map fd can either only read or write into the map depending on the flags. BPF_F_RDONLY_PROG and BPF_F_WRONLY_PROG allows for the exact opposite such that verifier is going to reject program loads if write into a read-only map or a read into a write-only map is detected. For read-only map case also some helpers are forbidden for programs that would alter the map state such as map deletion, update, etc. As opposed to the two BPF_F_RDONLY / BPF_F_WRONLY flags, BPF_F_RDONLY_PROG as well as BPF_F_WRONLY_PROG really do correspond to the map lifetime. We've enabled this generic map extension to various non-special maps holding normal user data: array, hash, lru, lpm, local storage, queue and stack. Further generic map types could be followed up in future depending on use-case. Main use case here is to forbid writes into .rodata map values from verifier side. Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 29 +++++++++++++++++++++++++++++ include/uapi/linux/bpf.h | 6 +++++- 2 files changed, 34 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index bd93a592dd29..be20804631b5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -430,6 +430,35 @@ struct bpf_array { #define BPF_COMPLEXITY_LIMIT_INSNS 1000000 /* yes. 1M insns */ #define MAX_TAIL_CALL_CNT 32 +#define BPF_F_ACCESS_MASK (BPF_F_RDONLY | \ + BPF_F_RDONLY_PROG | \ + BPF_F_WRONLY | \ + BPF_F_WRONLY_PROG) + +#define BPF_MAP_CAN_READ BIT(0) +#define BPF_MAP_CAN_WRITE BIT(1) + +static inline u32 bpf_map_flags_to_cap(struct bpf_map *map) +{ + u32 access_flags = map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG); + + /* Combination of BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG is + * not possible. + */ + if (access_flags & BPF_F_RDONLY_PROG) + return BPF_MAP_CAN_READ; + else if (access_flags & BPF_F_WRONLY_PROG) + return BPF_MAP_CAN_WRITE; + else + return BPF_MAP_CAN_READ | BPF_MAP_CAN_WRITE; +} + +static inline bool bpf_map_flags_access_ok(u32 access_flags) +{ + return (access_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) != + (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG); +} + struct bpf_event_entry { struct perf_event *event; struct file *perf_file; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 26cfb5b2c964..d275446d807c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -294,7 +294,7 @@ enum bpf_attach_type { #define BPF_OBJ_NAME_LEN 16U -/* Flags for accessing BPF object */ +/* Flags for accessing BPF object from syscall side. */ #define BPF_F_RDONLY (1U << 3) #define BPF_F_WRONLY (1U << 4) @@ -304,6 +304,10 @@ enum bpf_attach_type { /* Zero-initialize hash function seed. This should only be used for testing. */ #define BPF_F_ZERO_SEED (1U << 6) +/* Flags for accessing BPF object from program side. */ +#define BPF_F_RDONLY_PROG (1U << 7) +#define BPF_F_WRONLY_PROG (1U << 8) + /* flags for BPF_PROG_QUERY */ #define BPF_F_QUERY_EFFECTIVE (1U << 0) -- cgit v1.2.3 From 87df15de441bd4add7876ef584da8cabdd9a042a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Apr 2019 23:20:06 +0200 Subject: bpf: add syscall side map freeze support This patch adds a new BPF_MAP_FREEZE command which allows to "freeze" the map globally as read-only / immutable from syscall side. Map permission handling has been refactored into map_get_sys_perms() and drops FMODE_CAN_WRITE in case of locked map. Main use case is to allow for setting up .rodata sections from the BPF ELF which are loaded into the kernel, meaning BPF loader first allocates map, sets up map value by copying .rodata section into it and once complete, it calls BPF_MAP_FREEZE on the map fd to prevent further modifications. Right now BPF_MAP_FREEZE only takes map fd as argument while remaining bpf_attr members are required to be zero. I didn't add write-only locking here as counterpart since I don't have a concrete use-case for it on my side, and I think it makes probably more sense to wait once there is actually one. In that case bpf_attr can be extended as usual with a flag field and/or others where flag 0 means that we lock the map read-only hence this doesn't prevent to add further extensions to BPF_MAP_FREEZE upon need. A map creation flag like BPF_F_WRONCE was not considered for couple of reasons: i) in case of a generic implementation, a map can consist of more than just one element, thus there could be multiple map updates needed to set the map into a state where it can then be made immutable, ii) WRONCE indicates exact one-time write before it is then set immutable. A generic implementation would set a bit atomically on map update entry (if unset), indicating that every subsequent update from then onwards will need to bail out there. However, map updates can fail, so upon failure that flag would need to be unset again and the update attempt would need to be repeated for it to be eventually made immutable. While this can be made race-free, this approach feels less clean and in combination with reason i), it's not generic enough. A dedicated BPF_MAP_FREEZE command directly sets the flag and caller has the guarantee that map is immutable from syscall side upon successful return for any future syscall invocations that would alter the map state, which is also more intuitive from an API point of view. A command name such as BPF_MAP_LOCK has been avoided as it's too close with BPF map spin locks (which already has BPF_F_LOCK flag). BPF_MAP_FREEZE is so far only enabled for privileged users. Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 ++- include/uapi/linux/bpf.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index be20804631b5..65f7094c40b4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -87,7 +87,8 @@ struct bpf_map { struct btf *btf; u32 pages; bool unpriv_array; - /* 51 bytes hole */ + bool frozen; /* write-once */ + /* 48 bytes hole */ /* The 3rd and 4th cacheline with misc members to avoid false sharing * particularly with refcounting. diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d275446d807c..af1cbd951f26 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -105,6 +105,7 @@ enum bpf_cmd { BPF_BTF_GET_FD_BY_ID, BPF_TASK_FD_QUERY, BPF_MAP_LOOKUP_AND_DELETE_ELEM, + BPF_MAP_FREEZE, }; enum bpf_map_type { -- cgit v1.2.3 From f063c889c9458354a92b235a51cbb60d30321070 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Apr 2019 23:20:08 +0200 Subject: bpf: add specification for BTF Var and DataSec kinds This adds the BTF specification and UAPI bits for supporting BTF Var and DataSec kinds. This is following LLVM upstream commit ac4082b77e07 ("[BPF] Add BTF Var and DataSec Support") which has been merged recently. Var itself is for describing a global variable and DataSec to describe ELF sections e.g. data/bss/rodata sections that hold one or multiple global variables. Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/uapi/linux/btf.h | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index 7b7475ef2f17..9310652ca4f9 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -39,11 +39,11 @@ struct btf_type { * struct, union and fwd */ __u32 info; - /* "size" is used by INT, ENUM, STRUCT and UNION. + /* "size" is used by INT, ENUM, STRUCT, UNION and DATASEC. * "size" tells the size of the type it is describing. * * "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT, - * FUNC and FUNC_PROTO. + * FUNC, FUNC_PROTO and VAR. * "type" is a type_id referring to another type. */ union { @@ -70,8 +70,10 @@ struct btf_type { #define BTF_KIND_RESTRICT 11 /* Restrict */ #define BTF_KIND_FUNC 12 /* Function */ #define BTF_KIND_FUNC_PROTO 13 /* Function Proto */ -#define BTF_KIND_MAX 13 -#define NR_BTF_KINDS 14 +#define BTF_KIND_VAR 14 /* Variable */ +#define BTF_KIND_DATASEC 15 /* Section */ +#define BTF_KIND_MAX BTF_KIND_DATASEC +#define NR_BTF_KINDS (BTF_KIND_MAX + 1) /* For some specific BTF_KIND, "struct btf_type" is immediately * followed by extra data. @@ -138,4 +140,26 @@ struct btf_param { __u32 type; }; +enum { + BTF_VAR_STATIC = 0, + BTF_VAR_GLOBAL_ALLOCATED, +}; + +/* BTF_KIND_VAR is followed by a single "struct btf_var" to describe + * additional information related to the variable such as its linkage. + */ +struct btf_var { + __u32 linkage; +}; + +/* BTF_KIND_DATASEC is followed by multiple "struct btf_var_secinfo" + * to describe all BTF_KIND_VAR types it contains along with it's + * in-section offset as well as size. + */ +struct btf_var_secinfo { + __u32 type; + __u32 offset; + __u32 size; +}; + #endif /* _UAPI__LINUX_BTF_H__ */ -- cgit v1.2.3 From 2824ecb7010f6a20e9a4140512b798469ab066cc Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Apr 2019 23:20:10 +0200 Subject: bpf: allow for key-less BTF in array map Given we'll be reusing BPF array maps for global data/bss/rodata sections, we need a way to associate BTF DataSec type as its map value type. In usual cases we have this ugly BPF_ANNOTATE_KV_PAIR() macro hack e.g. via 38d5d3b3d5db ("bpf: Introduce BPF_ANNOTATE_KV_PAIR") to get initial map to type association going. While more use cases for it are discouraged, this also won't work for global data since the use of array map is a BPF loader detail and therefore unknown at compilation time. For array maps with just a single entry we make an exception in terms of BTF in that key type is declared optional if value type is of DataSec type. The latter LLVM is guaranteed to emit and it also aligns with how we regard global data maps as just a plain buffer area reusing existing map facilities for allowing things like introspection with existing tools. Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/btf.h b/include/linux/btf.h index 455d31b55828..64cdf2a23d42 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -51,6 +51,7 @@ bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, const struct btf_member *m, u32 expected_offset, u32 expected_size); int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t); +bool btf_type_is_void(const struct btf_type *t); #ifdef CONFIG_BPF_SYSCALL const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); -- cgit v1.2.3 From b6d9ccb1125049941590ea895c38e1167badba5f Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Thu, 28 Mar 2019 15:27:31 +0200 Subject: net/mlx5: E-Switch, don't use hardcoded values for FDB prios When creating the FDB prios, use the enum values already defined and not the hardcoded values. Signed-off-by: Mark Bloch Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky --- include/linux/mlx5/fs.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 9df51da04621..3eeb04154317 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -75,6 +75,11 @@ enum mlx5_flow_namespace_type { MLX5_FLOW_NAMESPACE_EGRESS, }; +enum { + FDB_FAST_PATH, + FDB_SLOW_PATH, +}; + struct mlx5_flow_table; struct mlx5_flow_group; struct mlx5_flow_namespace; -- cgit v1.2.3 From d9cb06759eca5a420072b937d2a2a670db474008 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Thu, 28 Mar 2019 15:27:32 +0200 Subject: net/mlx5: E-Switch, add a new prio to be used by the RDMA side Create a new prio in the FDB, it will be used when inserting steering rules into the FDB from the RDMA side. We create a new PRIO so rules from the net side and rules from the RDMA side won't be inserted to the same PRIO, each side has it's own sandbox to play in. Signed-off-by: Mark Bloch Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky --- include/linux/mlx5/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 3eeb04154317..fd91df3a4e09 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -76,6 +76,7 @@ enum mlx5_flow_namespace_type { }; enum { + FDB_BYPASS_PATH, FDB_FAST_PATH, FDB_SLOW_PATH, }; -- cgit v1.2.3 From 1f5e6fdd6aec7929e67afad1e42e35d894a119ae Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 10 Apr 2019 14:32:38 +0200 Subject: net: sched: prefer qdisc_is_empty() over direct qlen access When checking for root qdisc queue length, do not access directly q.qlen. In the following patches we will move back qlen accounting to per CPU values for NOLOCK qdiscs. Instead, prefer the qdisc_is_empty() helper usage. Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/sch_generic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 0aea0e262452..7ecb6127e980 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -747,7 +747,7 @@ static inline bool qdisc_all_tx_empty(const struct net_device *dev) struct netdev_queue *txq = netdev_get_tx_queue(dev, i); const struct Qdisc *q = rcu_dereference(txq->qdisc); - if (q->q.qlen) { + if (!qdisc_is_empty(q)) { rcu_read_unlock(); return false; } -- cgit v1.2.3 From 9c01c9f1f2a3ddbddbf3b233cc6bfa86f5a59af0 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 10 Apr 2019 14:32:39 +0200 Subject: net: sched: always do stats accounting according to TCQ_F_CPUSTATS The core sched implementation checks independently for NOLOCK flag to acquire/release the root spin lock and for qdisc_is_percpu_stats() to account per CPU values in many places. This change update the last few places checking the TCQ_F_NOLOCK to do per CPU stats accounting according to qdisc_is_percpu_stats() value. The above allows to clean dev_requeue_skb() implementation a bit and makes stats update always consistent with a single flag. v1 -> v2: - do not move qdisc_is_empty definition, fix build issue Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/sch_generic.h | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 7ecb6127e980..ed56474cfe3b 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -146,9 +146,14 @@ static inline bool qdisc_is_running(struct Qdisc *qdisc) return (raw_read_seqcount(&qdisc->running) & 1) ? true : false; } +static inline bool qdisc_is_percpu_stats(const struct Qdisc *q) +{ + return q->flags & TCQ_F_CPUSTATS; +} + static inline bool qdisc_is_empty(const struct Qdisc *qdisc) { - if (qdisc->flags & TCQ_F_NOLOCK) + if (qdisc_is_percpu_stats(qdisc)) return qdisc->empty; return !qdisc->q.qlen; } @@ -490,7 +495,7 @@ static inline u32 qdisc_qlen_sum(const struct Qdisc *q) { u32 qlen = q->qstats.qlen; - if (q->flags & TCQ_F_NOLOCK) + if (qdisc_is_percpu_stats(q)) qlen += atomic_read(&q->q.atomic_qlen); else qlen += q->q.qlen; @@ -817,11 +822,6 @@ static inline int qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, return sch->enqueue(skb, sch, to_free); } -static inline bool qdisc_is_percpu_stats(const struct Qdisc *q) -{ - return q->flags & TCQ_F_CPUSTATS; -} - static inline void _bstats_update(struct gnet_stats_basic_packed *bstats, __u64 bytes, __u32 packets) { @@ -1113,8 +1113,13 @@ static inline struct sk_buff *qdisc_dequeue_peeked(struct Qdisc *sch) if (skb) { skb = __skb_dequeue(&sch->gso_skb); - qdisc_qstats_backlog_dec(sch, skb); - sch->q.qlen--; + if (qdisc_is_percpu_stats(sch)) { + qdisc_qstats_cpu_backlog_dec(sch, skb); + qdisc_qstats_atomic_qlen_dec(sch); + } else { + qdisc_qstats_backlog_dec(sch, skb); + sch->q.qlen--; + } } else { skb = sch->dequeue(sch); } -- cgit v1.2.3 From 8a53e616de294873fec1a75ddb77ecb3d225cee0 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 10 Apr 2019 14:32:40 +0200 Subject: net: sched: when clearing NOLOCK, clear TCQ_F_CPUSTATS, too Since stats updating is always consistent with TCQ_F_CPUSTATS flag, we can disable it at qdisc creation time flipping such bit. In my experiments, if the NOLOCK flag is cleared, per CPU stats accounting does not give any measurable performance gain, but it waste some memory. Let's clear TCQ_F_CPUSTATS together with NOLOCK, when enslaving a NOLOCK qdisc to 'lock' one. Use stats update helper inside pfifo_fast, to cope correctly with TCQ_F_CPUSTATS flag change. As a side effect, q.qlen value for any child qdiscs is always consistent for all lock classfull qdiscs. Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/sch_generic.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index ed56474cfe3b..f069011524ba 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -1106,6 +1106,32 @@ static inline struct sk_buff *qdisc_peek_dequeued(struct Qdisc *sch) return skb; } +static inline void qdisc_update_stats_at_dequeue(struct Qdisc *sch, + struct sk_buff *skb) +{ + if (qdisc_is_percpu_stats(sch)) { + qdisc_qstats_cpu_backlog_dec(sch, skb); + qdisc_bstats_cpu_update(sch, skb); + qdisc_qstats_atomic_qlen_dec(sch); + } else { + qdisc_qstats_backlog_dec(sch, skb); + qdisc_bstats_update(sch, skb); + sch->q.qlen--; + } +} + +static inline void qdisc_update_stats_at_enqueue(struct Qdisc *sch, + unsigned int pkt_len) +{ + if (qdisc_is_percpu_stats(sch)) { + qdisc_qstats_atomic_qlen_inc(sch); + this_cpu_add(sch->cpu_qstats->backlog, pkt_len); + } else { + sch->qstats.backlog += pkt_len; + sch->q.qlen++; + } +} + /* use instead of qdisc->dequeue() for all qdiscs queried with ->peek() */ static inline struct sk_buff *qdisc_dequeue_peeked(struct Qdisc *sch) { -- cgit v1.2.3 From 73eb628ddfd3884d1e58a8022de2e78de7807fc6 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 10 Apr 2019 14:32:41 +0200 Subject: Revert: "net: sched: put back q.qlen into a single location" This revert commit 46b1c18f9deb ("net: sched: put back q.qlen into a single location"). After the previous patch, when a NOLOCK qdisc is enslaved to a locking qdisc it switches to global stats accounting. As a consequence, when a classful qdisc accesses directly a child qdisc's qlen, such qdisc is not doing per CPU accounting and qlen value is consistent. In the control path nobody uses directly qlen since commit e5f0e8f8e45 ("net: sched: introduce and use qdisc tree flush/purge helpers"), so we can remove the contented atomic ops from the datapath. v1 -> v2: - complete the qdisc_qstats_atomic_qlen_dec() -> qdisc_qstats_cpu_qlen_dec() replacement, fix build issue - more descriptive commit message Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/sch_generic.h | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index f069011524ba..e8f85cd2afce 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -52,10 +52,7 @@ struct qdisc_size_table { struct qdisc_skb_head { struct sk_buff *head; struct sk_buff *tail; - union { - u32 qlen; - atomic_t atomic_qlen; - }; + __u32 qlen; spinlock_t lock; }; @@ -486,19 +483,27 @@ static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz) BUILD_BUG_ON(sizeof(qcb->data) < sz); } +static inline int qdisc_qlen_cpu(const struct Qdisc *q) +{ + return this_cpu_ptr(q->cpu_qstats)->qlen; +} + static inline int qdisc_qlen(const struct Qdisc *q) { return q->q.qlen; } -static inline u32 qdisc_qlen_sum(const struct Qdisc *q) +static inline int qdisc_qlen_sum(const struct Qdisc *q) { - u32 qlen = q->qstats.qlen; + __u32 qlen = q->qstats.qlen; + int i; - if (qdisc_is_percpu_stats(q)) - qlen += atomic_read(&q->q.atomic_qlen); - else + if (qdisc_is_percpu_stats(q)) { + for_each_possible_cpu(i) + qlen += per_cpu_ptr(q->cpu_qstats, i)->qlen; + } else { qlen += q->q.qlen; + } return qlen; } @@ -889,14 +894,14 @@ static inline void qdisc_qstats_cpu_backlog_inc(struct Qdisc *sch, this_cpu_add(sch->cpu_qstats->backlog, qdisc_pkt_len(skb)); } -static inline void qdisc_qstats_atomic_qlen_inc(struct Qdisc *sch) +static inline void qdisc_qstats_cpu_qlen_inc(struct Qdisc *sch) { - atomic_inc(&sch->q.atomic_qlen); + this_cpu_inc(sch->cpu_qstats->qlen); } -static inline void qdisc_qstats_atomic_qlen_dec(struct Qdisc *sch) +static inline void qdisc_qstats_cpu_qlen_dec(struct Qdisc *sch) { - atomic_dec(&sch->q.atomic_qlen); + this_cpu_dec(sch->cpu_qstats->qlen); } static inline void qdisc_qstats_cpu_requeues_inc(struct Qdisc *sch) @@ -1112,7 +1117,7 @@ static inline void qdisc_update_stats_at_dequeue(struct Qdisc *sch, if (qdisc_is_percpu_stats(sch)) { qdisc_qstats_cpu_backlog_dec(sch, skb); qdisc_bstats_cpu_update(sch, skb); - qdisc_qstats_atomic_qlen_dec(sch); + qdisc_qstats_cpu_qlen_dec(sch); } else { qdisc_qstats_backlog_dec(sch, skb); qdisc_bstats_update(sch, skb); @@ -1124,7 +1129,7 @@ static inline void qdisc_update_stats_at_enqueue(struct Qdisc *sch, unsigned int pkt_len) { if (qdisc_is_percpu_stats(sch)) { - qdisc_qstats_atomic_qlen_inc(sch); + qdisc_qstats_cpu_qlen_inc(sch); this_cpu_add(sch->cpu_qstats->backlog, pkt_len); } else { sch->qstats.backlog += pkt_len; @@ -1141,7 +1146,7 @@ static inline struct sk_buff *qdisc_dequeue_peeked(struct Qdisc *sch) skb = __skb_dequeue(&sch->gso_skb); if (qdisc_is_percpu_stats(sch)) { qdisc_qstats_cpu_backlog_dec(sch, skb); - qdisc_qstats_atomic_qlen_dec(sch); + qdisc_qstats_cpu_qlen_dec(sch); } else { qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; -- cgit v1.2.3 From b0b9395d865e3060d97658fbc9ba3f77fecc8da1 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 9 Apr 2019 11:49:09 -0700 Subject: bpf: support input __sk_buff context in BPF_PROG_TEST_RUN Add new set of arguments to bpf_attr for BPF_PROG_TEST_RUN: * ctx_in/ctx_size_in - input context * ctx_out/ctx_size_out - output context The intended use case is to pass some meta data to the test runs that operate on skb (this has being brought up on recent LPC). For programs that use bpf_prog_test_run_skb, support __sk_buff input and output. Initially, from input __sk_buff, copy _only_ cb and priority into skb, all other non-zero fields are prohibited (with EINVAL). If the user has set ctx_out/ctx_size_out, copy the potentially modified __sk_buff back to the userspace. We require all fields of input __sk_buff except the ones we explicitly support to be set to zero. The expectation is that in the future we might add support for more fields and we want to fail explicitly if the user runs the program on the kernel where we don't yet support them. The API is intentionally vague (i.e. we don't explicitly add __sk_buff to bpf_attr, but ctx_in) to potentially let other test_run types use this interface in the future (this can be xdp_md for xdp types for example). v4: * don't copy more than allowed in bpf_ctx_init [Martin] v3: * handle case where ctx_in is NULL, but ctx_out is not [Martin] * convert size==0 checks to ptr==NULL checks and add some extra ptr checks [Martin] v2: * Addressed comments from Martin Lau Signed-off-by: Stanislav Fomichev Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index af1cbd951f26..31a27dd337dc 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -412,6 +412,13 @@ union bpf_attr { __aligned_u64 data_out; __u32 repeat; __u32 duration; + __u32 ctx_size_in; /* input: len of ctx_in */ + __u32 ctx_size_out; /* input/output: len of ctx_out + * returns ENOSPC if ctx_out + * is too small. + */ + __aligned_u64 ctx_in; + __aligned_u64 ctx_out; } test; struct { /* anonymous struct used by BPF_*_GET_*_ID */ -- cgit v1.2.3 From bf8981a2aa082d9d64771b47c8a1c9c388d8cd40 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 9 Apr 2019 10:44:06 +0200 Subject: netfilter: nf_nat: merge ip/ip6 masquerade headers Both are now implemented by nf_nat_masquerade.c, so no need to keep different headers. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/ipv4/nf_nat_masquerade.h | 15 --------------- include/net/netfilter/ipv6/nf_nat_masquerade.h | 11 ----------- include/net/netfilter/nf_nat_masquerade.h | 21 +++++++++++++++++++++ 3 files changed, 21 insertions(+), 26 deletions(-) delete mode 100644 include/net/netfilter/ipv4/nf_nat_masquerade.h delete mode 100644 include/net/netfilter/ipv6/nf_nat_masquerade.h create mode 100644 include/net/netfilter/nf_nat_masquerade.h (limited to 'include') diff --git a/include/net/netfilter/ipv4/nf_nat_masquerade.h b/include/net/netfilter/ipv4/nf_nat_masquerade.h deleted file mode 100644 index 13d55206bb9f..000000000000 --- a/include/net/netfilter/ipv4/nf_nat_masquerade.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _NF_NAT_MASQUERADE_IPV4_H_ -#define _NF_NAT_MASQUERADE_IPV4_H_ - -#include - -unsigned int -nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, - const struct nf_nat_range2 *range, - const struct net_device *out); - -int nf_nat_masquerade_ipv4_register_notifier(void); -void nf_nat_masquerade_ipv4_unregister_notifier(void); - -#endif /*_NF_NAT_MASQUERADE_IPV4_H_ */ diff --git a/include/net/netfilter/ipv6/nf_nat_masquerade.h b/include/net/netfilter/ipv6/nf_nat_masquerade.h deleted file mode 100644 index 2917bf95c437..000000000000 --- a/include/net/netfilter/ipv6/nf_nat_masquerade.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _NF_NAT_MASQUERADE_IPV6_H_ -#define _NF_NAT_MASQUERADE_IPV6_H_ - -unsigned int -nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, - const struct net_device *out); -int nf_nat_masquerade_ipv6_register_notifier(void); -void nf_nat_masquerade_ipv6_unregister_notifier(void); - -#endif /* _NF_NAT_MASQUERADE_IPV6_H_ */ diff --git a/include/net/netfilter/nf_nat_masquerade.h b/include/net/netfilter/nf_nat_masquerade.h new file mode 100644 index 000000000000..cafe71822a53 --- /dev/null +++ b/include/net/netfilter/nf_nat_masquerade.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _NF_NAT_MASQUERADE_H_ +#define _NF_NAT_MASQUERADE_H_ + +#include + +unsigned int +nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, + const struct nf_nat_range2 *range, + const struct net_device *out); + +int nf_nat_masquerade_ipv4_register_notifier(void); +void nf_nat_masquerade_ipv4_unregister_notifier(void); + +unsigned int +nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, + const struct net_device *out); +int nf_nat_masquerade_ipv6_register_notifier(void); +void nf_nat_masquerade_ipv6_unregister_notifier(void); + +#endif /*_NF_NAT_MASQUERADE_H_ */ -- cgit v1.2.3 From 610a43149cabd0c7aa7bed19cbcf05a0249ab32a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 9 Apr 2019 10:44:08 +0200 Subject: netfilter: nf_nat_masquerade: unify ipv4/6 notifier registration Only reason for having two different register functions was because of ipt_MASQUERADE and ip6t_MASQUERADE being two different modules. Previous patch merged those into xt_MASQUERADE, so we can merge this too. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_nat_masquerade.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_nat_masquerade.h b/include/net/netfilter/nf_nat_masquerade.h index cafe71822a53..54a14d643c34 100644 --- a/include/net/netfilter/nf_nat_masquerade.h +++ b/include/net/netfilter/nf_nat_masquerade.h @@ -9,13 +9,11 @@ nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, const struct nf_nat_range2 *range, const struct net_device *out); -int nf_nat_masquerade_ipv4_register_notifier(void); -void nf_nat_masquerade_ipv4_unregister_notifier(void); +int nf_nat_masquerade_inet_register_notifiers(void); +void nf_nat_masquerade_inet_unregister_notifiers(void); unsigned int nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, const struct net_device *out); -int nf_nat_masquerade_ipv6_register_notifier(void); -void nf_nat_masquerade_ipv6_unregister_notifier(void); #endif /*_NF_NAT_MASQUERADE_H_ */ -- cgit v1.2.3 From c695865c5c9803f14eef2c99d8a49d9ad60a3383 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 11 Apr 2019 09:12:02 -0700 Subject: bpf: fix missing bpf_check_uarg_tail_zero in BPF_PROG_TEST_RUN MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit b0b9395d865e ("bpf: support input __sk_buff context in BPF_PROG_TEST_RUN") started using bpf_check_uarg_tail_zero in BPF_PROG_TEST_RUN. However, bpf_check_uarg_tail_zero is not defined for !CONFIG_BPF_SYSCALL: net/bpf/test_run.c: In function ‘bpf_ctx_init’: net/bpf/test_run.c:142:9: error: implicit declaration of function ‘bpf_check_uarg_tail_zero’ [-Werror=implicit-function-declaration] err = bpf_check_uarg_tail_zero(data_in, max_size, size); ^~~~~~~~~~~~~~~~~~~~~~~~ Let's not build net/bpf/test_run.c when CONFIG_BPF_SYSCALL is not set. Reported-by: kbuild test robot Fixes: b0b9395d865e ("bpf: support input __sk_buff context in BPF_PROG_TEST_RUN") Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 65f7094c40b4..e4d4c1771ab0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -483,14 +483,6 @@ typedef u32 (*bpf_convert_ctx_access_t)(enum bpf_access_type type, u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy); -int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, - union bpf_attr __user *uattr); -int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, - union bpf_attr __user *uattr); -int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, - const union bpf_attr *kattr, - union bpf_attr __user *uattr); - /* an array of programs to be executed under rcu_lock. * * Typical usage: @@ -681,6 +673,13 @@ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr) struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type); int array_map_alloc_check(union bpf_attr *attr); +int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, + union bpf_attr __user *uattr); +int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, + union bpf_attr __user *uattr); +int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr); #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { @@ -792,6 +791,27 @@ static inline struct bpf_prog *bpf_prog_get_type_path(const char *name, { return ERR_PTR(-EOPNOTSUPP); } + +static inline int bpf_prog_test_run_xdp(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + return -ENOTSUPP; +} + +static inline int bpf_prog_test_run_skb(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + return -ENOTSUPP; +} + +static inline int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + return -ENOTSUPP; +} #endif /* CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, -- cgit v1.2.3 From 58dfc900faff6db7eb9bf01555622e0b6c74c262 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Tue, 9 Apr 2019 15:06:41 +0100 Subject: bpf: add layer 2 encap support to bpf_skb_adjust_room commit 868d523535c2 ("bpf: add bpf_skb_adjust_room encap flags") introduced support to bpf_skb_adjust_room for GSO-friendly GRE and UDP encapsulation. For GSO to work for skbs, the inner headers (mac and network) need to be marked. For L3 encapsulation using bpf_skb_adjust_room, the mac and network headers are identical. Here we provide a way of specifying the inner mac header length for cases where L2 encap is desired. Such an approach can support encapsulated ethernet headers, MPLS headers etc. For example to convert from a packet of form [eth][ip][tcp] to [eth][ip][udp][inner mac][ip][tcp], something like the following could be done: headroom = sizeof(iph) + sizeof(struct udphdr) + inner_maclen; ret = bpf_skb_adjust_room(skb, headroom, BPF_ADJ_ROOM_MAC, BPF_F_ADJ_ROOM_ENCAP_L4_UDP | BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | BPF_F_ADJ_ROOM_ENCAP_L2(inner_maclen)); Signed-off-by: Alan Maguire Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 31a27dd337dc..2e96d0b4bf65 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1523,6 +1523,10 @@ union bpf_attr { * * **BPF_F_ADJ_ROOM_ENCAP_L4_UDP **: * Use with ENCAP_L3 flags to further specify the tunnel type. * + * * **BPF_F_ADJ_ROOM_ENCAP_L2(len) **: + * Use with ENCAP_L3/L4 flags to further specify the tunnel + * type; **len** is the length of the inner MAC header. + * * A call to this helper is susceptible to change the underlaying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be @@ -2664,10 +2668,16 @@ enum bpf_func_id { /* BPF_FUNC_skb_adjust_room flags. */ #define BPF_F_ADJ_ROOM_FIXED_GSO (1ULL << 0) +#define BPF_ADJ_ROOM_ENCAP_L2_MASK 0xff +#define BPF_ADJ_ROOM_ENCAP_L2_SHIFT 56 + #define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 (1ULL << 1) #define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 (1ULL << 2) #define BPF_F_ADJ_ROOM_ENCAP_L4_GRE (1ULL << 3) #define BPF_F_ADJ_ROOM_ENCAP_L4_UDP (1ULL << 4) +#define BPF_F_ADJ_ROOM_ENCAP_L2(len) (((__u64)len & \ + BPF_ADJ_ROOM_ENCAP_L2_MASK) \ + << BPF_ADJ_ROOM_ENCAP_L2_SHIFT) /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { -- cgit v1.2.3 From cc3a86c802f0ba9a2627aef256d95ae3b3fa6e91 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 9 Apr 2019 14:41:12 -0700 Subject: ipv6: Change rt6_probe to take a fib6_nh rt6_probe sends probes for gateways in a nexthop. As such it really depends on a fib6_nh, not a fib entry. Move last_probe to fib6_nh and update rt6_probe to a fib6_nh struct. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 58dbb4e82908..2e9235adfa0d 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -127,6 +127,10 @@ struct rt6_exception { struct fib6_nh { struct fib_nh_common nh_common; + +#ifdef CONFIG_IPV6_ROUTER_PREF + unsigned long last_probe; +#endif }; struct fib6_info { @@ -155,10 +159,6 @@ struct fib6_info { struct rt6_info * __percpu *rt6i_pcpu; struct rt6_exception_bucket __rcu *rt6i_exception_bucket; -#ifdef CONFIG_IPV6_ROUTER_PREF - unsigned long last_probe; -#endif - u32 fib6_metric; u8 fib6_protocol; u8 fib6_type; -- cgit v1.2.3 From 971502d77faa50a37c89bc6d172450294ad9a5fd Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 11 Apr 2019 16:36:41 +0200 Subject: bridge: netfilter: unroll NF_HOOK helper in bridge input path Replace NF_HOOK() based invocation of the netfilter hooks with a private copy of nf_hook_slow(). This copy has one difference: it can return the rx handler value expected by the stack, i.e. RX_HANDLER_CONSUMED or RX_HANDLER_PASS. This is needed by the next patch to invoke the ebtables "broute" table via the standard netfilter hooks rather than the custom "br_should_route_hook" indirection that is used now. When the skb is to be "brouted", we must return RX_HANDLER_PASS from the bridge rx input handler, but there is no way to indicate this via NF_HOOK(), unless perhaps by some hack such as exposing bridge_cb in the netfilter core or a percpu flag. text data bss dec filename 3369 56 0 3425 net/bridge/br_input.o.before 3458 40 0 3498 net/bridge/br_input.o.after This allows removal of the "br_should_route_hook" in the next patch. Signed-off-by: Florian Westphal Acked-by: David S. Miller Acked-by: Nikolay Aleksandrov Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_queue.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h index a50a69f5334c..7239105d9d2e 100644 --- a/include/net/netfilter/nf_queue.h +++ b/include/net/netfilter/nf_queue.h @@ -119,4 +119,7 @@ nfqueue_hash(const struct sk_buff *skb, u16 queue, u16 queues_total, u8 family, return queue; } +int nf_queue(struct sk_buff *skb, struct nf_hook_state *state, + const struct nf_hook_entries *entries, unsigned int index, + unsigned int verdict); #endif /* _NF_QUEUE_H */ -- cgit v1.2.3 From 223fd0adfa8af36d5d9b5d38016e579ee052f367 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 11 Apr 2019 16:36:42 +0200 Subject: bridge: broute: make broute a real ebtables table This makes broute a normal ebtables table, hooking at PREROUTING. The broute hook is removed. It uses skb->cb to signal to bridge rx handler that the skb should be routed instead of being bridged. This change is backwards compatible with ebtables as no userspace visible parts are changed. This means we can also remove the !ops test in ebt_register_table, it was only there for broute table sake. Signed-off-by: Florian Westphal Acked-by: David S. Miller Acked-by: Nikolay Aleksandrov Signed-off-by: Pablo Neira Ayuso --- include/linux/if_bridge.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 627b788ba0ff..ef0819ced0fc 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -56,9 +56,6 @@ struct br_ip_list { extern void brioctl_set(int (*ioctl_hook)(struct net *, unsigned int, void __user *)); -typedef int br_should_route_hook_t(struct sk_buff *skb); -extern br_should_route_hook_t __rcu *br_should_route_hook; - #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_BRIDGE_IGMP_SNOOPING) int br_multicast_list_adjacent(struct net_device *dev, struct list_head *br_ip_list); -- cgit v1.2.3 From 013b96ec64616b57fc631b304dfcecc5bc288f90 Mon Sep 17 00:00:00 2001 From: David Miller Date: Thu, 11 Apr 2019 15:02:07 -0700 Subject: sctp: Pass sk_buff_head explicitly to sctp_ulpq_tail_event(). Now the SKB list implementation assumption can be removed. And now that we know that the list head is always non-NULL we can remove the code blocks dealing with that as well. Signed-off-by: David S. Miller Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/ulpqueue.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sctp/ulpqueue.h b/include/net/sctp/ulpqueue.h index bb0ecba3db2b..f4ac7117ff29 100644 --- a/include/net/sctp/ulpqueue.h +++ b/include/net/sctp/ulpqueue.h @@ -59,7 +59,7 @@ void sctp_ulpq_free(struct sctp_ulpq *); int sctp_ulpq_tail_data(struct sctp_ulpq *, struct sctp_chunk *, gfp_t); /* Add a new event for propagation to the ULP. */ -int sctp_ulpq_tail_event(struct sctp_ulpq *, struct sctp_ulpevent *ev); +int sctp_ulpq_tail_event(struct sctp_ulpq *, struct sk_buff_head *skb_list); /* Renege previously received chunks. */ void sctp_ulpq_renege(struct sctp_ulpq *, struct sctp_chunk *, gfp_t); -- cgit v1.2.3 From 7b146cebe30cb481b0f70d85779da938da818637 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 27 Feb 2019 12:59:24 -0800 Subject: bpf: Sysctl hook Containerized applications may run as root and it may create problems for whole host. Specifically such applications may change a sysctl and affect applications in other containers. Furthermore in existing infrastructure it may not be possible to just completely disable writing to sysctl, instead such a process should be gradual with ability to log what sysctl are being changed by a container, investigate, limit the set of writable sysctl to currently used ones (so that new ones can not be changed) and eventually reduce this set to zero. The patch introduces new program type BPF_PROG_TYPE_CGROUP_SYSCTL and attach type BPF_CGROUP_SYSCTL to solve these problems on cgroup basis. New program type has access to following minimal context: struct bpf_sysctl { __u32 write; }; Where @write indicates whether sysctl is being read (= 0) or written (= 1). Helpers to access sysctl name and value will be introduced separately. BPF_CGROUP_SYSCTL attach point is added to sysctl code right before passing control to ctl_table->proc_handler so that BPF program can either allow or deny access to sysctl. Suggested-by: Roman Gushchin Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- include/linux/bpf-cgroup.h | 18 ++++++++++++++++++ include/linux/bpf_types.h | 1 + include/linux/filter.h | 8 ++++++++ include/uapi/linux/bpf.h | 9 +++++++++ 4 files changed, 36 insertions(+) (limited to 'include') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index a4c644c1c091..b1c45da20a26 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -17,6 +17,8 @@ struct bpf_map; struct bpf_prog; struct bpf_sock_ops_kern; struct bpf_cgroup_storage; +struct ctl_table; +struct ctl_table_header; #ifdef CONFIG_CGROUP_BPF @@ -109,6 +111,10 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, short access, enum bpf_attach_type type); +int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, + struct ctl_table *table, int write, + enum bpf_attach_type type); + static inline enum bpf_cgroup_storage_type cgroup_storage_type( struct bpf_map *map) { @@ -253,6 +259,17 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, \ __ret; \ }) + + +#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled) \ + __ret = __cgroup_bpf_run_filter_sysctl(head, table, write, \ + BPF_CGROUP_SYSCTL); \ + __ret; \ +}) + int cgroup_bpf_prog_attach(const union bpf_attr *attr, enum bpf_prog_type ptype, struct bpf_prog *prog); int cgroup_bpf_prog_detach(const union bpf_attr *attr, @@ -321,6 +338,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write) ({ 0; }) #define for_each_cgroup_storage_type(stype) for (; false; ) diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 08bf2f1fe553..d26991a16894 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -28,6 +28,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT, raw_tracepoint) #endif #ifdef CONFIG_CGROUP_BPF BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev) +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SYSCTL, cg_sysctl) #endif #ifdef CONFIG_BPF_LIRC_MODE2 BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2) diff --git a/include/linux/filter.h b/include/linux/filter.h index 6074aa064b54..a17732057880 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -33,6 +33,8 @@ struct bpf_prog_aux; struct xdp_rxq_info; struct xdp_buff; struct sock_reuseport; +struct ctl_table; +struct ctl_table_header; /* ArgX, context and stack frame pointer register positions. Note, * Arg1, Arg2, Arg3, etc are used as argument mappings of function @@ -1177,4 +1179,10 @@ struct bpf_sock_ops_kern { */ }; +struct bpf_sysctl_kern { + struct ctl_table_header *head; + struct ctl_table *table; + int write; +}; + #endif /* __LINUX_FILTER_H__ */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2e96d0b4bf65..cc2a2466d5f3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -167,6 +167,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_LIRC_MODE2, BPF_PROG_TYPE_SK_REUSEPORT, BPF_PROG_TYPE_FLOW_DISSECTOR, + BPF_PROG_TYPE_CGROUP_SYSCTL, }; enum bpf_attach_type { @@ -188,6 +189,7 @@ enum bpf_attach_type { BPF_CGROUP_UDP6_SENDMSG, BPF_LIRC_MODE2, BPF_FLOW_DISSECTOR, + BPF_CGROUP_SYSCTL, __MAX_BPF_ATTACH_TYPE }; @@ -3308,4 +3310,11 @@ struct bpf_line_info { struct bpf_spin_lock { __u32 val; }; + +struct bpf_sysctl { + __u32 write; /* Sysctl is being read (= 0) or written (= 1). + * Allows 1,2,4-byte read, but no write. + */ +}; + #endif /* _UAPI__LINUX_BPF_H__ */ -- cgit v1.2.3 From 808649fb787d918a48a360a668ee4ee9023f0c11 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 27 Feb 2019 13:28:48 -0800 Subject: bpf: Introduce bpf_sysctl_get_name helper Add bpf_sysctl_get_name() helper to copy sysctl name (/proc/sys/ entry) into provided by BPF_PROG_TYPE_CGROUP_SYSCTL program buffer. By default full name (w/o /proc/sys/) is copied, e.g. "net/ipv4/tcp_mem". If BPF_F_SYSCTL_BASE_NAME flag is set, only base name will be copied, e.g. "tcp_mem". Documentation for the new helper is provided in bpf.h UAPI. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index cc2a2466d5f3..9c8a2f3ccb9b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2506,6 +2506,22 @@ union bpf_attr { * Return * 0 if iph and th are a valid SYN cookie ACK, or a negative error * otherwise. + * + * int bpf_sysctl_get_name(struct bpf_sysctl *ctx, char *buf, size_t buf_len, u64 flags) + * Description + * Get name of sysctl in /proc/sys/ and copy it into provided by + * program buffer *buf* of size *buf_len*. + * + * The buffer is always NUL terminated, unless it's zero-sized. + * + * If *flags* is zero, full name (e.g. "net/ipv4/tcp_mem") is + * copied. Use **BPF_F_SYSCTL_BASE_NAME** flag to copy base name + * only (e.g. "tcp_mem"). + * Return + * Number of character copied (not including the trailing NUL). + * + * **-E2BIG** if the buffer wasn't big enough (*buf* will contain + * truncated name in this case). */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2608,7 +2624,8 @@ union bpf_attr { FN(skb_ecn_set_ce), \ FN(get_listener_sock), \ FN(skc_lookup_tcp), \ - FN(tcp_check_syncookie), + FN(tcp_check_syncookie), \ + FN(sysctl_get_name), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -2681,6 +2698,9 @@ enum bpf_func_id { BPF_ADJ_ROOM_ENCAP_L2_MASK) \ << BPF_ADJ_ROOM_ENCAP_L2_SHIFT) +/* BPF_FUNC_sysctl_get_name flags. */ +#define BPF_F_SYSCTL_BASE_NAME (1ULL << 0) + /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, -- cgit v1.2.3 From 1d11b3016cec4ed9770b98e82a61708c8f4926e7 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Thu, 28 Feb 2019 19:22:15 -0800 Subject: bpf: Introduce bpf_sysctl_get_current_value helper Add bpf_sysctl_get_current_value() helper to copy current sysctl value into provided by BPF_PROG_TYPE_CGROUP_SYSCTL program buffer. It provides same string as user space can see by reading corresponding file in /proc/sys/, including new line, etc. Documentation for the new helper is provided in bpf.h UAPI. Since current value is kept in ctl_table->data in a parsed form, ctl_table->proc_handler() with write=0 is called to read that data and convert it to a string. Such a string can later be parsed by a program using helpers that will be introduced separately. Unfortunately it's not trivial to provide API to access parsed data due to variety of data representations (string, intvec, uintvec, ulongvec, custom structures, even NULL, etc). Instead it's assumed that user know how to handle specific sysctl they're interested in and appropriate helpers can be used. Since ctl_table->proc_handler() expects __user buffer, conversion to __user happens for kernel allocated one where the value is stored. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 2 ++ include/uapi/linux/bpf.h | 22 +++++++++++++++++++++- 2 files changed, 23 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index a17732057880..f254ff92819f 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1182,6 +1182,8 @@ struct bpf_sock_ops_kern { struct bpf_sysctl_kern { struct ctl_table_header *head; struct ctl_table *table; + void *cur_val; + size_t cur_len; int write; }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 9c8a2f3ccb9b..063543afa359 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2522,6 +2522,25 @@ union bpf_attr { * * **-E2BIG** if the buffer wasn't big enough (*buf* will contain * truncated name in this case). + * + * int bpf_sysctl_get_current_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) + * Description + * Get current value of sysctl as it is presented in /proc/sys + * (incl. newline, etc), and copy it as a string into provided + * by program buffer *buf* of size *buf_len*. + * + * The whole value is copied, no matter what file position user + * space issued e.g. sys_read at. + * + * The buffer is always NUL terminated, unless it's zero-sized. + * Return + * Number of character copied (not including the trailing NUL). + * + * **-E2BIG** if the buffer wasn't big enough (*buf* will contain + * truncated name in this case). + * + * **-EINVAL** if current value was unavailable, e.g. because + * sysctl is uninitialized and read returns -EIO for it. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2625,7 +2644,8 @@ union bpf_attr { FN(get_listener_sock), \ FN(skc_lookup_tcp), \ FN(tcp_check_syncookie), \ - FN(sysctl_get_name), + FN(sysctl_get_name), \ + FN(sysctl_get_current_value), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From 4e63acdff864654cee0ac5aaeda3913798ee78f6 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Thu, 7 Mar 2019 18:38:43 -0800 Subject: bpf: Introduce bpf_sysctl_{get,set}_new_value helpers Add helpers to work with new value being written to sysctl by user space. bpf_sysctl_get_new_value() copies value being written to sysctl into provided buffer. bpf_sysctl_set_new_value() overrides new value being written by user space with a one from provided buffer. Buffer should contain string representation of the value, similar to what can be seen in /proc/sys/. Both helpers can be used only on sysctl write. File position matters and can be managed by an interface that will be introduced separately. E.g. if user space calls sys_write to a file in /proc/sys/ at file position = X, where X > 0, then the value set by bpf_sysctl_set_new_value() will be written starting from X. If program wants to override whole value with specified buffer, file position has to be set to zero. Documentation for the new helpers is provided in bpf.h UAPI. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- include/linux/bpf-cgroup.h | 8 +++++--- include/linux/filter.h | 3 +++ include/uapi/linux/bpf.h | 38 +++++++++++++++++++++++++++++++++++++- 3 files changed, 45 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index b1c45da20a26..1e97271f9a10 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -113,7 +113,8 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, - enum bpf_attach_type type); + void __user *buf, size_t *pcount, + void **new_buf, enum bpf_attach_type type); static inline enum bpf_cgroup_storage_type cgroup_storage_type( struct bpf_map *map) @@ -261,11 +262,12 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, }) -#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write) \ +#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, count, nbuf) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled) \ __ret = __cgroup_bpf_run_filter_sysctl(head, table, write, \ + buf, count, nbuf, \ BPF_CGROUP_SYSCTL); \ __ret; \ }) @@ -338,7 +340,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,nbuf) ({ 0; }) #define for_each_cgroup_storage_type(stype) for (; false; ) diff --git a/include/linux/filter.h b/include/linux/filter.h index f254ff92819f..a23653f9460c 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1184,6 +1184,9 @@ struct bpf_sysctl_kern { struct ctl_table *table; void *cur_val; size_t cur_len; + void *new_val; + size_t new_len; + int new_updated; int write; }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 063543afa359..547b8258d731 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2541,6 +2541,40 @@ union bpf_attr { * * **-EINVAL** if current value was unavailable, e.g. because * sysctl is uninitialized and read returns -EIO for it. + * + * int bpf_sysctl_get_new_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) + * Description + * Get new value being written by user space to sysctl (before + * the actual write happens) and copy it as a string into + * provided by program buffer *buf* of size *buf_len*. + * + * User space may write new value at file position > 0. + * + * The buffer is always NUL terminated, unless it's zero-sized. + * Return + * Number of character copied (not including the trailing NUL). + * + * **-E2BIG** if the buffer wasn't big enough (*buf* will contain + * truncated name in this case). + * + * **-EINVAL** if sysctl is being read. + * + * int bpf_sysctl_set_new_value(struct bpf_sysctl *ctx, const char *buf, size_t buf_len) + * Description + * Override new value being written by user space to sysctl with + * value provided by program in buffer *buf* of size *buf_len*. + * + * *buf* should contain a string in same form as provided by user + * space on sysctl write. + * + * User space may write new value at file position > 0. To override + * the whole sysctl value file position should be set to zero. + * Return + * 0 on success. + * + * **-E2BIG** if the *buf_len* is too big. + * + * **-EINVAL** if sysctl is being read. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2645,7 +2679,9 @@ union bpf_attr { FN(skc_lookup_tcp), \ FN(tcp_check_syncookie), \ FN(sysctl_get_name), \ - FN(sysctl_get_current_value), + FN(sysctl_get_current_value), \ + FN(sysctl_get_new_value), \ + FN(sysctl_set_new_value), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From e1550bfe0de47e30484ba91de1e50a91ec1c31f5 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Thu, 7 Mar 2019 18:50:52 -0800 Subject: bpf: Add file_pos field to bpf_sysctl ctx Add file_pos field to bpf_sysctl context to read and write sysctl file position at which sysctl is being accessed (read or written). The field can be used to e.g. override whole sysctl value on write to sysctl even when sys_write is called by user space with file_pos > 0. Or BPF program may reject such accesses. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- include/linux/bpf-cgroup.h | 9 +++++---- include/linux/filter.h | 3 +++ include/uapi/linux/bpf.h | 3 +++ 3 files changed, 11 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 1e97271f9a10..cb3c6b3b89c8 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -114,7 +114,8 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, void __user *buf, size_t *pcount, - void **new_buf, enum bpf_attach_type type); + loff_t *ppos, void **new_buf, + enum bpf_attach_type type); static inline enum bpf_cgroup_storage_type cgroup_storage_type( struct bpf_map *map) @@ -262,12 +263,12 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, }) -#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, count, nbuf) \ +#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, count, pos, nbuf) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled) \ __ret = __cgroup_bpf_run_filter_sysctl(head, table, write, \ - buf, count, nbuf, \ + buf, count, pos, nbuf, \ BPF_CGROUP_SYSCTL); \ __ret; \ }) @@ -340,7 +341,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,nbuf) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos,nbuf) ({ 0; }) #define for_each_cgroup_storage_type(stype) for (; false; ) diff --git a/include/linux/filter.h b/include/linux/filter.h index a23653f9460c..fb0edad75971 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1188,6 +1188,9 @@ struct bpf_sysctl_kern { size_t new_len; int new_updated; int write; + loff_t *ppos; + /* Temporary "register" for indirect stores to ppos. */ + u64 tmp_reg; }; #endif /* __LINUX_FILTER_H__ */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 547b8258d731..89976de909af 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3391,6 +3391,9 @@ struct bpf_sysctl { __u32 write; /* Sysctl is being read (= 0) or written (= 1). * Allows 1,2,4-byte read, but no write. */ + __u32 file_pos; /* Sysctl file position to read from, write to. + * Allows 1,2,4-byte read an 4-byte write. + */ }; #endif /* _UAPI__LINUX_BPF_H__ */ -- cgit v1.2.3 From 57c3bb725a3dd97d960d7e1cd0845d88de53217f Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Mon, 18 Mar 2019 16:57:10 -0700 Subject: bpf: Introduce ARG_PTR_TO_{INT,LONG} arg types Currently the way to pass result from BPF helper to BPF program is to provide memory area defined by pointer and size: func(void *, size_t). It works great for generic use-case, but for simple types, such as int, it's overkill and consumes two arguments when it could use just one. Introduce new argument types ARG_PTR_TO_INT and ARG_PTR_TO_LONG to be able to pass result from helper to program via pointer to int and long correspondingly: func(int *) or func(long *). New argument types are similar to ARG_PTR_TO_MEM with the following differences: * they don't require corresponding ARG_CONST_SIZE argument, predefined access sizes are used instead (32bit for int, 64bit for long); * it's possible to use more than one such an argument in a helper; * provided pointers have to be aligned. It's easy to introduce similar ARG_PTR_TO_CHAR and ARG_PTR_TO_SHORT argument types. It's not done due to lack of use-case though. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e4d4c1771ab0..fd06ada941ad 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -202,6 +202,8 @@ enum bpf_arg_type { ARG_ANYTHING, /* any (initialized) argument is ok */ ARG_PTR_TO_SPIN_LOCK, /* pointer to bpf_spin_lock */ ARG_PTR_TO_SOCK_COMMON, /* pointer to sock_common */ + ARG_PTR_TO_INT, /* pointer to int */ + ARG_PTR_TO_LONG, /* pointer to long */ }; /* type of values returned from helper functions */ -- cgit v1.2.3 From d7a4cb9b6705a89937d12c8158a35a3145dc967a Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Mon, 18 Mar 2019 17:55:26 -0700 Subject: bpf: Introduce bpf_strtol and bpf_strtoul helpers Add bpf_strtol and bpf_strtoul to convert a string to long and unsigned long correspondingly. It's similar to user space strtol(3) and strtoul(3) with a few changes to the API: * instead of NUL-terminated C string the helpers expect buffer and buffer length; * resulting long or unsigned long is returned in a separate result-argument; * return value is used to indicate success or failure, on success number of consumed bytes is returned that can be used to identify position to read next if the buffer is expected to contain multiple integers; * instead of *base* argument, *flags* is used that provides base in 5 LSB, other bits are reserved for future use; * number of supported bases is limited. Documentation for the new helpers is provided in bpf.h UAPI. The helpers are made available to BPF_PROG_TYPE_CGROUP_SYSCTL programs to be able to convert string input to e.g. "ulongvec" output. E.g. "net/ipv4/tcp_mem" consists of three ulong integers. They can be parsed by calling to bpf_strtoul three times. Implementation notes: Implementation includes "../../lib/kstrtox.h" to reuse integer parsing functions. It's done exactly same way as fs/proc/base.c already does. Unfortunately existing kstrtoX function can't be used directly since they fail if any invalid character is present right after integer in the string. Existing simple_strtoX functions can't be used either since they're obsolete and don't handle overflow properly. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ include/uapi/linux/bpf.h | 51 +++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 52 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index fd06ada941ad..f15432d90728 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -989,6 +989,8 @@ extern const struct bpf_func_proto bpf_sk_redirect_map_proto; extern const struct bpf_func_proto bpf_spin_lock_proto; extern const struct bpf_func_proto bpf_spin_unlock_proto; extern const struct bpf_func_proto bpf_get_local_storage_proto; +extern const struct bpf_func_proto bpf_strtol_proto; +extern const struct bpf_func_proto bpf_strtoul_proto; /* Shared helpers among cBPF and eBPF. */ void bpf_user_rnd_init_once(void); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 89976de909af..c26be24fd5e2 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2575,6 +2575,53 @@ union bpf_attr { * **-E2BIG** if the *buf_len* is too big. * * **-EINVAL** if sysctl is being read. + * + * int bpf_strtol(const char *buf, size_t buf_len, u64 flags, long *res) + * Description + * Convert the initial part of the string from buffer *buf* of + * size *buf_len* to a long integer according to the given base + * and save the result in *res*. + * + * The string may begin with an arbitrary amount of white space + * (as determined by isspace(3)) followed by a single optional '-' + * sign. + * + * Five least significant bits of *flags* encode base, other bits + * are currently unused. + * + * Base must be either 8, 10, 16 or 0 to detect it automatically + * similar to user space strtol(3). + * Return + * Number of characters consumed on success. Must be positive but + * no more than buf_len. + * + * **-EINVAL** if no valid digits were found or unsupported base + * was provided. + * + * **-ERANGE** if resulting value was out of range. + * + * int bpf_strtoul(const char *buf, size_t buf_len, u64 flags, unsigned long *res) + * Description + * Convert the initial part of the string from buffer *buf* of + * size *buf_len* to an unsigned long integer according to the + * given base and save the result in *res*. + * + * The string may begin with an arbitrary amount of white space + * (as determined by isspace(3)). + * + * Five least significant bits of *flags* encode base, other bits + * are currently unused. + * + * Base must be either 8, 10, 16 or 0 to detect it automatically + * similar to user space strtoul(3). + * Return + * Number of characters consumed on success. Must be positive but + * no more than buf_len. + * + * **-EINVAL** if no valid digits were found or unsupported base + * was provided. + * + * **-ERANGE** if resulting value was out of range. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2681,7 +2728,9 @@ union bpf_attr { FN(sysctl_get_name), \ FN(sysctl_get_current_value), \ FN(sysctl_get_new_value), \ - FN(sysctl_set_new_value), + FN(sysctl_set_new_value), \ + FN(strtol), \ + FN(strtoul), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From e4edbe3c1f44c84f319149aeb998e7e36b3b897f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 12 Apr 2019 11:52:07 +1000 Subject: rhashtable: fix some __rcu annotation errors With these annotations, the rhashtable now gets no warnings when compiled with "C=1" for sparse checking. Signed-off-by: NeilBrown Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 460c0eaf6b96..2711cbf01b64 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -40,7 +40,7 @@ * the chain. To avoid dereferencing this pointer without clearing * the bit first, we use an opaque 'struct rhash_lock_head *' for the * pointer stored in the bucket. This struct needs to be defined so - * that rcu_derefernce() works on it, but it has no content so a + * that rcu_dereference() works on it, but it has no content so a * cast is needed for it to be useful. This ensures it isn't * used by mistake with clearing the lock bit first. */ @@ -130,10 +130,10 @@ static inline void rht_unlock(struct bucket_table *tbl, } static inline void rht_assign_unlock(struct bucket_table *tbl, - struct rhash_lock_head **bkt, + struct rhash_lock_head __rcu **bkt, struct rhash_head *obj) { - struct rhash_head **p = (struct rhash_head **)bkt; + struct rhash_head __rcu **p = (struct rhash_head __rcu **)bkt; lock_map_release(&tbl->dep_map); rcu_assign_pointer(*p, obj); @@ -556,6 +556,7 @@ static inline struct rhash_head *__rhashtable_lookup( }; struct rhash_lock_head __rcu * const *bkt; struct bucket_table *tbl; + struct rhash_head __rcu *head; struct rhash_head *he; unsigned int hash; @@ -564,8 +565,8 @@ restart: hash = rht_key_hashfn(ht, tbl, key, params); bkt = rht_bucket(tbl, hash); do { - he = rht_ptr(rht_dereference_bucket_rcu(*bkt, tbl, hash)); - rht_for_each_rcu_from(he, he, tbl, hash) { + head = rht_ptr(rht_dereference_bucket_rcu(*bkt, tbl, hash)); + rht_for_each_rcu_from(he, head, tbl, hash) { if (params.obj_cmpfn ? params.obj_cmpfn(&arg, rht_obj(ht, he)) : rhashtable_compare(&arg, rht_obj(ht, he))) -- cgit v1.2.3 From c5783311a1248c437614d438b69c5f31fe483ecb Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 12 Apr 2019 11:52:08 +1000 Subject: rhashtable: reorder some inline functions and macros. This patch only moves some code around, it doesn't change the code at all. A subsequent patch will benefit from this as it needs to add calls to functions which are now defined before the call-site, but weren't before. Signed-off-by: NeilBrown Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 142 ++++++++++++++++++++++----------------------- 1 file changed, 71 insertions(+), 71 deletions(-) (limited to 'include') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 2711cbf01b64..c504cd820736 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -87,77 +87,6 @@ struct bucket_table { struct rhash_lock_head __rcu *buckets[] ____cacheline_aligned_in_smp; }; -/* - * We lock a bucket by setting BIT(1) in the pointer - this is always - * zero in real pointers and in the nulls marker. - * bit_spin_locks do not handle contention well, but the whole point - * of the hashtable design is to achieve minimum per-bucket contention. - * A nested hash table might not have a bucket pointer. In that case - * we cannot get a lock. For remove and replace the bucket cannot be - * interesting and doesn't need locking. - * For insert we allocate the bucket if this is the last bucket_table, - * and then take the lock. - * Sometimes we unlock a bucket by writing a new pointer there. In that - * case we don't need to unlock, but we do need to reset state such as - * local_bh. For that we have rht_assign_unlock(). As rcu_assign_pointer() - * provides the same release semantics that bit_spin_unlock() provides, - * this is safe. - */ - -static inline void rht_lock(struct bucket_table *tbl, - struct rhash_lock_head **bkt) -{ - local_bh_disable(); - bit_spin_lock(1, (unsigned long *)bkt); - lock_map_acquire(&tbl->dep_map); -} - -static inline void rht_lock_nested(struct bucket_table *tbl, - struct rhash_lock_head **bucket, - unsigned int subclass) -{ - local_bh_disable(); - bit_spin_lock(1, (unsigned long *)bucket); - lock_acquire_exclusive(&tbl->dep_map, subclass, 0, NULL, _THIS_IP_); -} - -static inline void rht_unlock(struct bucket_table *tbl, - struct rhash_lock_head **bkt) -{ - lock_map_release(&tbl->dep_map); - bit_spin_unlock(1, (unsigned long *)bkt); - local_bh_enable(); -} - -static inline void rht_assign_unlock(struct bucket_table *tbl, - struct rhash_lock_head __rcu **bkt, - struct rhash_head *obj) -{ - struct rhash_head __rcu **p = (struct rhash_head __rcu **)bkt; - - lock_map_release(&tbl->dep_map); - rcu_assign_pointer(*p, obj); - preempt_enable(); - __release(bitlock); - local_bh_enable(); -} - -/* - * If 'p' is a bucket head and might be locked: - * rht_ptr() returns the address without the lock bit. - * rht_ptr_locked() returns the address WITH the lock bit. - */ -static inline struct rhash_head __rcu *rht_ptr(const struct rhash_lock_head *p) -{ - return (void *)(((unsigned long)p) & ~BIT(1)); -} - -static inline struct rhash_lock_head __rcu *rht_ptr_locked(const - struct rhash_head *p) -{ - return (void *)(((unsigned long)p) | BIT(1)); -} - /* * NULLS_MARKER() expects a hash value with the low * bits mostly likely to be significant, and it discards @@ -372,6 +301,77 @@ static inline struct rhash_lock_head __rcu **rht_bucket_insert( &tbl->buckets[hash]; } +/* + * We lock a bucket by setting BIT(1) in the pointer - this is always + * zero in real pointers and in the nulls marker. + * bit_spin_locks do not handle contention well, but the whole point + * of the hashtable design is to achieve minimum per-bucket contention. + * A nested hash table might not have a bucket pointer. In that case + * we cannot get a lock. For remove and replace the bucket cannot be + * interesting and doesn't need locking. + * For insert we allocate the bucket if this is the last bucket_table, + * and then take the lock. + * Sometimes we unlock a bucket by writing a new pointer there. In that + * case we don't need to unlock, but we do need to reset state such as + * local_bh. For that we have rht_assign_unlock(). As rcu_assign_pointer() + * provides the same release semantics that bit_spin_unlock() provides, + * this is safe. + */ + +static inline void rht_lock(struct bucket_table *tbl, + struct rhash_lock_head **bkt) +{ + local_bh_disable(); + bit_spin_lock(1, (unsigned long *)bkt); + lock_map_acquire(&tbl->dep_map); +} + +static inline void rht_lock_nested(struct bucket_table *tbl, + struct rhash_lock_head **bucket, + unsigned int subclass) +{ + local_bh_disable(); + bit_spin_lock(1, (unsigned long *)bucket); + lock_acquire_exclusive(&tbl->dep_map, subclass, 0, NULL, _THIS_IP_); +} + +static inline void rht_unlock(struct bucket_table *tbl, + struct rhash_lock_head **bkt) +{ + lock_map_release(&tbl->dep_map); + bit_spin_unlock(1, (unsigned long *)bkt); + local_bh_enable(); +} + +/* + * If 'p' is a bucket head and might be locked: + * rht_ptr() returns the address without the lock bit. + * rht_ptr_locked() returns the address WITH the lock bit. + */ +static inline struct rhash_head __rcu *rht_ptr(const struct rhash_lock_head *p) +{ + return (void *)(((unsigned long)p) & ~BIT(1)); +} + +static inline struct rhash_lock_head __rcu *rht_ptr_locked(const + struct rhash_head *p) +{ + return (void *)(((unsigned long)p) | BIT(1)); +} + +static inline void rht_assign_unlock(struct bucket_table *tbl, + struct rhash_lock_head __rcu **bkt, + struct rhash_head *obj) +{ + struct rhash_head __rcu **p = (struct rhash_head __rcu **)bkt; + + lock_map_release(&tbl->dep_map); + rcu_assign_pointer(*p, obj); + preempt_enable(); + __release(bitlock); + local_bh_enable(); +} + /** * rht_for_each_from - iterate over hash chain from given head * @pos: the &struct rhash_head to use as a loop cursor. -- cgit v1.2.3 From adc6a3ab192eb40fb9d8b093c87d9aa785af4513 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 12 Apr 2019 11:52:08 +1000 Subject: rhashtable: move dereference inside rht_ptr() Rather than dereferencing a pointer to a bucket and then passing the result to rht_ptr(), we now pass in the pointer and do the dereference in rht_ptr(). This requires that we pass in the tbl and hash as well to support RCU checks, and means that the various rht_for_each functions can expect a pointer that can be dereferenced without further care. There are two places where we dereference a bucket pointer where there is no testable protection - in each case we know that we much have exclusive access without having taken a lock. The previous code used rht_dereference() to pretend that holding the mutex provided protects, but holding the mutex never provides protection for accessing buckets. So instead introduce rht_ptr_exclusive() that can be used when there is known to be exclusive access without holding any locks. Signed-off-by: NeilBrown Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 69 ++++++++++++++++++++++++++++------------------ 1 file changed, 42 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index c504cd820736..b54e6436547e 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -344,12 +344,28 @@ static inline void rht_unlock(struct bucket_table *tbl, } /* - * If 'p' is a bucket head and might be locked: - * rht_ptr() returns the address without the lock bit. - * rht_ptr_locked() returns the address WITH the lock bit. + * Where 'bkt' is a bucket and might be locked: + * rht_ptr() dereferences that pointer and clears the lock bit. + * rht_ptr_exclusive() dereferences in a context where exclusive + * access is guaranteed, such as when destroying the table. */ -static inline struct rhash_head __rcu *rht_ptr(const struct rhash_lock_head *p) +static inline struct rhash_head *rht_ptr( + struct rhash_lock_head __rcu * const *bkt, + struct bucket_table *tbl, + unsigned int hash) { + const struct rhash_lock_head *p = + rht_dereference_bucket_rcu(*bkt, tbl, hash); + + return (void *)(((unsigned long)p) & ~BIT(1)); +} + +static inline struct rhash_head *rht_ptr_exclusive( + struct rhash_lock_head __rcu * const *bkt) +{ + const struct rhash_lock_head *p = + rcu_dereference_protected(*bkt, 1); + return (void *)(((unsigned long)p) & ~BIT(1)); } @@ -380,8 +396,8 @@ static inline void rht_assign_unlock(struct bucket_table *tbl, * @hash: the hash value / bucket index */ #define rht_for_each_from(pos, head, tbl, hash) \ - for (pos = rht_dereference_bucket(head, tbl, hash); \ - !rht_is_a_nulls(pos); \ + for (pos = head; \ + !rht_is_a_nulls(pos); \ pos = rht_dereference_bucket((pos)->next, tbl, hash)) /** @@ -391,7 +407,8 @@ static inline void rht_assign_unlock(struct bucket_table *tbl, * @hash: the hash value / bucket index */ #define rht_for_each(pos, tbl, hash) \ - rht_for_each_from(pos, rht_ptr(*rht_bucket(tbl, hash)), tbl, hash) + rht_for_each_from(pos, rht_ptr(rht_bucket(tbl, hash), tbl, hash), \ + tbl, hash) /** * rht_for_each_entry_from - iterate over hash chain from given head @@ -403,7 +420,7 @@ static inline void rht_assign_unlock(struct bucket_table *tbl, * @member: name of the &struct rhash_head within the hashable struct. */ #define rht_for_each_entry_from(tpos, pos, head, tbl, hash, member) \ - for (pos = rht_dereference_bucket(head, tbl, hash); \ + for (pos = head; \ (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ pos = rht_dereference_bucket((pos)->next, tbl, hash)) @@ -416,8 +433,9 @@ static inline void rht_assign_unlock(struct bucket_table *tbl, * @member: name of the &struct rhash_head within the hashable struct. */ #define rht_for_each_entry(tpos, pos, tbl, hash, member) \ - rht_for_each_entry_from(tpos, pos, rht_ptr(*rht_bucket(tbl, hash)), \ - tbl, hash, member) + rht_for_each_entry_from(tpos, pos, \ + rht_ptr(rht_bucket(tbl, hash), tbl, hash), \ + tbl, hash, member) /** * rht_for_each_entry_safe - safely iterate over hash chain of given type @@ -432,8 +450,7 @@ static inline void rht_assign_unlock(struct bucket_table *tbl, * remove the loop cursor from the list. */ #define rht_for_each_entry_safe(tpos, pos, next, tbl, hash, member) \ - for (pos = rht_dereference_bucket(rht_ptr(*rht_bucket(tbl, hash)), \ - tbl, hash), \ + for (pos = rht_ptr(rht_bucket(tbl, hash), tbl, hash), \ next = !rht_is_a_nulls(pos) ? \ rht_dereference_bucket(pos->next, tbl, hash) : NULL; \ (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ @@ -454,7 +471,7 @@ static inline void rht_assign_unlock(struct bucket_table *tbl, */ #define rht_for_each_rcu_from(pos, head, tbl, hash) \ for (({barrier(); }), \ - pos = rht_dereference_bucket_rcu(head, tbl, hash); \ + pos = head; \ !rht_is_a_nulls(pos); \ pos = rcu_dereference_raw(pos->next)) @@ -469,10 +486,9 @@ static inline void rht_assign_unlock(struct bucket_table *tbl, * traversal is guarded by rcu_read_lock(). */ #define rht_for_each_rcu(pos, tbl, hash) \ - for (({barrier(); }), \ - pos = rht_ptr(rht_dereference_bucket_rcu( \ - *rht_bucket(tbl, hash), tbl, hash)); \ - !rht_is_a_nulls(pos); \ + for (({barrier(); }), \ + pos = rht_ptr(rht_bucket(tbl, hash), tbl, hash); \ + !rht_is_a_nulls(pos); \ pos = rcu_dereference_raw(pos->next)) /** @@ -490,7 +506,7 @@ static inline void rht_assign_unlock(struct bucket_table *tbl, */ #define rht_for_each_entry_rcu_from(tpos, pos, head, tbl, hash, member) \ for (({barrier(); }), \ - pos = rht_dereference_bucket_rcu(head, tbl, hash); \ + pos = head; \ (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ pos = rht_dereference_bucket_rcu(pos->next, tbl, hash)) @@ -508,8 +524,9 @@ static inline void rht_assign_unlock(struct bucket_table *tbl, */ #define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member) \ rht_for_each_entry_rcu_from(tpos, pos, \ - rht_ptr(*rht_bucket(tbl, hash)), \ - tbl, hash, member) + rht_ptr(rht_bucket(tbl, hash), \ + tbl, hash), \ + tbl, hash, member) /** * rhl_for_each_rcu - iterate over rcu hash table list @@ -556,7 +573,6 @@ static inline struct rhash_head *__rhashtable_lookup( }; struct rhash_lock_head __rcu * const *bkt; struct bucket_table *tbl; - struct rhash_head __rcu *head; struct rhash_head *he; unsigned int hash; @@ -565,8 +581,7 @@ restart: hash = rht_key_hashfn(ht, tbl, key, params); bkt = rht_bucket(tbl, hash); do { - head = rht_ptr(rht_dereference_bucket_rcu(*bkt, tbl, hash)); - rht_for_each_rcu_from(he, head, tbl, hash) { + rht_for_each_rcu_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) { if (params.obj_cmpfn ? params.obj_cmpfn(&arg, rht_obj(ht, he)) : rhashtable_compare(&arg, rht_obj(ht, he))) @@ -699,7 +714,7 @@ slow_path: return rhashtable_insert_slow(ht, key, obj); } - rht_for_each_from(head, rht_ptr(*bkt), tbl, hash) { + rht_for_each_from(head, rht_ptr(bkt, tbl, hash), tbl, hash) { struct rhlist_head *plist; struct rhlist_head *list; @@ -744,7 +759,7 @@ slow_path: goto slow_path; /* Inserting at head of list makes unlocking free. */ - head = rht_ptr(rht_dereference_bucket(*bkt, tbl, hash)); + head = rht_ptr(bkt, tbl, hash); RCU_INIT_POINTER(obj->next, head); if (rhlist) { @@ -971,7 +986,7 @@ static inline int __rhashtable_remove_fast_one( pprev = NULL; rht_lock(tbl, bkt); - rht_for_each_from(he, rht_ptr(*bkt), tbl, hash) { + rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) { struct rhlist_head *list; list = container_of(he, struct rhlist_head, rhead); @@ -1130,7 +1145,7 @@ static inline int __rhashtable_replace_fast( pprev = NULL; rht_lock(tbl, bkt); - rht_for_each_from(he, rht_ptr(*bkt), tbl, hash) { + rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) { if (he != obj_old) { pprev = &he->next; continue; -- cgit v1.2.3 From f4712b46a529ca2da078c82d5d99d367c7ebf82b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 12 Apr 2019 11:52:08 +1000 Subject: rhashtable: replace rht_ptr_locked() with rht_assign_locked() The only times rht_ptr_locked() is used, it is to store a new value in a bucket-head. This is the only time it makes sense to use it too. So replace it by a function which does the whole task: Sets the lock bit and assigns to a bucket head. Signed-off-by: NeilBrown Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index b54e6436547e..882bc0fcea4b 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -316,6 +316,7 @@ static inline struct rhash_lock_head __rcu **rht_bucket_insert( * local_bh. For that we have rht_assign_unlock(). As rcu_assign_pointer() * provides the same release semantics that bit_spin_unlock() provides, * this is safe. + * When we write to a bucket without unlocking, we use rht_assign_locked(). */ static inline void rht_lock(struct bucket_table *tbl, @@ -369,10 +370,12 @@ static inline struct rhash_head *rht_ptr_exclusive( return (void *)(((unsigned long)p) & ~BIT(1)); } -static inline struct rhash_lock_head __rcu *rht_ptr_locked(const - struct rhash_head *p) +static inline void rht_assign_locked(struct rhash_lock_head __rcu **bkt, + struct rhash_head *obj) { - return (void *)(((unsigned long)p) | BIT(1)); + struct rhash_head __rcu **p = (struct rhash_head __rcu **)bkt; + + rcu_assign_pointer(*p, (void *)((unsigned long)obj | BIT(1))); } static inline void rht_assign_unlock(struct bucket_table *tbl, -- cgit v1.2.3 From ca0b709d1a07b1fe1fb356d8d58f220287f85672 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 12 Apr 2019 11:52:08 +1000 Subject: rhashtable: use BIT(0) for locking. As reported by Guenter Roeck, the new bit-locking using BIT(1) doesn't work on the m68k architecture. m68k only requires 2-byte alignment for words and longwords, so there is only one unused bit in pointers to structs - We current use two, one for the NULLS marker at the end of the linked list, and one for the bit-lock in the head of the list. The two uses don't need to conflict as we never need the head of the list to be a NULLS marker - the marker is only needed to check if an object has moved to a different table, and the bucket head cannot move. The NULLS marker is only needed in a ->next pointer. As we already have different types for the bucket head pointer (struct rhash_lock_head) and the ->next pointers (struct rhash_head), it is fairly easy to treat the lsb differently in each. So: Initialize buckets heads to NULL, and use the lsb for locking. When loading the pointer from the bucket head, if it is NULL (ignoring the lock big), report as being the expected NULLS marker. When storing a value into a bucket head, if it is a NULLS marker, store NULL instead. And convert all places that used bit 1 for locking, to use bit 0. Fixes: 8f0db018006a ("rhashtable: use bit_spin_locks to protect hash bucket.") Reported-by: Guenter Roeck Tested-by: Guenter Roeck Signed-off-by: NeilBrown Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 882bc0fcea4b..f7714d3b46bd 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -35,7 +35,7 @@ * the least significant bit set but otherwise stores the address of * the hash bucket. This allows us to be be sure we've found the end * of the right list. - * The value stored in the hash bucket has BIT(2) used as a lock bit. + * The value stored in the hash bucket has BIT(0) used as a lock bit. * This bit must be atomically set before any changes are made to * the chain. To avoid dereferencing this pointer without clearing * the bit first, we use an opaque 'struct rhash_lock_head *' for the @@ -91,15 +91,19 @@ struct bucket_table { * NULLS_MARKER() expects a hash value with the low * bits mostly likely to be significant, and it discards * the msb. - * We git it an address, in which the bottom 2 bits are + * We give it an address, in which the bottom bit is * always 0, and the msb might be significant. * So we shift the address down one bit to align with * expectations and avoid losing a significant bit. + * + * We never store the NULLS_MARKER in the hash table + * itself as we need the lsb for locking. + * Instead we store a NULL */ #define RHT_NULLS_MARKER(ptr) \ ((void *)NULLS_MARKER(((unsigned long) (ptr)) >> 1)) #define INIT_RHT_NULLS_HEAD(ptr) \ - ((ptr) = RHT_NULLS_MARKER(&(ptr))) + ((ptr) = NULL) static inline bool rht_is_a_nulls(const struct rhash_head *ptr) { @@ -302,8 +306,9 @@ static inline struct rhash_lock_head __rcu **rht_bucket_insert( } /* - * We lock a bucket by setting BIT(1) in the pointer - this is always - * zero in real pointers and in the nulls marker. + * We lock a bucket by setting BIT(0) in the pointer - this is always + * zero in real pointers. The NULLS mark is never stored in the bucket, + * rather we store NULL if the bucket is empty. * bit_spin_locks do not handle contention well, but the whole point * of the hashtable design is to achieve minimum per-bucket contention. * A nested hash table might not have a bucket pointer. In that case @@ -323,7 +328,7 @@ static inline void rht_lock(struct bucket_table *tbl, struct rhash_lock_head **bkt) { local_bh_disable(); - bit_spin_lock(1, (unsigned long *)bkt); + bit_spin_lock(0, (unsigned long *)bkt); lock_map_acquire(&tbl->dep_map); } @@ -332,7 +337,7 @@ static inline void rht_lock_nested(struct bucket_table *tbl, unsigned int subclass) { local_bh_disable(); - bit_spin_lock(1, (unsigned long *)bucket); + bit_spin_lock(0, (unsigned long *)bucket); lock_acquire_exclusive(&tbl->dep_map, subclass, 0, NULL, _THIS_IP_); } @@ -340,7 +345,7 @@ static inline void rht_unlock(struct bucket_table *tbl, struct rhash_lock_head **bkt) { lock_map_release(&tbl->dep_map); - bit_spin_unlock(1, (unsigned long *)bkt); + bit_spin_unlock(0, (unsigned long *)bkt); local_bh_enable(); } @@ -358,7 +363,9 @@ static inline struct rhash_head *rht_ptr( const struct rhash_lock_head *p = rht_dereference_bucket_rcu(*bkt, tbl, hash); - return (void *)(((unsigned long)p) & ~BIT(1)); + if ((((unsigned long)p) & ~BIT(0)) == 0) + return RHT_NULLS_MARKER(bkt); + return (void *)(((unsigned long)p) & ~BIT(0)); } static inline struct rhash_head *rht_ptr_exclusive( @@ -367,7 +374,9 @@ static inline struct rhash_head *rht_ptr_exclusive( const struct rhash_lock_head *p = rcu_dereference_protected(*bkt, 1); - return (void *)(((unsigned long)p) & ~BIT(1)); + if (!p) + return RHT_NULLS_MARKER(bkt); + return (void *)(((unsigned long)p) & ~BIT(0)); } static inline void rht_assign_locked(struct rhash_lock_head __rcu **bkt, @@ -375,7 +384,9 @@ static inline void rht_assign_locked(struct rhash_lock_head __rcu **bkt, { struct rhash_head __rcu **p = (struct rhash_head __rcu **)bkt; - rcu_assign_pointer(*p, (void *)((unsigned long)obj | BIT(1))); + if (rht_is_a_nulls(obj)) + obj = NULL; + rcu_assign_pointer(*p, (void *)((unsigned long)obj | BIT(0))); } static inline void rht_assign_unlock(struct bucket_table *tbl, @@ -384,6 +395,8 @@ static inline void rht_assign_unlock(struct bucket_table *tbl, { struct rhash_head __rcu **p = (struct rhash_head __rcu **)bkt; + if (rht_is_a_nulls(obj)) + obj = NULL; lock_map_release(&tbl->dep_map); rcu_assign_pointer(*p, obj); preempt_enable(); -- cgit v1.2.3 From 9dde27de3e5efa0d032f3c891a0ca833a0d31911 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 15 Apr 2019 17:15:07 +0800 Subject: sctp: implement memory accounting on rx path sk_forward_alloc's updating is also done on rx path, but to be consistent we change to use sk_mem_charge() in sctp_skb_set_owner_r(). In sctp_eat_data(), it's not enough to check sctp_memory_pressure only, which doesn't work for mem_cgroup_sockets_enabled, so we change to use sk_under_memory_pressure(). When it's under memory pressure, sk_mem_reclaim() and sk_rmem_schedule() should be called on both RENEGE or CHUNK DELIVERY path exit the memory pressure status as soon as possible. Note that sk_rmem_schedule() is using datalen to make things easy there. Reported-by: Matteo Croce Tested-by: Matteo Croce Acked-by: Neil Horman Acked-by: Marcelo Ricardo Leitner Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/sctp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 1d13ec3f2707..eefdfa5abf6e 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -421,7 +421,7 @@ static inline void sctp_skb_set_owner_r(struct sk_buff *skb, struct sock *sk) /* * This mimics the behavior of skb_set_owner_r */ - sk->sk_forward_alloc -= event->rmem_len; + sk_mem_charge(sk, event->rmem_len); } /* Tests if the list has one and only one entry. */ -- cgit v1.2.3 From bfb35c27c65fce60a12e188135ae1344d1b89e24 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Fri, 12 Apr 2019 12:27:34 +0100 Subject: bpf: fix whitespace for ENCAP_L2 defines in bpf.h replace tab after #define with space in line with rest of definitions Signed-off-by: Alan Maguire Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c26be24fd5e2..704bb69514a2 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2792,14 +2792,14 @@ enum bpf_func_id { /* BPF_FUNC_skb_adjust_room flags. */ #define BPF_F_ADJ_ROOM_FIXED_GSO (1ULL << 0) -#define BPF_ADJ_ROOM_ENCAP_L2_MASK 0xff -#define BPF_ADJ_ROOM_ENCAP_L2_SHIFT 56 +#define BPF_ADJ_ROOM_ENCAP_L2_MASK 0xff +#define BPF_ADJ_ROOM_ENCAP_L2_SHIFT 56 #define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 (1ULL << 1) #define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 (1ULL << 2) #define BPF_F_ADJ_ROOM_ENCAP_L4_GRE (1ULL << 3) #define BPF_F_ADJ_ROOM_ENCAP_L4_UDP (1ULL << 4) -#define BPF_F_ADJ_ROOM_ENCAP_L2(len) (((__u64)len & \ +#define BPF_F_ADJ_ROOM_ENCAP_L2(len) (((__u64)len & \ BPF_ADJ_ROOM_ENCAP_L2_MASK) \ << BPF_ADJ_ROOM_ENCAP_L2_SHIFT) -- cgit v1.2.3 From 725721a6506eea53bfde81a34e91a06d6162c216 Mon Sep 17 00:00:00 2001 From: Viet Hoang Tran Date: Mon, 15 Apr 2019 09:54:55 +0000 Subject: bpf: allow clearing all sock_ops callback flags The helper function bpf_sock_ops_cb_flags_set() can be used to both set and clear the sock_ops callback flags. However, its current behavior is not consistent. BPF program may clear a flag if more than one were set, or replace a flag with another one, but cannot clear all flags. This patch also updates the documentation to clarify the ability to clear flags of this helper function. Signed-off-by: Hoang Tran Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 704bb69514a2..eaf2d3284248 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1737,12 +1737,19 @@ union bpf_attr { * error if an eBPF program tries to set a callback that is not * supported in the current kernel. * - * The supported callback values that *argval* can combine are: + * *argval* is a flag array which can combine these flags: * * * **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out) * * **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission) * * **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change) * + * Therefore, this function can be used to clear a callback flag by + * setting the appropriate bit to zero. e.g. to disable the RTO + * callback: + * + * **bpf_sock_ops_cb_flags_set(bpf_sock,** + * **bpf_sock->bpf_sock_ops_cb_flags & ~BPF_SOCK_OPS_RTO_CB_FLAG)** + * * Here are some examples of where one could call such eBPF * program: * -- cgit v1.2.3 From ba0509b6881efd0c8b26c36490cba87d8fb324c0 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 12 Apr 2019 17:07:37 +0200 Subject: net: core: introduce build_skb_around The function build_skb() also have the responsibility to allocate and clear the SKB structure. Introduce a new function build_skb_around(), that moves the responsibility of allocation and clearing to the caller. This allows caller to use kmem_cache (slab/slub) bulk allocation API. Next patch use this function combined with kmem_cache_alloc_bulk. Signed-off-by: Jesper Dangaard Brouer Acked-by: Song Liu Acked-by: Eric Dumazet Signed-off-by: Alexei Starovoitov --- include/linux/skbuff.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a06275a618f0..e81f2b0e8a83 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1042,6 +1042,8 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t priority, int flags, int node); struct sk_buff *__build_skb(void *data, unsigned int frag_size); struct sk_buff *build_skb(void *data, unsigned int frag_size); +struct sk_buff *build_skb_around(struct sk_buff *skb, + void *data, unsigned int frag_size); /** * alloc_skb - allocate a network buffer -- cgit v1.2.3 From b1d40991506aa9f1de310a2e74ef8e3bec6ba215 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 16 Apr 2019 14:35:59 -0700 Subject: ipv6: Rename fib6_multipath_select and pass fib6_result Add 'struct fib6_result' to hold the fib entry and fib6_nh from a fib lookup as separate entries, similar to what IPv4 now has with fib_result. Rename fib6_multipath_select to fib6_select_path, pass fib6_result to it, and set f6i and nh in the result once a path selection is done. Call fib6_select_path unconditionally for path selection which means moving the sibling and oif check to fib6_select_path. To handle the two different call paths (2 only call multipath_select if flowi6_oif == 0 and the other always calls it), add a new have_oif_match that controls the sibling walk if relevant. Update callers of fib6_multipath_select accordingly and have them use the fib6_info and fib6_nh from the result. This is needed for multipath nexthop objects where a single f6i can point to multiple fib6_nh (similar to IPv4). Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 13 ++++++++----- include/net/ipv6_stubs.h | 9 ++++----- 2 files changed, 12 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 2e9235adfa0d..c4d818041663 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -190,6 +190,11 @@ struct rt6_info { unsigned short rt6i_nfheader_len; }; +struct fib6_result { + struct fib6_nh *nh; + struct fib6_info *f6i; +}; + #define for_each_fib6_node_rt_rcu(fn) \ for (rt = rcu_dereference((fn)->leaf); rt; \ rt = rcu_dereference(rt->fib6_next)) @@ -391,11 +396,9 @@ struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, int strict); -struct fib6_info *fib6_multipath_select(const struct net *net, - struct fib6_info *match, - struct flowi6 *fl6, int oif, - const struct sk_buff *skb, int strict); - +void fib6_select_path(const struct net *net, struct fib6_result *res, + struct flowi6 *fl6, int oif, bool have_oif_match, + const struct sk_buff *skb, int strict); struct fib6_node *fib6_node_lookup(struct fib6_node *root, const struct in6_addr *daddr, const struct in6_addr *saddr); diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h index 453b55bf6723..5df36d6a2613 100644 --- a/include/net/ipv6_stubs.h +++ b/include/net/ipv6_stubs.h @@ -14,6 +14,7 @@ struct fib6_info; struct fib6_nh; struct fib6_config; +struct fib6_result; /* This is ugly, ideally these symbols should be built * into the core kernel. @@ -34,11 +35,9 @@ struct ipv6_stub { struct fib6_table *table, int oif, struct flowi6 *fl6, int flags); - struct fib6_info *(*fib6_multipath_select)(const struct net *net, - struct fib6_info *f6i, - struct flowi6 *fl6, int oif, - const struct sk_buff *skb, - int strict); + void (*fib6_select_path)(const struct net *net, struct fib6_result *res, + struct flowi6 *fl6, int oif, bool oif_match, + const struct sk_buff *skb, int strict); u32 (*ip6_mtu_from_fib6)(struct fib6_info *f6i, struct in6_addr *daddr, struct in6_addr *saddr); -- cgit v1.2.3 From b748f26092626332f73e71d75e4390de6b8bdf9b Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 16 Apr 2019 14:36:06 -0700 Subject: ipv6: Pass fib6_result to ip6_mtu_from_fib6 and fib6_mtu Change ip6_mtu_from_fib6 and fib6_mtu to take a fib6_result over a fib6_info. Update both to use the fib6_nh from fib6_result. Since the signature of ip6_mtu_from_fib6 is already changing, add const to daddr and saddr. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_route.h | 5 +++-- include/net/ipv6_stubs.h | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 5909fc421305..46bbd8ff9cc6 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -302,8 +302,9 @@ static inline unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst) return mtu; } -u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr, - struct in6_addr *saddr); +u32 ip6_mtu_from_fib6(const struct fib6_result *res, + const struct in6_addr *daddr, + const struct in6_addr *saddr); struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw, struct net_device *dev, struct sk_buff *skb, diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h index 5df36d6a2613..0d16b9ec0485 100644 --- a/include/net/ipv6_stubs.h +++ b/include/net/ipv6_stubs.h @@ -38,8 +38,9 @@ struct ipv6_stub { void (*fib6_select_path)(const struct net *net, struct fib6_result *res, struct flowi6 *fl6, int oif, bool oif_match, const struct sk_buff *skb, int strict); - u32 (*ip6_mtu_from_fib6)(struct fib6_info *f6i, struct in6_addr *daddr, - struct in6_addr *saddr); + u32 (*ip6_mtu_from_fib6)(const struct fib6_result *res, + const struct in6_addr *daddr, + const struct in6_addr *saddr); int (*fib6_nh_init)(struct net *net, struct fib6_nh *fib6_nh, struct fib6_config *cfg, gfp_t gfp_flags, -- cgit v1.2.3 From 8ff2e5b26cb84b1b0f502c0b7a3c62e4c4d86acc Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 16 Apr 2019 14:36:09 -0700 Subject: ipv6: Pass fib6_result to fib6_table_lookup tracepoint Change fib6_table_lookup tracepoint to take the fib6_result and use the fib6_info and fib6_nh from it. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/trace/events/fib6.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/trace/events/fib6.h b/include/trace/events/fib6.h index 6d05ebdd669c..70e252d926ea 100644 --- a/include/trace/events/fib6.h +++ b/include/trace/events/fib6.h @@ -12,10 +12,10 @@ TRACE_EVENT(fib6_table_lookup, - TP_PROTO(const struct net *net, const struct fib6_info *f6i, + TP_PROTO(const struct net *net, const struct fib6_result *res, struct fib6_table *table, const struct flowi6 *flp), - TP_ARGS(net, f6i, table, flp), + TP_ARGS(net, res, table, flp), TP_STRUCT__entry( __field( u32, tb_id ) @@ -39,7 +39,7 @@ TRACE_EVENT(fib6_table_lookup, struct in6_addr *in6; __entry->tb_id = table->tb6_id; - __entry->err = ip6_rt_type_to_error(f6i->fib6_type); + __entry->err = ip6_rt_type_to_error(res->f6i->fib6_type); __entry->oif = flp->flowi6_oif; __entry->iif = flp->flowi6_iif; __entry->tos = ip6_tclass(flp->flowlabel); @@ -62,20 +62,20 @@ TRACE_EVENT(fib6_table_lookup, __entry->dport = 0; } - if (f6i->fib6_nh.fib_nh_dev) { - __assign_str(name, f6i->fib6_nh.fib_nh_dev); + if (res->nh && res->nh->fib_nh_dev) { + __assign_str(name, res->nh->fib_nh_dev); } else { __assign_str(name, "-"); } - if (f6i == net->ipv6.fib6_null_entry) { + if (res->f6i == net->ipv6.fib6_null_entry) { struct in6_addr in6_zero = {}; in6 = (struct in6_addr *)__entry->gw; *in6 = in6_zero; - } else if (f6i) { + } else if (res->nh) { in6 = (struct in6_addr *)__entry->gw; - *in6 = f6i->fib6_nh.fib_nh_gw6; + *in6 = res->nh->fib_nh_gw6; } ), -- cgit v1.2.3 From effda4dd97e878ab83336bec7411cc41b5cc6d37 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 16 Apr 2019 14:36:10 -0700 Subject: ipv6: Pass fib6_result to fib lookups Change fib6_lookup and fib6_table_lookup to take a fib6_result and set f6i and nh rather than returning a fib6_info. For now both always return 0. A later patch set can make these more like the IPv4 counterparts and return EINVAL, EACCESS, etc based on fib6_type. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 9 +++++---- include/net/ipv6_stubs.h | 11 +++++------ 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index c4d818041663..cb3277cd1413 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -389,12 +389,13 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, /* called with rcu lock held; can return error pointer * caller needs to select path */ -struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, - int flags); +int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, + struct fib6_result *res, int flags); /* called with rcu lock held; caller needs to select path */ -struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table, - int oif, struct flowi6 *fl6, int strict); +int fib6_table_lookup(struct net *net, struct fib6_table *table, + int oif, struct flowi6 *fl6, struct fib6_result *res, + int strict); void fib6_select_path(const struct net *net, struct fib6_result *res, struct flowi6 *fl6, int oif, bool have_oif_match, diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h index 0d16b9ec0485..6c0c4fde16f8 100644 --- a/include/net/ipv6_stubs.h +++ b/include/net/ipv6_stubs.h @@ -29,12 +29,11 @@ struct ipv6_stub { int (*ipv6_route_input)(struct sk_buff *skb); struct fib6_table *(*fib6_get_table)(struct net *net, u32 id); - struct fib6_info *(*fib6_lookup)(struct net *net, int oif, - struct flowi6 *fl6, int flags); - struct fib6_info *(*fib6_table_lookup)(struct net *net, - struct fib6_table *table, - int oif, struct flowi6 *fl6, - int flags); + int (*fib6_lookup)(struct net *net, int oif, struct flowi6 *fl6, + struct fib6_result *res, int flags); + int (*fib6_table_lookup)(struct net *net, struct fib6_table *table, + int oif, struct flowi6 *fl6, + struct fib6_result *res, int flags); void (*fib6_select_path)(const struct net *net, struct fib6_result *res, struct flowi6 *fl6, int oif, bool oif_match, const struct sk_buff *skb, int strict); -- cgit v1.2.3 From 7d21fec90438941b44b699ae73673d2f8a3a9d76 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 16 Apr 2019 14:36:11 -0700 Subject: ipv6: Add fib6_type and fib6_flags to fib6_result Add the fib6_flags and fib6_type to fib6_result. Update the lookup helpers to set them and update post fib lookup users to use the version from the result. This allows nexthop objects to have blackhole nexthop. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 2 ++ include/trace/events/fib6.h | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index cb3277cd1413..6b7557b71c8c 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -193,6 +193,8 @@ struct rt6_info { struct fib6_result { struct fib6_nh *nh; struct fib6_info *f6i; + u32 fib6_flags; + u8 fib6_type; }; #define for_each_fib6_node_rt_rcu(fn) \ diff --git a/include/trace/events/fib6.h b/include/trace/events/fib6.h index 70e252d926ea..c6abdcc77c12 100644 --- a/include/trace/events/fib6.h +++ b/include/trace/events/fib6.h @@ -39,7 +39,7 @@ TRACE_EVENT(fib6_table_lookup, struct in6_addr *in6; __entry->tb_id = table->tb6_id; - __entry->err = ip6_rt_type_to_error(res->f6i->fib6_type); + __entry->err = ip6_rt_type_to_error(res->fib6_type); __entry->oif = flp->flowi6_oif; __entry->iif = flp->flowi6_iif; __entry->tos = ip6_tclass(flp->flowlabel); -- cgit v1.2.3 From b8fb1ab46169ac016a8552a6455bb0bfc401f8e2 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 16 Apr 2019 17:31:43 -0700 Subject: net ipv6: Prevent neighbor add if protocol is disabled on device Disabling IPv6 on an interface removes existing entries but nothing prevents new entries from being manually added. To that end, add a new neigh_table operation, allow_add, that is called on RTM_NEWNEIGH to see if neighbor entries are allowed on a given device. If IPv6 is disabled on the device, allow_add returns false and passes a message back to the user via extack. $ echo 1 > /proc/sys/net/ipv6/conf/eth1/disable_ipv6 $ ip -6 neigh add fe80::4c88:bff:fe21:2704 dev eth1 lladdr de:ad:be:ef:01:01 Error: IPv6 is disabled on this device. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/neighbour.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 3e5438bd0101..50a67bd6a434 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -205,6 +205,8 @@ struct neigh_table { int (*pconstructor)(struct pneigh_entry *); void (*pdestructor)(struct pneigh_entry *); void (*proxy_redo)(struct sk_buff *skb); + bool (*allow_add)(const struct net_device *dev, + struct netlink_ext_ack *extack); char *id; struct neigh_parms parms; struct list_head parms_list; -- cgit v1.2.3 From 0bc199854405543b0debe67c735c0aae94f1d319 Mon Sep 17 00:00:00 2001 From: Stephen Suryaputra Date: Wed, 17 Apr 2019 16:35:49 -0400 Subject: ipv6: Add rate limit mask for ICMPv6 messages To make ICMPv6 closer to ICMPv4, add ratemask parameter. Since the ICMP message types use larger numeric values, a simple bitmask doesn't fit. I use large bitmap. The input and output are the in form of list of ranges. Set the default to rate limit all error messages but Packet Too Big. For Packet Too Big, use ratemask instead of hard-coded. There are functions where icmpv6_xrlim_allow() and icmpv6_global_allow() aren't called. This patch only adds them to icmpv6_echo_reply(). Rate limiting error messages is mandated by RFC 4443 but RFC 4890 says that it is also acceptable to rate limit informational messages. Thus, I removed the current hard-coded behavior of icmpv6_mask_allow() that doesn't rate limit informational messages. v2: Add dummy function proc_do_large_bitmap() if CONFIG_PROC_SYSCTL isn't defined, expand the description in ip-sysctl.txt and remove unnecessary conditional before kfree(). v3: Inline the bitmap instead of dynamically allocated. Still is a pointer to it is needed because of the way proc_do_large_bitmap work. Signed-off-by: Stephen Suryaputra Signed-off-by: David S. Miller --- include/net/netns/ipv6.h | 3 +++ include/uapi/linux/icmpv6.h | 4 ++++ 2 files changed, 7 insertions(+) (limited to 'include') diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 64e29b58bb5e..5e61b5a8635d 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -8,6 +8,7 @@ #ifndef __NETNS_IPV6_H__ #define __NETNS_IPV6_H__ #include +#include struct ctl_table_header; @@ -35,6 +36,8 @@ struct netns_sysctl_ipv6 { int icmpv6_echo_ignore_all; int icmpv6_echo_ignore_multicast; int icmpv6_echo_ignore_anycast; + DECLARE_BITMAP(icmpv6_ratemask, ICMPV6_MSG_MAX + 1); + unsigned long *icmpv6_ratemask_ptr; int anycast_src_echo_reply; int ip_nonlocal_bind; int fwmark_reflect; diff --git a/include/uapi/linux/icmpv6.h b/include/uapi/linux/icmpv6.h index 325395f56bfa..2622b5a3e616 100644 --- a/include/uapi/linux/icmpv6.h +++ b/include/uapi/linux/icmpv6.h @@ -90,6 +90,8 @@ struct icmp6hdr { #define ICMPV6_TIME_EXCEED 3 #define ICMPV6_PARAMPROB 4 +#define ICMPV6_ERRMSG_MAX 127 + #define ICMPV6_INFOMSG_MASK 0x80 #define ICMPV6_ECHO_REQUEST 128 @@ -110,6 +112,8 @@ struct icmp6hdr { #define ICMPV6_MRDISC_ADV 151 +#define ICMPV6_MSG_MAX 255 + /* * Codes for Destination Unreachable */ -- cgit v1.2.3 From a06eaaf7913cab9dd6cb8ece4f78cfd7a802872a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 17 Apr 2019 13:51:59 -0700 Subject: net: skb: remove unused asserts We are discouraging the use of BUG() these days, remove the unused ASSERT macros from skbuff.h. Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a06275a618f0..e4ee92089dd6 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2100,8 +2100,6 @@ void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size, unsigned int truesize); -#define SKB_PAGE_ASSERT(skb) BUG_ON(skb_shinfo(skb)->nr_frags) -#define SKB_FRAG_ASSERT(skb) BUG_ON(skb_has_frag_list(skb)) #define SKB_LINEAR_ASSERT(skb) BUG_ON(skb_is_nonlinear(skb)) #ifdef NET_SKBUFF_DATA_USES_OFFSET -- cgit v1.2.3 From 71dd6c0dff51b5f1fef2e9dfa6f6a948aac975f3 Mon Sep 17 00:00:00 2001 From: David Bauer Date: Wed, 17 Apr 2019 23:59:21 +0200 Subject: net: phy: add support for reset-controller This commit adds support for PHY reset pins handled by a reset controller. Signed-off-by: David Bauer Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/mdio.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 3e99ae3ed87f..6eaf71500ef6 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -40,6 +40,7 @@ struct mdio_device { int addr; int flags; struct gpio_desc *reset; + struct reset_control *reset_ctrl; unsigned int reset_assert_delay; unsigned int reset_deassert_delay; }; -- cgit v1.2.3 From 6110ed2db3a41f3b9d676e58ac3d4637c2b497c4 Mon Sep 17 00:00:00 2001 From: David Bauer Date: Wed, 17 Apr 2019 23:59:22 +0200 Subject: net: mdio: rename mdio_device reset to reset_gpio This renames the GPIO reset of mdio devices from 'reset' to 'reset_gpio' to better differentiate between GPIO and reset-controller driven reset line. Signed-off-by: David Bauer Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/mdio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 6eaf71500ef6..9dc16d5705a1 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -39,7 +39,7 @@ struct mdio_device { /* Bus address of the MDIO device (0-31) */ int addr; int flags; - struct gpio_desc *reset; + struct gpio_desc *reset_gpio; struct reset_control *reset_ctrl; unsigned int reset_assert_delay; unsigned int reset_deassert_delay; -- cgit v1.2.3 From 8c8b3458d0b91b2230f76fbe1b0280568f10d19f Mon Sep 17 00:00:00 2001 From: Mike Manning Date: Thu, 18 Apr 2019 18:35:31 +0100 Subject: vlan: support binding link state to vlan member bridge ports In the case of vlan filtering on bridges, the bridge may also have the corresponding vlan devices as upper devices. Currently the link state of vlan devices is transferred from the lower device. So this is up if the bridge is in admin up state and there is at least one bridge port that is up, regardless of the vlan that the port is a member of. The link state of the vlan device may need to track only the state of the subset of ports that are also members of the corresponding vlan, rather than that of all ports. Add a flag to specify a vlan bridge binding mode, by which the link state is no longer automatically transferred from the lower device, but is instead determined by the bridge ports that are members of the vlan. Signed-off-by: Mike Manning Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_vlan.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/if_vlan.h b/include/uapi/linux/if_vlan.h index 7a0e8bd65b6b..90a2c89afc8f 100644 --- a/include/uapi/linux/if_vlan.h +++ b/include/uapi/linux/if_vlan.h @@ -32,10 +32,11 @@ enum vlan_ioctl_cmds { }; enum vlan_flags { - VLAN_FLAG_REORDER_HDR = 0x1, - VLAN_FLAG_GVRP = 0x2, - VLAN_FLAG_LOOSE_BINDING = 0x4, - VLAN_FLAG_MVRP = 0x8, + VLAN_FLAG_REORDER_HDR = 0x1, + VLAN_FLAG_GVRP = 0x2, + VLAN_FLAG_LOOSE_BINDING = 0x4, + VLAN_FLAG_MVRP = 0x8, + VLAN_FLAG_BRIDGE_BINDING = 0x10, }; enum vlan_name_types { -- cgit v1.2.3 From c7cbdbf29f488a19982cd9f4a109887f18028bbb Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 17 Apr 2019 22:51:48 +0200 Subject: net: rework SIOCGSTAMP ioctl handling The SIOCGSTAMP/SIOCGSTAMPNS ioctl commands are implemented by many socket protocol handlers, and all of those end up calling the same sock_get_timestamp()/sock_get_timestampns() helper functions, which results in a lot of duplicate code. With the introduction of 64-bit time_t on 32-bit architectures, this gets worse, as we then need four different ioctl commands in each socket protocol implementation. To simplify that, let's add a new .gettstamp() operation in struct proto_ops, and move ioctl implementation into the common sock_ioctl()/compat_sock_ioctl_trans() functions that these all go through. We can reuse the sock_get_timestamp() implementation, but generalize it so it can deal with both native and compat mode, as well as timeval and timespec structures. Acked-by: Stefan Schmidt Acked-by: Neil Horman Acked-by: Marc Kleine-Budde Link: https://lore.kernel.org/lkml/CAK8P3a038aDQQotzua_QtKGhq8O9n+rdiz2=WDCp82ys8eUT+A@mail.gmail.com/ Signed-off-by: Arnd Bergmann Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/net.h | 2 ++ include/net/compat.h | 3 --- include/net/sock.h | 4 ++-- 3 files changed, 4 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/net.h b/include/linux/net.h index c606c72311d0..50bf5206ead6 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -161,6 +161,8 @@ struct proto_ops { int (*compat_ioctl) (struct socket *sock, unsigned int cmd, unsigned long arg); #endif + int (*gettstamp) (struct socket *sock, void __user *userstamp, + bool timeval, bool time32); int (*listen) (struct socket *sock, int len); int (*shutdown) (struct socket *sock, int flags); int (*setsockopt)(struct socket *sock, int level, diff --git a/include/net/compat.h b/include/net/compat.h index 4c6d75612b6c..f277653c7e17 100644 --- a/include/net/compat.h +++ b/include/net/compat.h @@ -30,9 +30,6 @@ struct compat_cmsghdr { compat_int_t cmsg_type; }; -int compat_sock_get_timestamp(struct sock *, struct timeval __user *); -int compat_sock_get_timestampns(struct sock *, struct timespec __user *); - #else /* defined(CONFIG_COMPAT) */ /* * To avoid compiler warnings: diff --git a/include/net/sock.h b/include/net/sock.h index bdd77bbce7d8..784cd19d5ff7 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1614,6 +1614,8 @@ int sock_setsockopt(struct socket *sock, int level, int op, int sock_getsockopt(struct socket *sock, int level, int op, char __user *optval, int __user *optlen); +int sock_gettstamp(struct socket *sock, void __user *userstamp, + bool timeval, bool time32); struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, int noblock, int *errcode); struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, @@ -2503,8 +2505,6 @@ static inline bool sk_listener(const struct sock *sk) } void sock_enable_timestamp(struct sock *sk, int flag); -int sock_get_timestamp(struct sock *, struct timeval __user *); -int sock_get_timestampns(struct sock *, struct timespec __user *); int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level, int type); -- cgit v1.2.3 From 0768e17073dc527ccd18ed5f96ce85f9985e9115 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 17 Apr 2019 22:56:11 +0200 Subject: net: socket: implement 64-bit timestamps The 'timeval' and 'timespec' data structures used for socket timestamps are going to be redefined in user space based on 64-bit time_t in future versions of the C library to deal with the y2038 overflow problem, which breaks the ABI definition. Unlike many modern ioctl commands, SIOCGSTAMP and SIOCGSTAMPNS do not use the _IOR() macro to encode the size of the transferred data, so it remains ambiguous whether the application uses the old or new layout. The best workaround I could find is rather ugly: we redefine the command code based on the size of the respective data structure with a ternary operator. This lets it get evaluated as late as possible, hopefully after that structure is visible to the caller. We cannot use an #ifdef here, because inux/sockios.h might have been included before any libc header that could determine the size of time_t. The ioctl implementation now interprets the new command codes as always referring to the 64-bit structure on all architectures, while the old architecture specific command code still refers to the old architecture specific layout. The new command number is only used when they are actually different. Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- include/uapi/asm-generic/sockios.h | 4 ++-- include/uapi/linux/sockios.h | 21 +++++++++++++++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/asm-generic/sockios.h b/include/uapi/asm-generic/sockios.h index 64f658c7cec2..44fa3ed70483 100644 --- a/include/uapi/asm-generic/sockios.h +++ b/include/uapi/asm-generic/sockios.h @@ -8,7 +8,7 @@ #define FIOGETOWN 0x8903 #define SIOCGPGRP 0x8904 #define SIOCATMARK 0x8905 -#define SIOCGSTAMP 0x8906 /* Get stamp (timeval) */ -#define SIOCGSTAMPNS 0x8907 /* Get stamp (timespec) */ +#define SIOCGSTAMP_OLD 0x8906 /* Get stamp (timeval) */ +#define SIOCGSTAMPNS_OLD 0x8907 /* Get stamp (timespec) */ #endif /* __ASM_GENERIC_SOCKIOS_H */ diff --git a/include/uapi/linux/sockios.h b/include/uapi/linux/sockios.h index d393e9ed3964..7d1bccbbef78 100644 --- a/include/uapi/linux/sockios.h +++ b/include/uapi/linux/sockios.h @@ -19,6 +19,7 @@ #ifndef _LINUX_SOCKIOS_H #define _LINUX_SOCKIOS_H +#include #include /* Linux-specific socket ioctls */ @@ -27,6 +28,26 @@ #define SOCK_IOC_TYPE 0x89 +/* + * the timeval/timespec data structure layout is defined by libc, + * so we need to cover both possible versions on 32-bit. + */ +/* Get stamp (timeval) */ +#define SIOCGSTAMP_NEW _IOR(SOCK_IOC_TYPE, 0x06, long long[2]) +/* Get stamp (timespec) */ +#define SIOCGSTAMPNS_NEW _IOR(SOCK_IOC_TYPE, 0x07, long long[2]) + +#if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__)) +/* on 64-bit and x32, avoid the ?: operator */ +#define SIOCGSTAMP SIOCGSTAMP_OLD +#define SIOCGSTAMPNS SIOCGSTAMPNS_OLD +#else +#define SIOCGSTAMP ((sizeof(struct timeval)) == 8 ? \ + SIOCGSTAMP_OLD : SIOCGSTAMP_NEW) +#define SIOCGSTAMPNS ((sizeof(struct timespec)) == 8 ? \ + SIOCGSTAMPNS_OLD : SIOCGSTAMPNS_NEW) +#endif + /* Routing table calls. */ #define SIOCADDRT 0x890B /* add routing table entry */ #define SIOCDELRT 0x890C /* delete routing table entry */ -- cgit v1.2.3 From 42e5425aa0dfd8a6cdd7e177cfd9703df05c7411 Mon Sep 17 00:00:00 2001 From: Tung Nguyen Date: Thu, 18 Apr 2019 21:02:19 +0700 Subject: tipc: introduce new socket option TIPC_SOCK_RECVQ_USED When using TIPC_SOCK_RECVQ_DEPTH for getsockopt(), it returns the number of buffers in receive socket buffer which is not so helpful for user space applications. This commit introduces the new option TIPC_SOCK_RECVQ_USED which returns the current allocated bytes of the receive socket buffer. This helps user space applications dimension its buffer usage to avoid buffer overload issue. Signed-off-by: Tung Nguyen Acked-by: Jon Maloy Signed-off-by: David S. Miller --- include/uapi/linux/tipc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h index 6b2fd4d9655f..7df026ea6aff 100644 --- a/include/uapi/linux/tipc.h +++ b/include/uapi/linux/tipc.h @@ -190,6 +190,7 @@ struct sockaddr_tipc { #define TIPC_MCAST_REPLICAST 134 /* Default: TIPC selects. No arg */ #define TIPC_GROUP_JOIN 135 /* Takes struct tipc_group_req* */ #define TIPC_GROUP_LEAVE 136 /* No argument */ +#define TIPC_SOCK_RECVQ_USED 137 /* Default: none (read only) */ /* * Flag values -- cgit v1.2.3 From 4e54507ab1a9da05238b986292f6cb702e6696c7 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sun, 21 Apr 2019 08:49:01 -0700 Subject: ipv6: Simplify rt6_qualify_for_ecmp After commit c7a1ce397ada ("ipv6: Change addrconf_f6i_alloc to use ip6_route_info_create"), the gateway is no longer filled in for fib6_nh structs in a prefix route. Accordingly, the RTF_ADDRCONF flag check can be dropped from the 'rt6_qualify_for_ecmp'. Further, RTF_DYNAMIC is only set in rt6_info instances, so it can be removed from the check as well. This reduces rt6_qualify_for_ecmp and the mlxsw version to just checking if the nexthop has a gateway which is the real indication of whether entries can be coalesced into a multipath route. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_route.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 46bbd8ff9cc6..518d97fbe074 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -68,8 +68,7 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr) static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i) { - return !(f6i->fib6_flags & (RTF_ADDRCONF|RTF_DYNAMIC)) && - f6i->fib6_nh.fib_nh_gw_family; + return f6i->fib6_nh.fib_nh_gw_family; } void ip6_route_input(struct sk_buff *skb); -- cgit v1.2.3 From be659b8d3c79afc54e087ebf8d849685d7b0d395 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sun, 21 Apr 2019 17:39:18 -0700 Subject: ipv6: Restore RTF_ADDRCONF check in rt6_qualify_for_ecmp The RTF_ADDRCONF flag filters out routes added by RA's in determining which routes can be appended to an existing one to create a multipath route. Restore the flag check and add a comment to document the RA piece. Fixes: 4e54507ab1a9 ("ipv6: Simplify rt6_qualify_for_ecmp") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_route.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 518d97fbe074..df9cebc2b20c 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -68,7 +68,9 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr) static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i) { - return f6i->fib6_nh.fib_nh_gw_family; + /* the RTF_ADDRCONF flag filters out RA's */ + return !(f6i->fib6_flags & RTF_ADDRCONF) && + f6i->fib6_nh.fib_nh_gw_family; } void ip6_route_input(struct sk_buff *skb); -- cgit v1.2.3 From 7df737e991069d75eec1ded1c8b37e81b8c54df9 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 19 Apr 2019 07:44:54 -0700 Subject: bpf: remove global variables Move three global variables protected by bpf_verifier_lock into 'struct bpf_verifier_env' to allow parallel verification. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index b3ab61fe1932..1305ccbd8fe6 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -295,6 +295,11 @@ struct bpf_verifier_env { const struct bpf_line_info *prev_linfo; struct bpf_verifier_log log; struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS + 1]; + struct { + int *insn_state; + int *insn_stack; + int cur_stack; + } cfg; u32 subprog_cnt; /* number of instructions analyzed by the verifier */ u32 insn_processed; -- cgit v1.2.3 From 0b13c9bb96f6edf03018030f07929a16b4dd77a6 Mon Sep 17 00:00:00 2001 From: "Daniel T. Lee" Date: Sun, 21 Apr 2019 00:50:42 +0900 Subject: include/net/tcp.h: whitespace cleanup at tcp_v4_check This patch makes trivial whitespace fix to the function tcp_v4_check at include/net/tcp.h file. It has stylistic issue, which is "space required after that ','" and it can be confirmed with ./scripts/checkpatch.pl tool. ERROR: space required after that ',' (ctx:VxV) #29: FILE: include/net/tcp.h:1317: + return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base); ^ Signed-off-by: Daniel T. Lee Signed-off-by: David S. Miller --- include/net/tcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 68ee02523b87..7cf1181630a3 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1314,7 +1314,7 @@ static inline void tcp_update_wl(struct tcp_sock *tp, u32 seq) static inline __sum16 tcp_v4_check(int len, __be32 saddr, __be32 daddr, __wsum base) { - return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base); + return csum_tcpudp_magic(saddr, daddr, len, IPPROTO_TCP, base); } static inline bool tcp_checksum_complete(struct sk_buff *skb) -- cgit v1.2.3 From 7e5f4cdb284be5ff862f84ccda084e2847f73fbb Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 20 Apr 2019 09:27:27 -0700 Subject: ipv6: Remove fib6_info_nh_lwt fib6_info_nh_lwt is no longer used; remove it. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 6b7557b71c8c..352f767bea81 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -450,12 +450,6 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, struct netlink_ext_ack *extack); void fib6_nh_release(struct fib6_nh *fib6_nh); -static inline -struct lwtunnel_state *fib6_info_nh_lwt(const struct fib6_info *f6i) -{ - return f6i->fib6_nh.fib_nh_lws; -} - void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, unsigned int flags); -- cgit v1.2.3 From 3c618c1dbb8859625c643121ac80af9a6723533f Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sat, 20 Apr 2019 09:28:20 -0700 Subject: net: Rename net/nexthop.h net/rtnh.h The header contains rtnh_ macros so rename the file accordingly. Allows a later patch to use the nexthop.h name for the new nexthop code. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/nexthop.h | 34 ---------------------------------- include/net/rtnh.h | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 34 deletions(-) delete mode 100644 include/net/nexthop.h create mode 100644 include/net/rtnh.h (limited to 'include') diff --git a/include/net/nexthop.h b/include/net/nexthop.h deleted file mode 100644 index 902ff382a6dc..000000000000 --- a/include/net/nexthop.h +++ /dev/null @@ -1,34 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __NET_NEXTHOP_H -#define __NET_NEXTHOP_H - -#include -#include - -static inline int rtnh_ok(const struct rtnexthop *rtnh, int remaining) -{ - return remaining >= (int)sizeof(*rtnh) && - rtnh->rtnh_len >= sizeof(*rtnh) && - rtnh->rtnh_len <= remaining; -} - -static inline struct rtnexthop *rtnh_next(const struct rtnexthop *rtnh, - int *remaining) -{ - int totlen = NLA_ALIGN(rtnh->rtnh_len); - - *remaining -= totlen; - return (struct rtnexthop *) ((char *) rtnh + totlen); -} - -static inline struct nlattr *rtnh_attrs(const struct rtnexthop *rtnh) -{ - return (struct nlattr *) ((char *) rtnh + NLA_ALIGN(sizeof(*rtnh))); -} - -static inline int rtnh_attrlen(const struct rtnexthop *rtnh) -{ - return rtnh->rtnh_len - NLA_ALIGN(sizeof(*rtnh)); -} - -#endif diff --git a/include/net/rtnh.h b/include/net/rtnh.h new file mode 100644 index 000000000000..aa2cfc508f7c --- /dev/null +++ b/include/net/rtnh.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __NET_RTNH_H +#define __NET_RTNH_H + +#include +#include + +static inline int rtnh_ok(const struct rtnexthop *rtnh, int remaining) +{ + return remaining >= (int)sizeof(*rtnh) && + rtnh->rtnh_len >= sizeof(*rtnh) && + rtnh->rtnh_len <= remaining; +} + +static inline struct rtnexthop *rtnh_next(const struct rtnexthop *rtnh, + int *remaining) +{ + int totlen = NLA_ALIGN(rtnh->rtnh_len); + + *remaining -= totlen; + return (struct rtnexthop *) ((char *) rtnh + totlen); +} + +static inline struct nlattr *rtnh_attrs(const struct rtnexthop *rtnh) +{ + return (struct nlattr *) ((char *) rtnh + NLA_ALIGN(sizeof(*rtnh))); +} + +static inline int rtnh_attrlen(const struct rtnexthop *rtnh) +{ + return rtnh->rtnh_len - NLA_ALIGN(sizeof(*rtnh)); +} + +#endif -- cgit v1.2.3 From a79eda3aaf30bc73752487fff5160cf67e99e313 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sat, 20 Apr 2019 23:29:42 -0400 Subject: net: psample: drop include of module.h from psample.h Ideally, header files under include/linux shouldn't be adding includes of other headers, in anticipation of their consumers, but just the headers needed for the header itself to pass parsing with CPP. The module.h is particularly bad in this sense, as it itself does include a whole bunch of other headers, due to the complexity of module support. There doesn't appear to be anything in psample.h that is module related, and build coverage doesn't appear to show any other files/drivers relying implicitly on getting it from here. So it appears we are simply free to just remove it in this case. Cc: Yotam Gigi Cc: "David S. Miller" Signed-off-by: Paul Gortmaker Signed-off-by: David S. Miller --- include/net/psample.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/psample.h b/include/net/psample.h index 9b80f814ab04..37a4df2325b2 100644 --- a/include/net/psample.h +++ b/include/net/psample.h @@ -3,7 +3,6 @@ #define __NET_PSAMPLE_H #include -#include #include struct psample_group { -- cgit v1.2.3 From c517796ea91d11dd3f0ae7ff61a12fe5d4941eb0 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sat, 20 Apr 2019 23:29:43 -0400 Subject: net: ife: drop include of module.h from net/ife.h Ideally, header files under include/linux shouldn't be adding includes of other headers, in anticipation of their consumers, but just the headers needed for the header itself to pass parsing with CPP. The module.h is particularly bad in this sense, as it itself does include a whole bunch of other headers, due to the complexity of module support. There doesn't appear to be anything in net/ife.h that is module related, and build coverage doesn't appear to show any other files/drivers relying implicitly on getting it from here. So it appears we are simply free to just remove it in this case. Cc: Yotam Gigi Cc: Jamal Hadi Salim Cc: "David S. Miller" Signed-off-by: Paul Gortmaker Signed-off-by: David S. Miller --- include/net/ife.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/ife.h b/include/net/ife.h index e117617e3c34..7e2538d8585b 100644 --- a/include/net/ife.h +++ b/include/net/ife.h @@ -4,7 +4,6 @@ #include #include -#include #include #if IS_ENABLED(CONFIG_NET_IFE) -- cgit v1.2.3 From 113e63286697893127c3ee83471b45ad0cf8d75f Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sat, 20 Apr 2019 23:29:44 -0400 Subject: net: fib: drop include of module.h from fib_notifier.h Ideally, header files under include/linux shouldn't be adding includes of other headers, in anticipation of their consumers, but just the headers needed for the header itself to pass parsing with CPP. The module.h is particularly bad in this sense, as it itself does include a whole bunch of other headers, due to the complexity of module support. Since fib_notifier.h is not going into a module struct looking for specific fields, we can just let it know that module is a struct, just like about 60 other include/linux headers already do. Cc: "David S. Miller" Signed-off-by: Paul Gortmaker Signed-off-by: David S. Miller --- include/net/fib_notifier.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/fib_notifier.h b/include/net/fib_notifier.h index c91ec732afd6..c49d7bfb5c30 100644 --- a/include/net/fib_notifier.h +++ b/include/net/fib_notifier.h @@ -2,10 +2,11 @@ #define __NET_FIB_NOTIFIER_H #include -#include #include #include +struct module; + struct fib_notifier_info { struct net *net; int family; -- cgit v1.2.3 From a130f9b27545f56880034c345da9a4efc2ba2b24 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sat, 20 Apr 2019 23:29:45 -0400 Subject: net: tc_act: drop include of module.h from tc_ife.h Ideally, header files under include/linux shouldn't be adding includes of other headers, in anticipation of their consumers, but just the headers needed for the header itself to pass parsing with CPP. The module.h is particularly bad in this sense, as it itself does include a whole bunch of other headers, due to the complexity of module support. Since tc_ife.h is not going into a module struct looking for specific fields, we can just let it know that module is a struct, just like about 60 other include/linux headers already do. Cc: Jamal Hadi Salim Cc: Cong Wang Cc: Jiri Pirko Cc: "David S. Miller" Signed-off-by: Paul Gortmaker Signed-off-by: David S. Miller --- include/net/tc_act/tc_ife.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/tc_act/tc_ife.h b/include/net/tc_act/tc_ife.h index 86d13b01b39d..c7f24a2da1ca 100644 --- a/include/net/tc_act/tc_ife.h +++ b/include/net/tc_act/tc_ife.h @@ -5,7 +5,8 @@ #include #include #include -#include + +struct module; struct tcf_ife_params { u8 eth_dst[ETH_ALEN]; -- cgit v1.2.3 From f2ad1a522e9817fba7799008e0a8dc6f8a32bf7d Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 22 Apr 2019 12:08:39 +0000 Subject: net: devlink: Add extack to shared buffer operations Add extack to shared buffer set operations, so that meaningful error messages could be propagated to the user. Signed-off-by: Ido Schimmel Acked-by: Jiri Pirko Reviewed-by: Petr Machata Cc: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/devlink.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 70c7d1ac8344..4f5e41613503 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -491,13 +491,14 @@ struct devlink_ops { struct devlink_sb_pool_info *pool_info); int (*sb_pool_set)(struct devlink *devlink, unsigned int sb_index, u16 pool_index, u32 size, - enum devlink_sb_threshold_type threshold_type); + enum devlink_sb_threshold_type threshold_type, + struct netlink_ext_ack *extack); int (*sb_port_pool_get)(struct devlink_port *devlink_port, unsigned int sb_index, u16 pool_index, u32 *p_threshold); int (*sb_port_pool_set)(struct devlink_port *devlink_port, unsigned int sb_index, u16 pool_index, - u32 threshold); + u32 threshold, struct netlink_ext_ack *extack); int (*sb_tc_pool_bind_get)(struct devlink_port *devlink_port, unsigned int sb_index, u16 tc_index, @@ -507,7 +508,8 @@ struct devlink_ops { unsigned int sb_index, u16 tc_index, enum devlink_sb_pool_type pool_type, - u16 pool_index, u32 threshold); + u16 pool_index, u32 threshold, + struct netlink_ext_ack *extack); int (*sb_occ_snapshot)(struct devlink *devlink, unsigned int sb_index); int (*sb_occ_max_clear)(struct devlink *devlink, -- cgit v1.2.3 From f24ea52873c726bf7b54318f00ec45050222b367 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 16 Apr 2019 16:44:37 +0200 Subject: xfrm: remove tos indirection from afinfo_policy Only used by ipv4, we can read the fl4 tos value directly instead. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 77eb578a0384..652da5861772 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -329,7 +329,6 @@ struct xfrm_policy_afinfo { void (*decode_session)(struct sk_buff *skb, struct flowi *fl, int reverse); - int (*get_tos)(const struct flowi *fl); int (*init_path)(struct xfrm_dst *path, struct dst_entry *dst, int nfheader_len); -- cgit v1.2.3 From 2e8b4aa816eaaf480fe68b1086614259caf1bf3c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 16 Apr 2019 16:44:38 +0200 Subject: xfrm: remove init_path indirection from afinfo_policy handle this directly, its only used by ipv6. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 652da5861772..b8de1622141a 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -329,9 +329,6 @@ struct xfrm_policy_afinfo { void (*decode_session)(struct sk_buff *skb, struct flowi *fl, int reverse); - int (*init_path)(struct xfrm_dst *path, - struct dst_entry *dst, - int nfheader_len); int (*fill_dst)(struct xfrm_dst *xdst, struct net_device *dev, const struct flowi *fl); -- cgit v1.2.3 From c53ac41e3720926301c623d6682bb87ce992a3b3 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 16 Apr 2019 16:44:39 +0200 Subject: xfrm: remove decode_session indirection from afinfo_policy No external dependencies, might as well handle this directly. xfrm_afinfo_policy is now 40 bytes on x86_64. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index b8de1622141a..18d6b33501b9 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -326,9 +326,6 @@ struct xfrm_policy_afinfo { xfrm_address_t *saddr, xfrm_address_t *daddr, u32 mark); - void (*decode_session)(struct sk_buff *skb, - struct flowi *fl, - int reverse); int (*fill_dst)(struct xfrm_dst *xdst, struct net_device *dev, const struct flowi *fl); -- cgit v1.2.3 From bb9cd077e216b886438c5698e1cd75f762ecd3c9 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 17 Apr 2019 11:45:13 +0200 Subject: xfrm: remove unneeded export_symbols None of them have any external callers, make them static. Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 18d6b33501b9..eb5018b1cf9c 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1568,7 +1568,6 @@ static inline int xfrm4_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi) int xfrm4_extract_output(struct xfrm_state *x, struct sk_buff *skb); int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb); int xfrm4_output_finish(struct sock *sk, struct sk_buff *skb); -int xfrm4_rcv_cb(struct sk_buff *skb, u8 protocol, int err); int xfrm4_protocol_register(struct xfrm4_protocol *handler, unsigned char protocol); int xfrm4_protocol_deregister(struct xfrm4_protocol *handler, unsigned char protocol); int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family); @@ -1584,7 +1583,6 @@ int xfrm6_rcv(struct sk_buff *skb); int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr, xfrm_address_t *saddr, u8 proto); void xfrm6_local_error(struct sk_buff *skb, u32 mtu); -int xfrm6_rcv_cb(struct sk_buff *skb, u8 protocol, int err); int xfrm6_protocol_register(struct xfrm6_protocol *handler, unsigned char protocol); int xfrm6_protocol_deregister(struct xfrm6_protocol *handler, unsigned char protocol); int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family); -- cgit v1.2.3 From 756e161993824961fad4ba62c40045d9ab65bdb8 Mon Sep 17 00:00:00 2001 From: Sean Wang Date: Fri, 8 Mar 2019 09:15:43 +0800 Subject: mmc: add SDIO identifiers for MediaTek Bluetooth devices The SDIO identifier for MediaTek Bluetooth devices were defined in the MediaTek Bluetooth driver. Moving the definitions in MMC header file seems common sense. Signed-off-by: Sean Wang Signed-off-by: Marcel Holtmann --- include/linux/mmc/sdio_ids.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h index 4332199c71c2..d1a5d5df02f5 100644 --- a/include/linux/mmc/sdio_ids.h +++ b/include/linux/mmc/sdio_ids.h @@ -59,6 +59,8 @@ #define SDIO_DEVICE_ID_MARVELL_8797_F0 0x9128 #define SDIO_DEVICE_ID_MARVELL_8887WLAN 0x9134 +#define SDIO_VENDOR_ID_MEDIATEK 0x037a + #define SDIO_VENDOR_ID_SIANO 0x039a #define SDIO_DEVICE_ID_SIANO_NOVA_B0 0x0201 #define SDIO_DEVICE_ID_SIANO_NICE 0x0202 -- cgit v1.2.3 From db0a390835209c1c5dce7669de3d23a8cba10f34 Mon Sep 17 00:00:00 2001 From: Sean Wang Date: Thu, 14 Mar 2019 05:01:58 +0800 Subject: mmc: sdio: Add helper macro for sdio_driver boilerplate This patch introduces the module_sdio_driver macro which is a convenience macro for SDIO driver modules similar to module_usb_driver. It is intended to be used by drivers which init/exit section does nothing but register/ unregister the SDIO driver. By using this macro it is possible to eliminate a few lines of boilerplate code per SDIO driver. Suggested-by: Marcel Holtmann Signed-off-by: Sean Wang Acked-by: Ulf Hansson Signed-off-by: Marcel Holtmann --- include/linux/mmc/sdio_func.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/mmc/sdio_func.h b/include/linux/mmc/sdio_func.h index 97ca105347a6..5685805533b5 100644 --- a/include/linux/mmc/sdio_func.h +++ b/include/linux/mmc/sdio_func.h @@ -111,6 +111,18 @@ struct sdio_driver { extern int sdio_register_driver(struct sdio_driver *); extern void sdio_unregister_driver(struct sdio_driver *); +/** + * module_sdio_driver() - Helper macro for registering a SDIO driver + * @__sdio_driver: sdio_driver struct + * + * Helper macro for SDIO drivers which do not do anything special in module + * init/exit. This eliminates a lot of boilerplate. Each module may only + * use this macro once, and calling it replaces module_init() and module_exit() + */ +#define module_sdio_driver(__sdio_driver) \ + module_driver(__sdio_driver, sdio_register_driver, \ + sdio_unregister_driver) + /* * SDIO I/O operations */ -- cgit v1.2.3 From 089b19a9204fc090793d389a265f54124eacb05d Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 22 Apr 2019 08:55:44 -0700 Subject: flow_dissector: switch kernel context to struct bpf_flow_dissector struct bpf_flow_dissector has a small subset of sk_buff fields that flow dissector BPF program is allowed to access and an optional pointer to real skb. Real skb is used only in bpf_skb_load_bytes helper to read non-linear data. The real motivation for this is to be able to call flow dissector from eth_get_headlen context where we don't have an skb and need to dissect raw bytes. Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- include/linux/skbuff.h | 4 ++++ include/net/flow_dissector.h | 7 +++++++ include/net/sch_generic.h | 11 ++++------- 3 files changed, 15 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 6f42942a443b..2b7b8228c5c3 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1275,6 +1275,10 @@ static inline int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr) } #endif +struct bpf_flow_dissector; +bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, + __be16 proto, int nhoff, int hlen); + struct bpf_flow_keys; bool __skb_flow_bpf_dissect(struct bpf_prog *prog, const struct sk_buff *skb, diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 2b26979efb48..7c5a8d9a8d2a 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -305,4 +305,11 @@ static inline void *skb_flow_dissector_target(struct flow_dissector *flow_dissec return ((char *)target_container) + flow_dissector->offset[key_id]; } +struct bpf_flow_dissector { + struct bpf_flow_keys *flow_keys; + const struct sk_buff *skb; + void *data; + void *data_end; +}; + #endif diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index e8f85cd2afce..21f434f3ac9e 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -364,13 +364,10 @@ struct tcf_proto { }; struct qdisc_skb_cb { - union { - struct { - unsigned int pkt_len; - u16 slave_dev_queue_mapping; - u16 tc_classid; - }; - struct bpf_flow_keys *flow_keys; + struct { + unsigned int pkt_len; + u16 slave_dev_queue_mapping; + u16 tc_classid; }; #define QDISC_CB_PRIV_LEN 20 unsigned char data[QDISC_CB_PRIV_LEN]; -- cgit v1.2.3 From 3cbf4ffba5eeec60f82868a5facc1962d8a44d00 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 22 Apr 2019 08:55:46 -0700 Subject: net: plumb network namespace into __skb_flow_dissect This new argument will be used in the next patches for the eth_get_headlen use case. eth_get_headlen calls flow dissector with only data (without skb) so there is currently no way to pull attached BPF flow dissector program. With this new argument, we can amend the callers to explicitly pass network namespace so we can use attached BPF program. Signed-off-by: Stanislav Fomichev Reviewed-by: Saeed Mahameed Signed-off-by: Daniel Borkmann --- include/linux/skbuff.h | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2b7b8228c5c3..b466fbface2e 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1284,7 +1284,8 @@ bool __skb_flow_bpf_dissect(struct bpf_prog *prog, const struct sk_buff *skb, struct flow_dissector *flow_dissector, struct bpf_flow_keys *flow_keys); -bool __skb_flow_dissect(const struct sk_buff *skb, +bool __skb_flow_dissect(const struct net *net, + const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, void *data, __be16 proto, int nhoff, int hlen, @@ -1294,8 +1295,8 @@ static inline bool skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, unsigned int flags) { - return __skb_flow_dissect(skb, flow_dissector, target_container, - NULL, 0, 0, 0, flags); + return __skb_flow_dissect(NULL, skb, flow_dissector, + target_container, NULL, 0, 0, 0, flags); } static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb, @@ -1303,18 +1304,19 @@ static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb, unsigned int flags) { memset(flow, 0, sizeof(*flow)); - return __skb_flow_dissect(skb, &flow_keys_dissector, flow, - NULL, 0, 0, 0, flags); + return __skb_flow_dissect(NULL, skb, &flow_keys_dissector, + flow, NULL, 0, 0, 0, flags); } static inline bool -skb_flow_dissect_flow_keys_basic(const struct sk_buff *skb, +skb_flow_dissect_flow_keys_basic(const struct net *net, + const struct sk_buff *skb, struct flow_keys_basic *flow, void *data, __be16 proto, int nhoff, int hlen, unsigned int flags) { memset(flow, 0, sizeof(*flow)); - return __skb_flow_dissect(skb, &flow_keys_basic_dissector, flow, + return __skb_flow_dissect(net, skb, &flow_keys_basic_dissector, flow, data, proto, nhoff, hlen, flags); } @@ -2492,7 +2494,8 @@ static inline void skb_probe_transport_header(struct sk_buff *skb) if (skb_transport_header_was_set(skb)) return; - if (skb_flow_dissect_flow_keys_basic(skb, &keys, NULL, 0, 0, 0, 0)) + if (skb_flow_dissect_flow_keys_basic(NULL, skb, &keys, + NULL, 0, 0, 0, 0)) skb_set_transport_header(skb, keys.control.thoff); } -- cgit v1.2.3 From 9b52e3f267a6835efd50ed9002d530666d16a411 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 22 Apr 2019 08:55:47 -0700 Subject: flow_dissector: handle no-skb use case When called without skb, gather all required data from the __skb_flow_dissect's arguments and use recently introduces no-skb mode of bpf flow dissector. Note: WARN_ON_ONCE(!net) will now trigger for eth_get_headlen users. Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- include/linux/skbuff.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b466fbface2e..998256c2820b 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1279,11 +1279,6 @@ struct bpf_flow_dissector; bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, __be16 proto, int nhoff, int hlen); -struct bpf_flow_keys; -bool __skb_flow_bpf_dissect(struct bpf_prog *prog, - const struct sk_buff *skb, - struct flow_dissector *flow_dissector, - struct bpf_flow_keys *flow_keys); bool __skb_flow_dissect(const struct net *net, const struct sk_buff *skb, struct flow_dissector *flow_dissector, -- cgit v1.2.3 From c43f1255b866b423d2381f77eaa2cbc64a9c49aa Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 22 Apr 2019 08:55:48 -0700 Subject: net: pass net_device argument to the eth_get_headlen Update all users of eth_get_headlen to pass network device, fetch network namespace from it and pass it down to the flow dissector. This commit is a noop until administrator inserts BPF flow dissector program. Cc: Maxim Krasnyansky Cc: Saeed Mahameed Cc: Jeff Kirsher Cc: intel-wired-lan@lists.osuosl.org Cc: Yisen Zhuang Cc: Salil Mehta Cc: Michael Chan Cc: Igor Russkikh Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- include/linux/etherdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index e2f3b21cd72a..c6c1930e28a0 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -33,7 +33,7 @@ struct device; int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr); unsigned char *arch_get_platform_mac_address(void); int nvmem_get_mac_address(struct device *dev, void *addrbuf); -u32 eth_get_headlen(void *data, unsigned int max_len); +u32 eth_get_headlen(const struct net_device *dev, void *data, unsigned int len); __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev); extern const struct header_ops eth_header_ops; -- cgit v1.2.3 From a93f7fe134543649cf2e2d8fc2c50a8f4d742915 Mon Sep 17 00:00:00 2001 From: Jian Shen Date: Mon, 22 Apr 2019 21:52:23 +0800 Subject: net: phy: marvell: add new default led configure for m88e151x The default m88e151x LED configuration is 0x1177, used LED[0] for 1000M link, LED[1] for 100M link, and LED[2] for active. But for some boards, which use LED[0] for link, and LED[1] for active, prefer to be 0x1040. To be compatible with this case, this patch defines a new dev_flag, and set it before connect phy in HNS3 driver. When phy initializing, using the new LED configuration if this dev_flag is set. Signed-off-by: Jian Shen Signed-off-by: Huazhong Tan Signed-off-by: David S. Miller --- include/linux/marvell_phy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/marvell_phy.h b/include/linux/marvell_phy.h index 73d04743a2bb..af6b11d4d673 100644 --- a/include/linux/marvell_phy.h +++ b/include/linux/marvell_phy.h @@ -34,5 +34,6 @@ /* struct phy_device dev_flags definitions */ #define MARVELL_PHY_M1145_FLAGS_RESISTANCE 0x00000001 #define MARVELL_PHY_M1118_DNS323_LEDS 0x00000002 +#define MARVELL_PHY_LED0_LINK_LED1_ACTIVE 0x00000004 #endif /* _MARVELL_PHY_H */ -- cgit v1.2.3 From c2273219baa5097a4d7c1c162b992623534f34c1 Mon Sep 17 00:00:00 2001 From: Shay Agroskin Date: Thu, 14 Mar 2019 14:54:07 +0200 Subject: net/mlx5e: XDP, Inline small packets into the TX MPWQE in XDP xmit flow Upon high packet rate with multiple CPUs TX workloads, much of the HCA's resources are spent on prefetching TX descriptors, thus affecting transmission rates. This patch comes to mitigate this problem by moving some workload to the CPU and reducing the HW data prefetch overhead for small packets (<= 256B). When forwarding packets with XDP, a packet that is smaller than a certain size (set to ~256 bytes) would be sent inline within its WQE TX descrptor (mem-copied), when the hardware tx queue is congested beyond a pre-defined water-mark. This is added to better utilize the HW resources (which now makes one less packet data prefetch) and allow better scalability, on the account of CPU usage (which now 'memcpy's the packet into the WQE). To load balance between HW and CPU and get max packet rate, we use watermarks to detect how much the HW is congested and move the work loads back and forth between HW and CPU. Performance: Tested packet rate for UDP 64Byte multi-stream over two dual port ConnectX-5 100Gbps NICs. CPU: Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz * Tested with hyper-threading disabled XDP_TX: | | before | after | | | 24 rings | 51Mpps | 116Mpps | +126% | | 1 ring | 12Mpps | 12Mpps | same | XDP_REDIRECT: ** Below is the transmit rate, not the redirection rate which might be larger, and is not affected by this patch. | | before | after | | | 32 rings | 64Mpps | 92Mpps | +43% | | 1 ring | 6.4Mpps | 6.4Mpps | same | As we can see, feature significantly improves scaling, without hurting single ring performance. Signed-off-by: Shay Agroskin Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- include/linux/mlx5/qp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 0343c81d4c5f..3ba4edbd17a6 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -395,6 +395,7 @@ struct mlx5_wqe_signature_seg { struct mlx5_wqe_inline_seg { __be32 byte_count; + __be32 data[0]; }; enum mlx5_sig_type { -- cgit v1.2.3 From f05713e0916ca46f127641b6afa74bd1a0772423 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 22 Apr 2019 18:35:03 -0700 Subject: ipv6: convert fib6_ref to refcount_t We suspect some issues involving fib6_ref 0 -> 1 transitions might cause strange syzbot reports. Lets convert fib6_ref to refcount_t to catch them earlier. Signed-off-by: Eric Dumazet Cc: Wei Wang Acked-by: Wei Wang Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 352f767bea81..5a4a67b38712 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -146,7 +146,7 @@ struct fib6_info { struct list_head fib6_siblings; unsigned int fib6_nsiblings; - atomic_t fib6_ref; + refcount_t fib6_ref; unsigned long expires; struct dst_metrics *fib6_metrics; #define fib6_pmtu fib6_metrics->metrics[RTAX_MTU-1] @@ -284,17 +284,17 @@ void fib6_info_destroy_rcu(struct rcu_head *head); static inline void fib6_info_hold(struct fib6_info *f6i) { - atomic_inc(&f6i->fib6_ref); + refcount_inc(&f6i->fib6_ref); } static inline bool fib6_info_hold_safe(struct fib6_info *f6i) { - return atomic_inc_not_zero(&f6i->fib6_ref); + return refcount_inc_not_zero(&f6i->fib6_ref); } static inline void fib6_info_release(struct fib6_info *f6i) { - if (f6i && atomic_dec_and_test(&f6i->fib6_ref)) + if (f6i && refcount_dec_and_test(&f6i->fib6_ref)) call_rcu(&f6i->rcu, fib6_info_destroy_rcu); } -- cgit v1.2.3 From ffa8ce54be3aaf6b15abae3bbd08282b867d3a4f Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 23 Apr 2019 08:23:41 -0700 Subject: lwtunnel: Pass encap and encap type attributes to lwtunnel_fill_encap Currently, lwtunnel_fill_encap hardcodes the encap and encap type attributes as RTA_ENCAP and RTA_ENCAP_TYPE, respectively. The nexthop objects want to re-use this code but the encap attributes passed to userspace as NHA_ENCAP and NHA_ENCAP_TYPE. Since that is the only difference, change lwtunnel_fill_encap to take the attribute type as an input. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/lwtunnel.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index 671113bcb2cc..5d6c5b1fc695 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -118,8 +118,8 @@ int lwtunnel_build_state(u16 encap_type, unsigned int family, const void *cfg, struct lwtunnel_state **lws, struct netlink_ext_ack *extack); -int lwtunnel_fill_encap(struct sk_buff *skb, - struct lwtunnel_state *lwtstate); +int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate, + int encap_attr, int encap_type_attr); int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate); struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len); int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b); @@ -219,7 +219,8 @@ static inline int lwtunnel_build_state(u16 encap_type, } static inline int lwtunnel_fill_encap(struct sk_buff *skb, - struct lwtunnel_state *lwtstate) + struct lwtunnel_state *lwtstate, + int encap_attr, int encap_type_attr) { return 0; } -- cgit v1.2.3 From ecc5663cce8c7d7e4eba32af4e1e3cab296c64b9 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 23 Apr 2019 08:48:09 -0700 Subject: net: Change nhc_flags to unsigned char nhc_flags holds the RTNH_F flags for a given nexthop (fib{6}_nh). All of the RTNH_F_ flags fit in an unsigned char, and since the API to userspace (rtnh_flags and lower byte of rtm_flags) is 1 byte it can not grow. Make nhc_flags in fib_nh_common an unsigned char and shrink the size of the struct by 8, from 56 to 48 bytes. Update the flags arguments for up netdevice events and fib_nexthop_info which determines the RTNH_F flags to return on a dump/event. The RTNH_F flags are passed in the lower byte of rtm_flags which is an unsigned int so use a temp variable for the flags to fib_nexthop_info and combine with rtm_flags in the caller. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_route.h | 2 +- include/net/ip_fib.h | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index df9cebc2b20c..4790beaa86e0 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -182,7 +182,7 @@ int rt6_dump_route(struct fib6_info *f6i, void *p_arg); void rt6_mtu_change(struct net_device *dev, unsigned int mtu); void rt6_remove_prefsrc(struct inet6_ifaddr *ifp); void rt6_clean_tohost(struct net *net, struct in6_addr *gateway); -void rt6_sync_up(struct net_device *dev, unsigned int nh_flags); +void rt6_sync_up(struct net_device *dev, unsigned char nh_flags); void rt6_disable_ip(struct net_device *dev, unsigned long event); void rt6_sync_down_dev(struct net_device *dev, unsigned long event); void rt6_multipath_rebalance(struct fib6_info *f6i); diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index d8195c77e247..772a9e61bd84 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -83,11 +83,11 @@ struct fnhe_hash_bucket { struct fib_nh_common { struct net_device *nhc_dev; int nhc_oif; - unsigned int nhc_flags; - struct lwtunnel_state *nhc_lwtstate; unsigned char nhc_scope; u8 nhc_family; u8 nhc_gw_family; + unsigned char nhc_flags; + struct lwtunnel_state *nhc_lwtstate; union { __be32 ipv4; @@ -425,7 +425,7 @@ int fib_unmerge(struct net *net); int ip_fib_check_default(__be32 gw, struct net_device *dev); int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force); int fib_sync_down_addr(struct net_device *dev, __be32 local); -int fib_sync_up(struct net_device *dev, unsigned int nh_flags); +int fib_sync_up(struct net_device *dev, unsigned char nh_flags); void fib_sync_mtu(struct net_device *dev, u32 orig_mtu); #ifdef CONFIG_IP_ROUTE_MULTIPATH @@ -500,7 +500,7 @@ int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, struct netlink_callback *cb); int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nh, - unsigned int *flags, bool skip_oif); + unsigned char *flags, bool skip_oif); int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nh, int nh_weight); #endif /* _NET_FIB_H */ -- cgit v1.2.3 From a65120bae4b7425a39c5783aa3d4fc29677eef0e Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 23 Apr 2019 18:05:33 -0700 Subject: ipv6: Use result arg in fib_lookup_arg consistently arg.result is sometimes used as fib6_result and sometimes used to hold the rt6_info. Add rt6_info to fib6_result and make the use of arg.result consistent through ipv6 rules. The rt6 entry is filled in for lookups returning a dst_entry, but not for direct fib_lookups that just want a fib6_info. Fixes: effda4dd97e8 ("ipv6: Pass fib6_result to fib lookups") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 5a4a67b38712..40105738e2f6 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -195,6 +195,7 @@ struct fib6_result { struct fib6_info *f6i; u32 fib6_flags; u8 fib6_type; + struct rt6_info *rt6; }; #define for_each_fib6_node_rt_rcu(fn) \ -- cgit v1.2.3 From 9fba2b9b4f1566213a4f0cec658479d8915086fa Mon Sep 17 00:00:00 2001 From: Ariel Levkovich Date: Sun, 31 Mar 2019 19:44:43 +0300 Subject: net/mlx5: Expose SW ICM related device memory capabilities Add SW ICM related fields to the device memory capabilities structure and sw ownership capability in flow table properties. The currently supported SW ICM types are steering and header modify and the changes exposes the device memory capabilities for each of these two types. SW ICM memory can be allocated by SW and then be accessed by RDMA operations for direct management of the HW packet handling tables. Signed-off-by: Ariel Levkovich Reviewed-by: Eli Cohen Reviewed-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 45 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 11e498442134..d96eb0916a44 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -80,6 +80,14 @@ enum { MLX5_SHARED_RESOURCE_UID = 0xffff, }; +enum { + MLX5_OBJ_TYPE_SW_ICM = 0x0008, +}; + +enum { + MLX5_GENERAL_OBJ_TYPES_CAP_SW_ICM = (1ULL << MLX5_OBJ_TYPE_SW_ICM), +}; + enum { MLX5_CMD_OP_QUERY_HCA_CAP = 0x100, MLX5_CMD_OP_QUERY_ADAPTER = 0x101, @@ -357,7 +365,8 @@ struct mlx5_ifc_flow_table_prop_layout_bits { u8 pop_vlan_2[0x1]; u8 push_vlan_2[0x1]; u8 reformat_and_vlan_action[0x1]; - u8 reserved_at_10[0x2]; + u8 reserved_at_10[0x1]; + u8 sw_owner[0x1]; u8 reformat_l3_tunnel_to_l2[0x1]; u8 reformat_l2_to_l3_tunnel[0x1]; u8 reformat_and_modify_action[0x1]; @@ -770,7 +779,19 @@ struct mlx5_ifc_device_mem_cap_bits { u8 max_memic_size[0x20]; - u8 reserved_at_c0[0x740]; + u8 steering_sw_icm_start_address[0x40]; + + u8 reserved_at_100[0x8]; + u8 log_header_modify_sw_icm_size[0x8]; + u8 reserved_at_110[0x2]; + u8 log_sw_icm_alloc_granularity[0x6]; + u8 log_steering_sw_icm_size[0x8]; + + u8 reserved_at_120[0x20]; + + u8 header_modify_sw_icm_start_address[0x40]; + + u8 reserved_at_180[0x680]; }; enum { @@ -919,6 +940,7 @@ enum { enum { MLX5_UCTX_CAP_RAW_TX = 1UL << 0, + MLX5_UCTX_CAP_INTERNAL_DEV_RES = 1UL << 1, }; struct mlx5_ifc_cmd_hca_cap_bits { @@ -2920,6 +2942,7 @@ enum { MLX5_MKC_ACCESS_MODE_MTT = 0x1, MLX5_MKC_ACCESS_MODE_KLMS = 0x2, MLX5_MKC_ACCESS_MODE_KSM = 0x3, + MLX5_MKC_ACCESS_MODE_SW_ICM = 0x4, MLX5_MKC_ACCESS_MODE_MEMIC = 0x5, }; @@ -9491,6 +9514,19 @@ struct mlx5_ifc_uctx_bits { u8 reserved_at_20[0x160]; }; +struct mlx5_ifc_sw_icm_bits { + u8 modify_field_select[0x40]; + + u8 reserved_at_40[0x18]; + u8 log_sw_icm_size[0x8]; + + u8 reserved_at_60[0x20]; + + u8 sw_icm_start_addr[0x40]; + + u8 reserved_at_c0[0x140]; +}; + struct mlx5_ifc_create_umem_in_bits { u8 opcode[0x10]; u8 uid[0x10]; @@ -9528,6 +9564,11 @@ struct mlx5_ifc_destroy_uctx_in_bits { u8 reserved_at_60[0x20]; }; +struct mlx5_ifc_create_sw_icm_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr; + struct mlx5_ifc_sw_icm_bits sw_icm; +}; + struct mlx5_ifc_mtrc_string_db_param_bits { u8 string_db_base_address[0x20]; -- cgit v1.2.3 From 3e07047021d36674d9051e76454e8b6a3b599036 Mon Sep 17 00:00:00 2001 From: Ariel Levkovich Date: Sun, 31 Mar 2019 19:44:48 +0300 Subject: net/mlx5: Expose TIR ICM address in command outbox Adding the TIR ICM address to the create_tir command outbox through which the device reports the ICM address of the newly created TIR. The TIR address can be used for direct attachment to a steering rule in SW managed steering mode. Signed-off-by: Ariel Levkovich Reviewed-by: Eli Cohen Reviewed-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index d96eb0916a44..4b37519bd6a5 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -6897,14 +6897,14 @@ struct mlx5_ifc_create_tis_in_bits { struct mlx5_ifc_create_tir_out_bits { u8 status[0x8]; - u8 reserved_at_8[0x18]; + u8 icm_address_63_40[0x18]; u8 syndrome[0x20]; - u8 reserved_at_40[0x8]; + u8 icm_address_39_32[0x8]; u8 tirn[0x18]; - u8 reserved_at_60[0x20]; + u8 icm_address_31_0[0x20]; }; struct mlx5_ifc_create_tir_in_bits { -- cgit v1.2.3 From 96780e4f46b2fc0fc5ae2b95957002e2c42b11d3 Mon Sep 17 00:00:00 2001 From: Ariel Levkovich Date: Sun, 31 Mar 2019 19:44:49 +0300 Subject: net/mlx5: Introduce new TIR creation core API Introducing new TIR creation core API which allows caller to receive back from the call the full command outbox. This comes as a preparation for the next patch that will retrieve the TIR ICM address from the command outbox. Signed-off-by: Ariel Levkovich Reviewed-by: Eli Cohen Reviewed-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- include/linux/mlx5/transobj.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/transobj.h b/include/linux/mlx5/transobj.h index a261d5528ff7..dc6b1e7cb8c4 100644 --- a/include/linux/mlx5/transobj.h +++ b/include/linux/mlx5/transobj.h @@ -50,6 +50,9 @@ int mlx5_core_query_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *out); int mlx5_core_query_sq_state(struct mlx5_core_dev *dev, u32 sqn, u8 *state); int mlx5_core_create_tir(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *tirn); +int mlx5_core_create_tir_out(struct mlx5_core_dev *dev, + u32 *in, int inlen, + u32 *out, int outlen); int mlx5_core_modify_tir(struct mlx5_core_dev *dev, u32 tirn, u32 *in, int inlen); void mlx5_core_destroy_tir(struct mlx5_core_dev *dev, u32 tirn); -- cgit v1.2.3 From d5bb334a8e171b262e48f378bd2096c0ea458265 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Wed, 24 Apr 2019 22:19:17 +0200 Subject: Bluetooth: Align minimum encryption key size for LE and BR/EDR connections The minimum encryption key size for LE connections is 56 bits and to align LE with BR/EDR, enforce 56 bits of minimum encryption key size for BR/EDR connections as well. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg Cc: stable@vger.kernel.org --- include/net/bluetooth/hci_core.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 094e61e07030..05b1b96f4d9e 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -190,6 +190,9 @@ struct adv_info { #define HCI_MAX_SHORT_NAME_LENGTH 10 +/* Min encryption key size to match with SMP */ +#define HCI_MIN_ENC_KEY_SIZE 7 + /* Default LE RPA expiry time, 15 minutes */ #define HCI_DEFAULT_RPA_TIMEOUT (15 * 60) -- cgit v1.2.3 From 118c8e9ae629d35fa9b3d27a7b9d59298b1b4183 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 25 Apr 2019 14:37:23 -0700 Subject: bpf: support BPF_PROG_QUERY for BPF_FLOW_DISSECTOR attach_type target_fd is target namespace. If there is a flow dissector BPF program attached to that namespace, its (single) id is returned. v5: * drop net ref right after rcu unlock (Daniel Borkmann) v4: * add missing put_net (Jann Horn) v3: * add missing inline to skb_flow_dissector_prog_query static def (kbuild test robot ) v2: * don't sleep in rcu critical section (Jakub Kicinski) * check input prog_cnt (exit early) Cc: Jann Horn Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- include/linux/skbuff.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 998256c2820b..6d58fa8a65fd 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1258,11 +1258,19 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, unsigned int key_count); #ifdef CONFIG_NET +int skb_flow_dissector_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr); int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog); int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr); #else +static inline int skb_flow_dissector_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return -EOPNOTSUPP; +} + static inline int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) { -- cgit v1.2.3 From 1d9373329bcbe0cfbb9af1738139292a9b10fe6a Mon Sep 17 00:00:00 2001 From: Shaul Triebitz Date: Fri, 15 Mar 2019 17:38:58 +0200 Subject: nl80211: increase NL80211_MAX_SUPP_REG_RULES The iwlwifi driver creates one rule per channel, thus it needs more rules than normal. To solve this, increase NL80211_MAX_SUPP_REG_RULES so iwlwifi can also fit UHB (ultra high band) channels. Signed-off-by: Shaul Triebitz Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index a99d75bef598..f00dbd82149e 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -11,7 +11,7 @@ * Copyright 2008 Jouni Malinen * Copyright 2008 Colin McCabe * Copyright 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018 Intel Corporation + * Copyright (C) 2018-2019 Intel Corporation * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -2809,7 +2809,7 @@ enum nl80211_attrs { #define NL80211_MAX_SUPP_RATES 32 #define NL80211_MAX_SUPP_HT_RATES 77 -#define NL80211_MAX_SUPP_REG_RULES 64 +#define NL80211_MAX_SUPP_REG_RULES 128 #define NL80211_TKIP_DATA_OFFSET_ENCR_KEY 0 #define NL80211_TKIP_DATA_OFFSET_TX_MIC_KEY 16 #define NL80211_TKIP_DATA_OFFSET_RX_MIC_KEY 24 -- cgit v1.2.3 From f7dacfb11475ba777e1e84ccec2e14b0ba5a17a3 Mon Sep 17 00:00:00 2001 From: Sara Sharon Date: Fri, 15 Mar 2019 17:39:03 +0200 Subject: cfg80211: support non-inheritance element Subelement profile may specify element IDs it doesn't inherit from the management frame. Support it. Signed-off-by: Sara Sharon Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 1 + include/net/cfg80211.h | 8 ++++++++ 2 files changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 48703ec60d06..522881f31938 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2487,6 +2487,7 @@ enum ieee80211_eid_ext { WLAN_EID_EXT_HE_MU_EDCA = 38, WLAN_EID_EXT_MAX_CHANNEL_SWITCH_TIME = 52, WLAN_EID_EXT_MULTIPLE_BSSID_CONFIGURATION = 55, + WLAN_EID_EXT_NON_INHERITANCE = 56, }; /* Action category code */ diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 70432fd638af..777c4f021610 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5534,6 +5534,14 @@ static inline void cfg80211_gen_new_bssid(const u8 *bssid, u8 max_bssid, u64_to_ether_addr(new_bssid_u64, new_bssid); } +/** + * cfg80211_is_element_inherited - returns if element ID should be inherited + * @element: element to check + * @non_inherit_element: non inheritance element + */ +bool cfg80211_is_element_inherited(const struct element *element, + const struct element *non_inherit_element); + /** * enum cfg80211_bss_frame_type - frame type that the BSS data came from * @CFG80211_BSS_FTYPE_UNKNOWN: driver doesn't know whether the data is -- cgit v1.2.3 From fe806e4992c9047affd263bcc13b2c047029a726 Mon Sep 17 00:00:00 2001 From: Sara Sharon Date: Fri, 15 Mar 2019 17:39:05 +0200 Subject: cfg80211: support profile split between elements Since an element is limited to 255 octets, a profile may be split split to several elements. Support the split as defined in the 11ax draft 3. Detect legacy split and print a net-rate limited warning, since there is no ROI in supporting this probably non-existent split. Signed-off-by: Sara Sharon Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 777c4f021610..f6665f8eba5a 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5542,6 +5542,20 @@ static inline void cfg80211_gen_new_bssid(const u8 *bssid, u8 max_bssid, bool cfg80211_is_element_inherited(const struct element *element, const struct element *non_inherit_element); +/** + * cfg80211_merge_profile - merges a MBSSID profile if it is split between IEs + * @ie: ies + * @ielen: length of IEs + * @mbssid_elem: current MBSSID element + * @sub_elem: current MBSSID subelement (profile) + * @merged_ie: location of the merged profile + * @max_copy_len: max merged profile length + */ +size_t cfg80211_merge_profile(const u8 *ie, size_t ielen, + const struct element *mbssid_elem, + const struct element *sub_elem, + u8 **merged_ie, size_t max_copy_len); + /** * enum cfg80211_bss_frame_type - frame type that the BSS data came from * @CFG80211_BSS_FTYPE_UNKNOWN: driver doesn't know whether the data is -- cgit v1.2.3 From abaea61c79ea7a03fde7db5b48414143546b07c4 Mon Sep 17 00:00:00 2001 From: Liad Kaufman Date: Fri, 15 Mar 2019 17:39:07 +0200 Subject: ieee80211: update HE IEs to D4.0 spec Update the out-dated comments as well, and have them point to the correct sections in the D4.0 spec. Signed-off-by: Liad Kaufman Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 522881f31938..61f0a316c6ac 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1557,7 +1557,7 @@ struct ieee80211_vht_operation { * struct ieee80211_he_cap_elem - HE capabilities element * * This structure is the "HE capabilities element" fixed fields as - * described in P802.11ax_D3.0 section 9.4.2.237.2 and 9.4.2.237.3 + * described in P802.11ax_D4.0 section 9.4.2.242.2 and 9.4.2.242.3 */ struct ieee80211_he_cap_elem { u8 mac_cap_info[6]; @@ -1619,12 +1619,12 @@ struct ieee80211_he_mcs_nss_supp { * struct ieee80211_he_operation - HE capabilities element * * This structure is the "HE operation element" fields as - * described in P802.11ax_D3.0 section 9.4.2.238 + * described in P802.11ax_D4.0 section 9.4.2.243 */ struct ieee80211_he_operation { __le32 he_oper_params; __le16 he_mcs_nss_set; - /* Optional 0,1,3 or 4 bytes: depends on @he_oper_params */ + /* Optional 0,1,3,4,5,7 or 8 bytes: depends on @he_oper_params */ u8 optional[0]; } __packed; @@ -1632,7 +1632,7 @@ struct ieee80211_he_operation { * struct ieee80211_he_mu_edca_param_ac_rec - MU AC Parameter Record field * * This structure is the "MU AC Parameter Record" fields as - * described in P802.11ax_D2.0 section 9.4.2.240 + * described in P802.11ax_D4.0 section 9.4.2.245 */ struct ieee80211_he_mu_edca_param_ac_rec { u8 aifsn; @@ -1644,7 +1644,7 @@ struct ieee80211_he_mu_edca_param_ac_rec { * struct ieee80211_mu_edca_param_set - MU EDCA Parameter Set element * * This structure is the "MU EDCA Parameter Set element" fields as - * described in P802.11ax_D2.0 section 9.4.2.240 + * described in P802.11ax_D4.0 section 9.4.2.245 */ struct ieee80211_mu_edca_param_set { u8 mu_qos_info; @@ -2026,6 +2026,7 @@ ieee80211_he_ppe_size(u8 ppe_thres_hdr, const u8 *phy_cap_info) #define IEEE80211_HE_OPERATION_VHT_OPER_INFO 0x00004000 #define IEEE80211_HE_OPERATION_CO_HOSTED_BSS 0x00008000 #define IEEE80211_HE_OPERATION_ER_SU_DISABLE 0x00010000 +#define IEEE80211_HE_OPERATION_6GHZ_OP_INFO 0x00020000 #define IEEE80211_HE_OPERATION_BSS_COLOR_MASK 0x3f000000 #define IEEE80211_HE_OPERATION_BSS_COLOR_OFFSET 24 #define IEEE80211_HE_OPERATION_PARTIAL_BSS_COLOR 0x40000000 @@ -2056,6 +2057,8 @@ ieee80211_he_oper_size(const u8 *he_oper_ie) oper_len += 3; if (he_oper_params & IEEE80211_HE_OPERATION_CO_HOSTED_BSS) oper_len++; + if (he_oper_params & IEEE80211_HE_OPERATION_6GHZ_OP_INFO) + oper_len += 4; /* Add the first byte (extension ID) to the total length */ oper_len++; -- cgit v1.2.3 From f2af2df800d3648b1d68e02d5b8a5d77cfee8970 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sat, 16 Mar 2019 18:06:32 +0100 Subject: mac80211: calculate hash for fq without holding fq->lock in itxq enqueue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reduces lock contention on enqueue/dequeue of iTXQ packets Signed-off-by: Felix Fietkau Acked-by: Toke Høiland-Jørgensen Signed-off-by: Johannes Berg --- include/net/fq_impl.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/fq_impl.h b/include/net/fq_impl.h index be7c0fab3478..2caa86660ab0 100644 --- a/include/net/fq_impl.h +++ b/include/net/fq_impl.h @@ -107,21 +107,23 @@ begin: return skb; } +static u32 fq_flow_idx(struct fq *fq, struct sk_buff *skb) +{ + u32 hash = skb_get_hash_perturb(skb, fq->perturbation); + + return reciprocal_scale(hash, fq->flows_cnt); +} + static struct fq_flow *fq_flow_classify(struct fq *fq, - struct fq_tin *tin, + struct fq_tin *tin, u32 idx, struct sk_buff *skb, fq_flow_get_default_t get_default_func) { struct fq_flow *flow; - u32 hash; - u32 idx; lockdep_assert_held(&fq->lock); - hash = skb_get_hash_perturb(skb, fq->perturbation); - idx = reciprocal_scale(hash, fq->flows_cnt); flow = &fq->flows[idx]; - if (flow->tin && flow->tin != tin) { flow = get_default_func(fq, tin, idx, skb); tin->collisions++; @@ -153,7 +155,7 @@ static void fq_recalc_backlog(struct fq *fq, } static void fq_tin_enqueue(struct fq *fq, - struct fq_tin *tin, + struct fq_tin *tin, u32 idx, struct sk_buff *skb, fq_skb_free_t free_func, fq_flow_get_default_t get_default_func) @@ -163,7 +165,7 @@ static void fq_tin_enqueue(struct fq *fq, lockdep_assert_held(&fq->lock); - flow = fq_flow_classify(fq, tin, skb, get_default_func); + flow = fq_flow_classify(fq, tin, idx, skb, get_default_func); flow->tin = tin; flow->backlog += skb->len; -- cgit v1.2.3 From 6cdd3979a2bdc16116c5b2eb09475abf54ba9e70 Mon Sep 17 00:00:00 2001 From: Alexander Wetzel Date: Tue, 19 Mar 2019 21:34:07 +0100 Subject: nl80211/cfg80211: Extended Key ID support Add support for IEEE 802.11-2016 "Extended Key ID for Individually Addressed Frames". Extend cfg80211 and nl80211 to allow pairwise keys to be installed for Rx only, enable Tx separately and allow Key ID 1 for pairwise keys. Signed-off-by: Alexander Wetzel [use NLA_POLICY_RANGE() for NL80211_KEY_MODE] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ include/uapi/linux/nl80211.h | 28 ++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index f6665f8eba5a..2b039802ae2e 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -485,6 +485,7 @@ struct vif_params { * with the get_key() callback, must be in little endian, * length given by @seq_len. * @seq_len: length of @seq. + * @mode: key install mode (RX_TX, NO_TX or SET_TX) */ struct key_params { const u8 *key; @@ -492,6 +493,7 @@ struct key_params { int key_len; int seq_len; u32 cipher; + enum nl80211_key_mode mode; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index f00dbd82149e..e75615bf4453 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4152,6 +4152,27 @@ enum nl80211_channel_type { NL80211_CHAN_HT40PLUS }; +/** + * enum nl80211_key_mode - Key mode + * + * @NL80211_KEY_RX_TX: (Default) + * Key can be used for Rx and Tx immediately + * + * The following modes can only be selected for unicast keys and when the + * driver supports @NL80211_EXT_FEATURE_EXT_KEY_ID: + * + * @NL80211_KEY_NO_TX: Only allowed in combination with @NL80211_CMD_NEW_KEY: + * Unicast key can only be used for Rx, Tx not allowed, yet + * @NL80211_KEY_SET_TX: Only allowed in combination with @NL80211_CMD_SET_KEY: + * The unicast key identified by idx and mac is cleared for Tx and becomes + * the preferred Tx key for the station. + */ +enum nl80211_key_mode { + NL80211_KEY_RX_TX, + NL80211_KEY_NO_TX, + NL80211_KEY_SET_TX +}; + /** * enum nl80211_chan_width - channel width definitions * @@ -4395,6 +4416,9 @@ enum nl80211_key_default_types { * @NL80211_KEY_DEFAULT_TYPES: A nested attribute containing flags * attributes, specifying what a key should be set as default as. * See &enum nl80211_key_default_types. + * @NL80211_KEY_MODE: the mode from enum nl80211_key_mode. + * Defaults to @NL80211_KEY_RX_TX. + * * @__NL80211_KEY_AFTER_LAST: internal * @NL80211_KEY_MAX: highest key attribute */ @@ -4408,6 +4432,7 @@ enum nl80211_key_attributes { NL80211_KEY_DEFAULT_MGMT, NL80211_KEY_TYPE, NL80211_KEY_DEFAULT_TYPES, + NL80211_KEY_MODE, /* keep last */ __NL80211_KEY_AFTER_LAST, @@ -5353,6 +5378,8 @@ enum nl80211_feature_flags { * able to rekey an in-use key correctly. Userspace must not rekey PTK keys * if this flag is not set. Ignoring this can leak clear text packets and/or * freeze the connection. + * @NL80211_EXT_FEATURE_EXT_KEY_ID: Driver supports "Extended Key ID for + * Individually Addressed Frames" from IEEE802.11-2016. * * @NL80211_EXT_FEATURE_AIRTIME_FAIRNESS: Driver supports getting airtime * fairness for transmitted packets and has enabled airtime fairness @@ -5406,6 +5433,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_AIRTIME_FAIRNESS, NL80211_EXT_FEATURE_AP_PMKSA_CACHING, NL80211_EXT_FEATURE_SCHED_SCAN_BAND_SPECIFIC_RSSI_THOLD, + NL80211_EXT_FEATURE_EXT_KEY_ID, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, -- cgit v1.2.3 From 96fc6efb9ad9d0cd8cbb4462f0eb2a07092649e6 Mon Sep 17 00:00:00 2001 From: Alexander Wetzel Date: Tue, 19 Mar 2019 21:34:08 +0100 Subject: mac80211: IEEE 802.11 Extended Key ID support Add support for Extended Key ID as defined in IEEE 802.11-2016. - Implement the nl80211 API for Extended Key ID - Extend mac80211 API to allow drivers to support Extended Key ID - Enable Extended Key ID by default for drivers only supporting SW crypto (e.g. mac80211_hwsim) - Allow unicast Tx usage to be supressed (IEEE80211_KEY_FLAG_NO_AUTO_TX) - Select the decryption key based on the MPDU keyid - Enforce existing assumptions in the code that rekeys don't change the cipher Signed-off-by: Alexander Wetzel [remove module parameter] Signed-off-by: Johannes Berg --- include/net/mac80211.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index ac2ed8ec662b..c10abca55fde 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1697,6 +1697,7 @@ struct wireless_dev *ieee80211_vif_to_wdev(struct ieee80211_vif *vif); * @IEEE80211_KEY_FLAG_PUT_MIC_SPACE: This flag should be set by the driver for * a TKIP key if it only requires MIC space. Do not set together with * @IEEE80211_KEY_FLAG_GENERATE_MMIC on the same key. + * @IEEE80211_KEY_FLAG_NO_AUTO_TX: Key needs explicit Tx activation. */ enum ieee80211_key_flags { IEEE80211_KEY_FLAG_GENERATE_IV_MGMT = BIT(0), @@ -1708,6 +1709,7 @@ enum ieee80211_key_flags { IEEE80211_KEY_FLAG_RX_MGMT = BIT(6), IEEE80211_KEY_FLAG_RESERVE_TAILROOM = BIT(7), IEEE80211_KEY_FLAG_PUT_MIC_SPACE = BIT(8), + IEEE80211_KEY_FLAG_NO_AUTO_TX = BIT(9), }; /** @@ -2243,6 +2245,9 @@ struct ieee80211_txq { * @IEEE80211_HW_SUPPORTS_ONLY_HE_MULTI_BSSID: Hardware supports multi BSSID * only for HE APs. Applies if @IEEE80211_HW_SUPPORTS_MULTI_BSSID is set. * + * @IEEE80211_HW_EXT_KEY_ID_NATIVE: Driver and hardware are supporting Extended + * Key ID and can handle two unicast keys per station for Rx and Tx. + * * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays */ enum ieee80211_hw_flags { @@ -2294,6 +2299,7 @@ enum ieee80211_hw_flags { IEEE80211_HW_TX_STATUS_NO_AMPDU_LEN, IEEE80211_HW_SUPPORTS_MULTI_BSSID, IEEE80211_HW_SUPPORTS_ONLY_HE_MULTI_BSSID, + IEEE80211_HW_EXT_KEY_ID_NATIVE, /* keep last, obviously */ NUM_IEEE80211_HW_FLAGS -- cgit v1.2.3 From e96d1cd2635c05efdd01b4eafcfc50c22c40751f Mon Sep 17 00:00:00 2001 From: Ashok Raj Nagarajan Date: Fri, 29 Mar 2019 16:18:21 +0530 Subject: cfg80211: Add support to set tx power for a station associated This patch adds support to set transmit power setting type and transmit power level attributes to NL80211_CMD_SET_STATION in order to facilitate adjusting the transmit power level of a station associated to the AP. The added attributes allow selection of automatic and limited transmit power level, with the level defined in dBm format. Co-developed-by: Balaji Pothunoori Signed-off-by: Ashok Raj Nagarajan Signed-off-by: Balaji Pothunoori Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 22 ++++++++++++++++++++++ include/uapi/linux/nl80211.h | 15 +++++++++++++++ 2 files changed, 37 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 2b039802ae2e..2ea04e94b522 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -975,6 +975,27 @@ enum station_parameters_apply_mask { STATION_PARAM_APPLY_UAPSD = BIT(0), STATION_PARAM_APPLY_CAPABILITY = BIT(1), STATION_PARAM_APPLY_PLINK_STATE = BIT(2), + STATION_PARAM_APPLY_STA_TXPOWER = BIT(3), +}; + +/** + * struct sta_txpwr - station txpower configuration + * + * Used to configure txpower for station. + * + * @power: tx power (in dBm) to be used for sending data traffic. If tx power + * is not provided, the default per-interface tx power setting will be + * overriding. Driver should be picking up the lowest tx power, either tx + * power per-interface or per-station. + * @type: In particular if TPC %type is NL80211_TX_POWER_LIMITED then tx power + * will be less than or equal to specified from userspace, whereas if TPC + * %type is NL80211_TX_POWER_AUTOMATIC then it indicates default tx power. + * NL80211_TX_POWER_FIXED is not a valid configuration option for + * per peer TPC. + */ +struct sta_txpwr { + s16 power; + enum nl80211_tx_power_setting type; }; /** @@ -1049,6 +1070,7 @@ struct station_parameters { const struct ieee80211_he_cap_elem *he_capa; u8 he_capa_len; u16 airtime_weight; + struct sta_txpwr txpwr; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index e75615bf4453..25f70dd2b583 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2315,6 +2315,15 @@ enum nl80211_commands { * @NL80211_ATTR_AIRTIME_WEIGHT: Station's weight when scheduled by the airtime * scheduler. * + * @NL80211_ATTR_STA_TX_POWER_SETTING: Transmit power setting type (u8) for + * station associated with the AP. See &enum nl80211_tx_power_setting for + * possible values. + * @NL80211_ATTR_STA_TX_POWER: Transmit power level (s16) in dBm units. This + * allows to set Tx power for a station. If this attribute is not included, + * the default per-interface tx power setting will be overriding. Driver + * should be picking up the lowest tx power, either tx power per-interface + * or per-station. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -2765,6 +2774,8 @@ enum nl80211_attrs { NL80211_ATTR_PEER_MEASUREMENTS, NL80211_ATTR_AIRTIME_WEIGHT, + NL80211_ATTR_STA_TX_POWER_SETTING, + NL80211_ATTR_STA_TX_POWER, /* add attributes here, update the policy in nl80211.c */ @@ -5391,6 +5402,9 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_SCHED_SCAN_BAND_SPECIFIC_RSSI_THOLD: Driver supports * filtering of sched scan results using band specific RSSI thresholds. * + * @NL80211_EXT_FEATURE_STA_TX_PWR: This driver supports controlling tx power + * to a station. + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -5434,6 +5448,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_AP_PMKSA_CACHING, NL80211_EXT_FEATURE_SCHED_SCAN_BAND_SPECIFIC_RSSI_THOLD, NL80211_EXT_FEATURE_EXT_KEY_ID, + NL80211_EXT_FEATURE_STA_TX_PWR, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, -- cgit v1.2.3 From ba905bf432f662cb907fd692a4f160e612c0408b Mon Sep 17 00:00:00 2001 From: Ashok Raj Nagarajan Date: Fri, 29 Mar 2019 16:19:09 +0530 Subject: mac80211: store tx power value from user to station This patch introduce a new driver callback drv_sta_set_txpwr. This API will copy the transmit power value passed from user space and call the driver callback to set the tx power for the station. Co-developed-by: Balaji Pothunoori Signed-off-by: Ashok Raj Nagarajan Signed-off-by: Balaji Pothunoori Signed-off-by: Johannes Berg --- include/net/mac80211.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index c10abca55fde..d66fbfe8d55d 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1889,6 +1889,24 @@ struct ieee80211_sta_rates { } rate[IEEE80211_TX_RATE_TABLE_SIZE]; }; +/** + * struct ieee80211_sta_txpwr - station txpower configuration + * + * Used to configure txpower for station. + * + * @power: indicates the tx power, in dBm, to be used when sending data frames + * to the STA. + * @type: In particular if TPC %type is NL80211_TX_POWER_LIMITED then tx power + * will be less than or equal to specified from userspace, whereas if TPC + * %type is NL80211_TX_POWER_AUTOMATIC then it indicates default tx power. + * NL80211_TX_POWER_FIXED is not a valid configuration option for + * per peer TPC. + */ +struct ieee80211_sta_txpwr { + s16 power; + enum nl80211_tx_power_setting type; +}; + /** * struct ieee80211_sta - station table entry * @@ -1975,6 +1993,7 @@ struct ieee80211_sta { bool support_p2p_ps; u16 max_rc_amsdu_len; u16 max_tid_amsdu_len[IEEE80211_NUM_TIDS]; + struct ieee80211_sta_txpwr txpwr; struct ieee80211_txq *txq[IEEE80211_NUM_TIDS + 1]; @@ -3800,6 +3819,9 @@ struct ieee80211_ops { #endif void (*sta_notify)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, enum sta_notify_cmd, struct ieee80211_sta *sta); + int (*sta_set_txpwr)(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_sta *sta); int (*sta_state)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, enum ieee80211_sta_state old_state, -- cgit v1.2.3 From 5809a5d54bb9eda3a388b5a712657970c2cb9f8e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 11 Apr 2019 11:59:50 +0300 Subject: cfg80211: don't pass pointer to pointer unnecessarily The cfg80211_merge_profile() and ieee802_11_find_bssid_profile() are a bit cleaner if we just pass the merged_ie pointer instead of a pointer to the pointer. This isn't a functional change, it's just a clean up. Signed-off-by: Dan Carpenter Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 2ea04e94b522..944de1802210 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5578,7 +5578,7 @@ bool cfg80211_is_element_inherited(const struct element *element, size_t cfg80211_merge_profile(const u8 *ie, size_t ielen, const struct element *mbssid_elem, const struct element *sub_elem, - u8 **merged_ie, size_t max_copy_len); + u8 *merged_ie, size_t max_copy_len); /** * enum cfg80211_bss_frame_type - frame type that the BSS data came from -- cgit v1.2.3 From 5ab92e7fe49ad74293b50fb9e6f25be5521e2f68 Mon Sep 17 00:00:00 2001 From: Rajkumar Manoharan Date: Thu, 11 Apr 2019 13:47:24 -0700 Subject: cfg80211: add support to probe unexercised mesh link Adding support to allow mesh HWMP to measure link metrics on unexercised direct mesh path by sending some data frames to other mesh points which are not currently selected as a primary traffic path but only 1 hop away. The absence of the primary path to the chosen node makes it necessary to apply some form of marking on a chosen packet stream so that the packets can be properly steered to the selected node for testing, and not by the regular mesh path lookup. Tested-by: Pradeep Kumar Chitrapu Signed-off-by: Rajkumar Manoharan Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 5 +++++ include/uapi/linux/nl80211.h | 17 +++++++++++++++++ 2 files changed, 22 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 944de1802210..298301525f9f 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3501,6 +3501,9 @@ struct cfg80211_update_owe_info { * @update_owe_info: Provide updated OWE info to driver. Driver implementing SME * but offloading OWE processing to the user space will get the updated * DH IE through this interface. + * + * @probe_mesh_link: Probe direct Mesh peer's link quality by sending data frame + * and overrule HWMP path selection algorithm. */ struct cfg80211_ops { int (*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow); @@ -3817,6 +3820,8 @@ struct cfg80211_ops { struct cfg80211_pmsr_request *request); int (*update_owe_info)(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_update_owe_info *owe_info); + int (*probe_mesh_link)(struct wiphy *wiphy, struct net_device *dev, + const u8 *buf, size_t len); }; /* diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 25f70dd2b583..6f09d1500960 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1070,6 +1070,21 @@ * OWE AKM by the host drivers that implement SME but rely * on the user space for the cryptographic/DH IE processing in AP mode. * + * @NL80211_CMD_PROBE_MESH_LINK: The requirement for mesh link metric + * refreshing, is that from one mesh point we be able to send some data + * frames to other mesh points which are not currently selected as a + * primary traffic path, but which are only 1 hop away. The absence of + * the primary path to the chosen node makes it necessary to apply some + * form of marking on a chosen packet stream so that the packets can be + * properly steered to the selected node for testing, and not by the + * regular mesh path lookup. Further, the packets must be of type data + * so that the rate control (often embedded in firmware) is used for + * rate selection. + * + * Here attribute %NL80211_ATTR_MAC is used to specify connected mesh + * peer MAC address and %NL80211_ATTR_FRAME is used to specify the frame + * content. The frame is ethernet data. + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -1292,6 +1307,8 @@ enum nl80211_commands { NL80211_CMD_UPDATE_OWE_INFO, + NL80211_CMD_PROBE_MESH_LINK, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ -- cgit v1.2.3 From 8828f81ad4a2f4e89ebe6e7793c06ed767c31d53 Mon Sep 17 00:00:00 2001 From: Rajkumar Manoharan Date: Thu, 11 Apr 2019 13:47:26 -0700 Subject: mac80211: probe unexercised mesh links The requirement for mesh link metric refreshing, is that from one mesh point we be able to send some data frames to other mesh points which are not currently selected as a primary traffic path, but which are only 1 hop away. The absence of the primary path to the chosen node makes it necessary to apply some form of marking on a chosen packet stream so that the packets can be properly steered to the selected node for testing, and not by the regular mesh path lookup. Tested-by: Pradeep Kumar Chitrapu Signed-off-by: Rajkumar Manoharan Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index d66fbfe8d55d..76a443f32fc8 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -807,6 +807,7 @@ enum mac80211_tx_info_flags { * @IEEE80211_TX_CTRL_RATE_INJECT: This frame is injected with rate information * @IEEE80211_TX_CTRL_AMSDU: This frame is an A-MSDU frame * @IEEE80211_TX_CTRL_FAST_XMIT: This frame is going through the fast_xmit path + * @IEEE80211_TX_CTRL_SKIP_MPATH_LOOKUP: This frame skips mesh path lookup * * These flags are used in tx_info->control.flags. */ @@ -816,6 +817,7 @@ enum mac80211_tx_control_flags { IEEE80211_TX_CTRL_RATE_INJECT = BIT(2), IEEE80211_TX_CTRL_AMSDU = BIT(3), IEEE80211_TX_CTRL_FAST_XMIT = BIT(4), + IEEE80211_TX_CTRL_SKIP_MPATH_LOOKUP = BIT(5), }; /* -- cgit v1.2.3 From 9df1c28bb75217b244257152ab7d788bb2a386d0 Mon Sep 17 00:00:00 2001 From: Matt Mullins Date: Fri, 26 Apr 2019 11:49:47 -0700 Subject: bpf: add writable context for raw tracepoints This is an opt-in interface that allows a tracepoint to provide a safe buffer that can be written from a BPF_PROG_TYPE_RAW_TRACEPOINT program. The size of the buffer must be a compile-time constant, and is checked before allowing a BPF program to attach to a tracepoint that uses this feature. The pointer to this buffer will be the first argument of tracepoints that opt in; the pointer is valid and can be bpf_probe_read() by both BPF_PROG_TYPE_RAW_TRACEPOINT and BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE programs that attach to such a tracepoint, but the buffer to which it points may only be written by the latter. Signed-off-by: Matt Mullins Acked-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ include/linux/bpf_types.h | 1 + include/linux/tracepoint-defs.h | 1 + include/trace/bpf_probe.h | 27 +++++++++++++++++++++++++-- include/uapi/linux/bpf.h | 1 + 5 files changed, 30 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f15432d90728..cd6341eabd74 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -272,6 +272,7 @@ enum bpf_reg_type { PTR_TO_SOCK_COMMON_OR_NULL, /* reg points to sock_common or NULL */ PTR_TO_TCP_SOCK, /* reg points to struct tcp_sock */ PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct tcp_sock or NULL */ + PTR_TO_TP_BUFFER, /* reg points to a writable raw tp's buffer */ }; /* The information passed from prog-specific *_is_valid_access @@ -361,6 +362,7 @@ struct bpf_prog_aux { u32 used_map_cnt; u32 max_ctx_offset; u32 max_pkt_offset; + u32 max_tp_access; u32 stack_depth; u32 id; u32 func_cnt; /* used by non-func prog as the number of func progs */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index d26991a16894..a10d37bce364 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -25,6 +25,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe) BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint) BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event) BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT, raw_tracepoint) +BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, raw_tracepoint_writable) #endif #ifdef CONFIG_CGROUP_BPF BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev) diff --git a/include/linux/tracepoint-defs.h b/include/linux/tracepoint-defs.h index 49ba9cde7e4b..b29950a19205 100644 --- a/include/linux/tracepoint-defs.h +++ b/include/linux/tracepoint-defs.h @@ -45,6 +45,7 @@ struct bpf_raw_event_map { struct tracepoint *tp; void *bpf_func; u32 num_args; + u32 writable_size; } __aligned(32); #endif diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h index 505dae0bed80..d6e556c0a085 100644 --- a/include/trace/bpf_probe.h +++ b/include/trace/bpf_probe.h @@ -69,8 +69,7 @@ __bpf_trace_##call(void *__data, proto) \ * to make sure that if the tracepoint handling changes, the * bpf probe will fail to compile unless it too is updated. */ -#undef DEFINE_EVENT -#define DEFINE_EVENT(template, call, proto, args) \ +#define __DEFINE_EVENT(template, call, proto, args, size) \ static inline void bpf_test_probe_##call(void) \ { \ check_trace_callback_type_##call(__bpf_trace_##template); \ @@ -81,12 +80,36 @@ __bpf_trace_tp_map_##call = { \ .tp = &__tracepoint_##call, \ .bpf_func = (void *)__bpf_trace_##template, \ .num_args = COUNT_ARGS(args), \ + .writable_size = size, \ }; +#define FIRST(x, ...) x + +#undef DEFINE_EVENT_WRITABLE +#define DEFINE_EVENT_WRITABLE(template, call, proto, args, size) \ +static inline void bpf_test_buffer_##call(void) \ +{ \ + /* BUILD_BUG_ON() is ignored if the code is completely eliminated, but \ + * BUILD_BUG_ON_ZERO() uses a different mechanism that is not \ + * dead-code-eliminated. \ + */ \ + FIRST(proto); \ + (void)BUILD_BUG_ON_ZERO(size != sizeof(*FIRST(args))); \ +} \ +__DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args), size) + +#undef DEFINE_EVENT +#define DEFINE_EVENT(template, call, proto, args) \ + __DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args), 0) #undef DEFINE_EVENT_PRINT #define DEFINE_EVENT_PRINT(template, name, proto, args, print) \ DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args)) #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) + +#undef DEFINE_EVENT_WRITABLE +#undef __DEFINE_EVENT +#undef FIRST + #endif /* CONFIG_BPF_EVENTS */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index eaf2d3284248..f7fa7a34a62d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -168,6 +168,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_SK_REUSEPORT, BPF_PROG_TYPE_FLOW_DISSECTOR, BPF_PROG_TYPE_CGROUP_SYSCTL, + BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, }; enum bpf_attach_type { -- cgit v1.2.3 From ea106722c76f08002b69a6983ed84dc18958ba48 Mon Sep 17 00:00:00 2001 From: Matt Mullins Date: Fri, 26 Apr 2019 11:49:48 -0700 Subject: nbd: trace sending nbd requests This adds a tracepoint that can both observe the nbd request being sent to the server, as well as modify that request , e.g., setting a flag in the request that will cause the server to collect detailed tracing data. The struct request * being handled is included to permit correlation with the block tracepoints. Signed-off-by: Matt Mullins Reviewed-by: Josef Bacik Signed-off-by: Alexei Starovoitov --- include/trace/events/nbd.h | 56 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 include/trace/events/nbd.h (limited to 'include') diff --git a/include/trace/events/nbd.h b/include/trace/events/nbd.h new file mode 100644 index 000000000000..5928255ed02e --- /dev/null +++ b/include/trace/events/nbd.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM nbd + +#if !defined(_TRACE_NBD_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_NBD_H + +#include + +DECLARE_EVENT_CLASS(nbd_send_request, + + TP_PROTO(struct nbd_request *nbd_request, int index, + struct request *rq), + + TP_ARGS(nbd_request, index, rq), + + TP_STRUCT__entry( + __field(struct nbd_request *, nbd_request) + __field(u64, dev_index) + __field(struct request *, request) + ), + + TP_fast_assign( + __entry->nbd_request = 0; + __entry->dev_index = index; + __entry->request = rq; + ), + + TP_printk("nbd%lld: request %p", __entry->dev_index, __entry->request) +); + +#ifdef DEFINE_EVENT_WRITABLE +#undef NBD_DEFINE_EVENT +#define NBD_DEFINE_EVENT(template, call, proto, args, size) \ + DEFINE_EVENT_WRITABLE(template, call, PARAMS(proto), \ + PARAMS(args), size) +#else +#undef NBD_DEFINE_EVENT +#define NBD_DEFINE_EVENT(template, call, proto, args, size) \ + DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args)) +#endif + +NBD_DEFINE_EVENT(nbd_send_request, nbd_send_request, + + TP_PROTO(struct nbd_request *nbd_request, int index, + struct request *rq), + + TP_ARGS(nbd_request, index, rq), + + sizeof(struct nbd_request) +); + +#endif + +/* This part must be outside protection */ +#include -- cgit v1.2.3 From 2abd2de712cd891321a06b0890a85aef1e506cb5 Mon Sep 17 00:00:00 2001 From: Andrew Hall Date: Fri, 26 Apr 2019 11:49:49 -0700 Subject: nbd: add tracepoints for send/receive timing This adds four tracepoints to nbd, enabling separate tracing of payload and header sending/receipt. In the send path for headers that have already been sent, we also explicitly initialize the handle so it can be referenced by the later tracepoint. Signed-off-by: Andrew Hall Signed-off-by: Matt Mullins Reviewed-by: Josef Bacik Signed-off-by: Alexei Starovoitov --- include/trace/events/nbd.h | 51 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) (limited to 'include') diff --git a/include/trace/events/nbd.h b/include/trace/events/nbd.h index 5928255ed02e..9849956f34d8 100644 --- a/include/trace/events/nbd.h +++ b/include/trace/events/nbd.h @@ -7,6 +7,57 @@ #include +DECLARE_EVENT_CLASS(nbd_transport_event, + + TP_PROTO(struct request *req, u64 handle), + + TP_ARGS(req, handle), + + TP_STRUCT__entry( + __field(struct request *, req) + __field(u64, handle) + ), + + TP_fast_assign( + __entry->req = req; + __entry->handle = handle; + ), + + TP_printk( + "nbd transport event: request %p, handle 0x%016llx", + __entry->req, + __entry->handle + ) +); + +DEFINE_EVENT(nbd_transport_event, nbd_header_sent, + + TP_PROTO(struct request *req, u64 handle), + + TP_ARGS(req, handle) +); + +DEFINE_EVENT(nbd_transport_event, nbd_payload_sent, + + TP_PROTO(struct request *req, u64 handle), + + TP_ARGS(req, handle) +); + +DEFINE_EVENT(nbd_transport_event, nbd_header_received, + + TP_PROTO(struct request *req, u64 handle), + + TP_ARGS(req, handle) +); + +DEFINE_EVENT(nbd_transport_event, nbd_payload_received, + + TP_PROTO(struct request *req, u64 handle), + + TP_ARGS(req, handle) +); + DECLARE_EVENT_CLASS(nbd_send_request, TP_PROTO(struct nbd_request *nbd_request, int index, -- cgit v1.2.3 From e950e843367d7990b9d7ea964e3c33876d477c4b Mon Sep 17 00:00:00 2001 From: Matt Mullins Date: Fri, 26 Apr 2019 11:49:51 -0700 Subject: selftests: bpf: test writable buffers in raw tps This tests that: * a BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE cannot be attached if it uses either: * a variable offset to the tracepoint buffer, or * an offset beyond the size of the tracepoint buffer * a tracer can modify the buffer provided when attached to a writable tracepoint in bpf_prog_test_run Signed-off-by: Matt Mullins Acked-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- include/trace/events/bpf_test_run.h | 50 +++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 include/trace/events/bpf_test_run.h (limited to 'include') diff --git a/include/trace/events/bpf_test_run.h b/include/trace/events/bpf_test_run.h new file mode 100644 index 000000000000..265447e3f71a --- /dev/null +++ b/include/trace/events/bpf_test_run.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM bpf_test_run + +#if !defined(_TRACE_BPF_TEST_RUN_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_BPF_TEST_RUN_H + +#include + +DECLARE_EVENT_CLASS(bpf_test_finish, + + TP_PROTO(int *err), + + TP_ARGS(err), + + TP_STRUCT__entry( + __field(int, err) + ), + + TP_fast_assign( + __entry->err = *err; + ), + + TP_printk("bpf_test_finish with err=%d", __entry->err) +); + +#ifdef DEFINE_EVENT_WRITABLE +#undef BPF_TEST_RUN_DEFINE_EVENT +#define BPF_TEST_RUN_DEFINE_EVENT(template, call, proto, args, size) \ + DEFINE_EVENT_WRITABLE(template, call, PARAMS(proto), \ + PARAMS(args), size) +#else +#undef BPF_TEST_RUN_DEFINE_EVENT +#define BPF_TEST_RUN_DEFINE_EVENT(template, call, proto, args, size) \ + DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args)) +#endif + +BPF_TEST_RUN_DEFINE_EVENT(bpf_test_finish, bpf_test_finish, + + TP_PROTO(int *err), + + TP_ARGS(err), + + sizeof(int) +); + +#endif + +/* This part must be outside protection */ +#include -- cgit v1.2.3 From 6ac99e8f23d4b10258406ca0dd7bffca5f31da9d Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 26 Apr 2019 16:39:39 -0700 Subject: bpf: Introduce bpf sk local storage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After allowing a bpf prog to - directly read the skb->sk ptr - get the fullsock bpf_sock by "bpf_sk_fullsock()" - get the bpf_tcp_sock by "bpf_tcp_sock()" - get the listener sock by "bpf_get_listener_sock()" - avoid duplicating the fields of "(bpf_)sock" and "(bpf_)tcp_sock" into different bpf running context. this patch is another effort to make bpf's network programming more intuitive to do (together with memory and performance benefit). When bpf prog needs to store data for a sk, the current practice is to define a map with the usual 4-tuples (src/dst ip/port) as the key. If multiple bpf progs require to store different sk data, multiple maps have to be defined. Hence, wasting memory to store the duplicated keys (i.e. 4 tuples here) in each of the bpf map. [ The smallest key could be the sk pointer itself which requires some enhancement in the verifier and it is a separate topic. ] Also, the bpf prog needs to clean up the elem when sk is freed. Otherwise, the bpf map will become full and un-usable quickly. The sk-free tracking currently could be done during sk state transition (e.g. BPF_SOCK_OPS_STATE_CB). The size of the map needs to be predefined which then usually ended-up with an over-provisioned map in production. Even the map was re-sizable, while the sk naturally come and go away already, this potential re-size operation is arguably redundant if the data can be directly connected to the sk itself instead of proxy-ing through a bpf map. This patch introduces sk->sk_bpf_storage to provide local storage space at sk for bpf prog to use. The space will be allocated when the first bpf prog has created data for this particular sk. The design optimizes the bpf prog's lookup (and then optionally followed by an inline update). bpf_spin_lock should be used if the inline update needs to be protected. BPF_MAP_TYPE_SK_STORAGE: ----------------------- To define a bpf "sk-local-storage", a BPF_MAP_TYPE_SK_STORAGE map (new in this patch) needs to be created. Multiple BPF_MAP_TYPE_SK_STORAGE maps can be created to fit different bpf progs' needs. The map enforces BTF to allow printing the sk-local-storage during a system-wise sk dump (e.g. "ss -ta") in the future. The purpose of a BPF_MAP_TYPE_SK_STORAGE map is not for lookup/update/delete a "sk-local-storage" data from a particular sk. Think of the map as a meta-data (or "type") of a "sk-local-storage". This particular "type" of "sk-local-storage" data can then be stored in any sk. The main purposes of this map are mostly: 1. Define the size of a "sk-local-storage" type. 2. Provide a similar syscall userspace API as the map (e.g. lookup/update, map-id, map-btf...etc.) 3. Keep track of all sk's storages of this "type" and clean them up when the map is freed. sk->sk_bpf_storage: ------------------ The main lookup/update/delete is done on sk->sk_bpf_storage (which is a "struct bpf_sk_storage"). When doing a lookup, the "map" pointer is now used as the "key" to search on the sk_storage->list. The "map" pointer is actually serving as the "type" of the "sk-local-storage" that is being requested. To allow very fast lookup, it should be as fast as looking up an array at a stable-offset. At the same time, it is not ideal to set a hard limit on the number of sk-local-storage "type" that the system can have. Hence, this patch takes a cache approach. The last search result from sk_storage->list is cached in sk_storage->cache[] which is a stable sized array. Each "sk-local-storage" type has a stable offset to the cache[] array. In the future, a map's flag could be introduced to do cache opt-out/enforcement if it became necessary. The cache size is 16 (i.e. 16 types of "sk-local-storage"). Programs can share map. On the program side, having a few bpf_progs running in the networking hotpath is already a lot. The bpf_prog should have already consolidated the existing sock-key-ed map usage to minimize the map lookup penalty. 16 has enough runway to grow. All sk-local-storage data will be removed from sk->sk_bpf_storage during sk destruction. bpf_sk_storage_get() and bpf_sk_storage_delete(): ------------------------------------------------ Instead of using bpf_map_(lookup|update|delete)_elem(), the bpf prog needs to use the new helper bpf_sk_storage_get() and bpf_sk_storage_delete(). The verifier can then enforce the ARG_PTR_TO_SOCKET argument. The bpf_sk_storage_get() also allows to "create" new elem if one does not exist in the sk. It is done by the new BPF_SK_STORAGE_GET_F_CREATE flag. An optional value can also be provided as the initial value during BPF_SK_STORAGE_GET_F_CREATE. The BPF_MAP_TYPE_SK_STORAGE also supports bpf_spin_lock. Together, it has eliminated the potential use cases for an equivalent bpf_map_update_elem() API (for bpf_prog) in this patch. Misc notes: ---------- 1. map_get_next_key is not supported. From the userspace syscall perspective, the map has the socket fd as the key while the map can be shared by pinned-file or map-id. Since btf is enforced, the existing "ss" could be enhanced to pretty print the local-storage. Supporting a kernel defined btf with 4 tuples as the return key could be explored later also. 2. The sk->sk_lock cannot be acquired. Atomic operations is used instead. e.g. cmpxchg is done on the sk->sk_bpf_storage ptr. Please refer to the source code comments for the details in synchronization cases and considerations. 3. The mem is charged to the sk->sk_omem_alloc as the sk filter does. Benchmark: --------- Here is the benchmark data collected by turning on the "kernel.bpf_stats_enabled" sysctl. Two bpf progs are tested: One bpf prog with the usual bpf hashmap (max_entries = 8192) with the sk ptr as the key. (verifier is modified to support sk ptr as the key That should have shortened the key lookup time.) Another bpf prog is with the new BPF_MAP_TYPE_SK_STORAGE. Both are storing a "u32 cnt", do a lookup on "egress_skb/cgroup" for each egress skb and then bump the cnt. netperf is used to drive data with 4096 connected UDP sockets. BPF_MAP_TYPE_HASH with a modifier verifier (152ns per bpf run) 27: cgroup_skb name egress_sk_map tag 74f56e832918070b run_time_ns 58280107540 run_cnt 381347633 loaded_at 2019-04-15T13:46:39-0700 uid 0 xlated 344B jited 258B memlock 4096B map_ids 16 btf_id 5 BPF_MAP_TYPE_SK_STORAGE in this patch (66ns per bpf run) 30: cgroup_skb name egress_sk_stora tag d4aa70984cc7bbf6 run_time_ns 25617093319 run_cnt 390989739 loaded_at 2019-04-15T13:47:54-0700 uid 0 xlated 168B jited 156B memlock 4096B map_ids 17 btf_id 6 Here is a high-level picture on how are the objects organized: sk ┌──────┐ │ │ │ │ │ │ │*sk_bpf_storage─────▶ bpf_sk_storage └──────┘ ┌───────┐ ┌───────────┤ list │ │ │ │ │ │ │ │ │ │ │ └───────┘ │ │ elem │ ┌────────┐ ├─▶│ snode │ │ ├────────┤ │ │ data │ bpf_map │ ├────────┤ ┌─────────┐ │ │map_node│◀─┬─────┤ list │ │ └────────┘ │ │ │ │ │ │ │ │ elem │ │ │ │ ┌────────┐ │ └─────────┘ └─▶│ snode │ │ ├────────┤ │ bpf_map │ data │ │ ┌─────────┐ ├────────┤ │ │ list ├───────▶│map_node│ │ │ │ └────────┘ │ │ │ │ │ │ elem │ └─────────┘ ┌────────┐ │ ┌─▶│ snode │ │ │ ├────────┤ │ │ │ data │ │ │ ├────────┤ │ │ │map_node│◀─┘ │ └────────┘ │ │ │ ┌───────┐ sk └──────────│ list │ ┌──────┐ │ │ │ │ │ │ │ │ │ │ │ │ └───────┘ │*sk_bpf_storage───────▶bpf_sk_storage └──────┘ Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ include/linux/bpf_types.h | 1 + include/net/bpf_sk_storage.h | 13 +++++++++++++ include/net/sock.h | 5 +++++ include/uapi/linux/bpf.h | 44 +++++++++++++++++++++++++++++++++++++++++++- 5 files changed, 64 insertions(+), 1 deletion(-) create mode 100644 include/net/bpf_sk_storage.h (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cd6341eabd74..9a21848fdb07 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -184,6 +184,7 @@ enum bpf_arg_type { ARG_PTR_TO_MAP_KEY, /* pointer to stack used as map key */ ARG_PTR_TO_MAP_VALUE, /* pointer to stack used as map value */ ARG_PTR_TO_UNINIT_MAP_VALUE, /* pointer to valid memory used to store a map value */ + ARG_PTR_TO_MAP_VALUE_OR_NULL, /* pointer to stack used as map value or NULL */ /* the following constraints used to prototype bpf_memcmp() and other * functions that access data on eBPF program stack @@ -204,6 +205,7 @@ enum bpf_arg_type { ARG_PTR_TO_SOCK_COMMON, /* pointer to sock_common */ ARG_PTR_TO_INT, /* pointer to int */ ARG_PTR_TO_LONG, /* pointer to long */ + ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */ }; /* type of values returned from helper functions */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index a10d37bce364..5a9975678d6f 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -61,6 +61,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops) #ifdef CONFIG_NET BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops) #if defined(CONFIG_BPF_STREAM_PARSER) BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops) diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h new file mode 100644 index 000000000000..b9dcb02e756b --- /dev/null +++ b/include/net/bpf_sk_storage.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2019 Facebook */ +#ifndef _BPF_SK_STORAGE_H +#define _BPF_SK_STORAGE_H + +struct sock; + +void bpf_sk_storage_free(struct sock *sk); + +extern const struct bpf_func_proto bpf_sk_storage_get_proto; +extern const struct bpf_func_proto bpf_sk_storage_delete_proto; + +#endif /* _BPF_SK_STORAGE_H */ diff --git a/include/net/sock.h b/include/net/sock.h index 784cd19d5ff7..4d208c0f9c14 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -236,6 +236,8 @@ struct sock_common { /* public: */ }; +struct bpf_sk_storage; + /** * struct sock - network layer representation of sockets * @__sk_common: shared layout with inet_timewait_sock @@ -510,6 +512,9 @@ struct sock { #endif void (*sk_destruct)(struct sock *sk); struct sock_reuseport __rcu *sk_reuseport_cb; +#ifdef CONFIG_BPF_SYSCALL + struct bpf_sk_storage __rcu *sk_bpf_storage; +#endif struct rcu_head sk_rcu; }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f7fa7a34a62d..72336bac7573 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -133,6 +133,7 @@ enum bpf_map_type { BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, BPF_MAP_TYPE_QUEUE, BPF_MAP_TYPE_STACK, + BPF_MAP_TYPE_SK_STORAGE, }; /* Note that tracing related programs such as @@ -2630,6 +2631,42 @@ union bpf_attr { * was provided. * * **-ERANGE** if resulting value was out of range. + * + * void *bpf_sk_storage_get(struct bpf_map *map, struct bpf_sock *sk, void *value, u64 flags) + * Description + * Get a bpf-local-storage from a sk. + * + * Logically, it could be thought of getting the value from + * a *map* with *sk* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem(map, &sk)** except this + * helper enforces the key must be a **bpf_fullsock()** + * and the map must be a BPF_MAP_TYPE_SK_STORAGE also. + * + * Underneath, the value is stored locally at *sk* instead of + * the map. The *map* is used as the bpf-local-storage **type**. + * The bpf-local-storage **type** (i.e. the *map*) is searched + * against all bpf-local-storages residing at sk. + * + * An optional *flags* (BPF_SK_STORAGE_GET_F_CREATE) can be + * used such that a new bpf-local-storage will be + * created if one does not exist. *value* can be used + * together with BPF_SK_STORAGE_GET_F_CREATE to specify + * the initial value of a bpf-local-storage. If *value* is + * NULL, the new bpf-local-storage will be zero initialized. + * Return + * A bpf-local-storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf-local-storage. + * + * int bpf_sk_storage_delete(struct bpf_map *map, struct bpf_sock *sk) + * Description + * Delete a bpf-local-storage from a sk. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf-local-storage cannot be found. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2738,7 +2775,9 @@ union bpf_attr { FN(sysctl_get_new_value), \ FN(sysctl_set_new_value), \ FN(strtol), \ - FN(strtoul), + FN(strtoul), \ + FN(sk_storage_get), \ + FN(sk_storage_delete), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -2814,6 +2853,9 @@ enum bpf_func_id { /* BPF_FUNC_sysctl_get_name flags. */ #define BPF_F_SYSCTL_BASE_NAME (1ULL << 0) +/* BPF_FUNC_sk_storage_get flags */ +#define BPF_SK_STORAGE_GET_F_CREATE (1ULL << 0) + /* Mode for BPF_FUNC_skb_adjust_room helper. */ enum bpf_adj_room_mode { BPF_ADJ_ROOM_NET, -- cgit v1.2.3 From 9e9957973c7785b1f8fa77f099cac661cc5e7e5b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 25 Apr 2019 12:32:02 -0700 Subject: net/tls: remove old exports of sk_destruct functions tls_device_sk_destruct being set on a socket used to indicate that socket is a kTLS device one. That is no longer true - now we use sk_validate_xmit_skb pointer for that purpose. Remove the export. tls_device_attach() needs to be moved. While at it, remove the dead declaration of tls_sk_destruct(). Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/net/tls.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/net/tls.h b/include/net/tls.h index d9d0ac66f040..20196cb31ecc 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -317,7 +317,6 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx); int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); int tls_device_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); -void tls_device_sk_destruct(struct sock *sk); void tls_device_free_resources_tx(struct sock *sk); void tls_device_init(void); void tls_device_cleanup(void); @@ -336,7 +335,6 @@ static inline u32 tls_record_start_seq(struct tls_record_info *rec) return rec->end_seq - rec->len; } -void tls_sk_destruct(struct sock *sk, struct tls_context *ctx); int tls_push_sg(struct sock *sk, struct tls_context *ctx, struct scatterlist *sg, u16 first_offset, int flags); -- cgit v1.2.3 From da68b4ad02343862fee1e3e8c6315984f16a4778 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 25 Apr 2019 12:32:03 -0700 Subject: net/tls: move definition of tls ops into net/tls.h There seems to be no reason for tls_ops to be defined in netdevice.h which is included in a lot of places. Don't wrap the struct/enum declaration in ifdefs, it trickles down unnecessary ifdefs into driver code. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/linux/netdevice.h | 23 +---------------------- include/net/tls.h | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c46d218a0456..44b47e9df94a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -914,34 +914,13 @@ struct xfrmdev_ops { }; #endif -#if IS_ENABLED(CONFIG_TLS_DEVICE) -enum tls_offload_ctx_dir { - TLS_OFFLOAD_CTX_DIR_RX, - TLS_OFFLOAD_CTX_DIR_TX, -}; - -struct tls_crypto_info; -struct tls_context; - -struct tlsdev_ops { - int (*tls_dev_add)(struct net_device *netdev, struct sock *sk, - enum tls_offload_ctx_dir direction, - struct tls_crypto_info *crypto_info, - u32 start_offload_tcp_sn); - void (*tls_dev_del)(struct net_device *netdev, - struct tls_context *ctx, - enum tls_offload_ctx_dir direction); - void (*tls_dev_resync_rx)(struct net_device *netdev, - struct sock *sk, u32 seq, u64 rcd_sn); -}; -#endif - struct dev_ifalias { struct rcu_head rcuhead; char ifalias[]; }; struct devlink; +struct tlsdev_ops; /* * This structure defines the management hooks for network devices. diff --git a/include/net/tls.h b/include/net/tls.h index 20196cb31ecc..41a2ee643fc5 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -277,6 +277,23 @@ struct tls_context { void (*unhash)(struct sock *sk); }; +enum tls_offload_ctx_dir { + TLS_OFFLOAD_CTX_DIR_RX, + TLS_OFFLOAD_CTX_DIR_TX, +}; + +struct tlsdev_ops { + int (*tls_dev_add)(struct net_device *netdev, struct sock *sk, + enum tls_offload_ctx_dir direction, + struct tls_crypto_info *crypto_info, + u32 start_offload_tcp_sn); + void (*tls_dev_del)(struct net_device *netdev, + struct tls_context *ctx, + enum tls_offload_ctx_dir direction); + void (*tls_dev_resync_rx)(struct net_device *netdev, + struct sock *sk, u32 seq, u64 rcd_sn); +}; + struct tls_offload_context_rx { /* sw must be the first member of tls_offload_context_rx */ struct tls_sw_context_rx sw; -- cgit v1.2.3 From 63a1c95f3fe48b4e9fe0c261b376e5e527b71b25 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 25 Apr 2019 12:32:04 -0700 Subject: net/tls: byte swap device req TCP seq no upon setting To avoid a sparse warning byteswap the be32 sequence number before it's stored in the atomic value. While at it drop unnecessary brackets and use kernel's u64 type. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/net/tls.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/tls.h b/include/net/tls.h index 41a2ee643fc5..39ea62f0c1f6 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -562,7 +562,7 @@ static inline void tls_offload_rx_resync_request(struct sock *sk, __be32 seq) struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_offload_context_rx *rx_ctx = tls_offload_ctx_rx(tls_ctx); - atomic64_set(&rx_ctx->resync_req, ((((uint64_t)seq) << 32) | 1)); + atomic64_set(&rx_ctx->resync_req, ((u64)ntohl(seq) << 32) | 1); } -- cgit v1.2.3 From ae0be8de9a53cda3505865c11826d8ff0640237c Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Fri, 26 Apr 2019 11:13:06 +0200 Subject: netlink: make nla_nest_start() add NLA_F_NESTED flag Even if the NLA_F_NESTED flag was introduced more than 11 years ago, most netlink based interfaces (including recently added ones) are still not setting it in kernel generated messages. Without the flag, message parsers not aware of attribute semantics (e.g. wireshark dissector or libmnl's mnl_nlmsg_fprintf()) cannot recognize nested attributes and won't display the structure of their contents. Unfortunately we cannot just add the flag everywhere as there may be userspace applications which check nlattr::nla_type directly rather than through a helper masking out the flags. Therefore the patch renames nla_nest_start() to nla_nest_start_noflag() and introduces nla_nest_start() as a wrapper adding NLA_F_NESTED. The calls which add NLA_F_NESTED manually are rewritten to use nla_nest_start(). Except for changes in include/net/netlink.h, the patch was generated using this semantic patch: @@ expression E1, E2; @@ -nla_nest_start(E1, E2) +nla_nest_start_noflag(E1, E2) @@ expression E1, E2; @@ -nla_nest_start_noflag(E1, E2 | NLA_F_NESTED) +nla_nest_start(E1, E2) Signed-off-by: Michal Kubecek Acked-by: Jiri Pirko Acked-by: David Ahern Signed-off-by: David S. Miller --- include/linux/netfilter/ipset/ip_set.h | 2 +- include/net/netlink.h | 26 +++++++++++++++++++++++--- 2 files changed, 24 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index f2e1e6b13ca4..965dc6c6653e 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -401,7 +401,7 @@ ip_set_get_h16(const struct nlattr *attr) return ntohs(nla_get_be16(attr)); } -#define ipset_nest_start(skb, attr) nla_nest_start(skb, attr | NLA_F_NESTED) +#define ipset_nest_start(skb, attr) nla_nest_start(skb, attr) #define ipset_nest_end(skb, start) nla_nest_end(skb, start) static inline int nla_put_ipaddr4(struct sk_buff *skb, int type, __be32 ipaddr) diff --git a/include/net/netlink.h b/include/net/netlink.h index 23f27b0b3cef..1f18b47f41b4 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -1415,13 +1415,18 @@ static inline void *nla_memdup(const struct nlattr *src, gfp_t gfp) } /** - * nla_nest_start - Start a new level of nested attributes + * nla_nest_start_noflag - Start a new level of nested attributes * @skb: socket buffer to add attributes to * @attrtype: attribute type of container * - * Returns the container attribute + * This function exists for backward compatibility to use in APIs which never + * marked their nest attributes with NLA_F_NESTED flag. New APIs should use + * nla_nest_start() which sets the flag. + * + * Returns the container attribute or NULL on error */ -static inline struct nlattr *nla_nest_start(struct sk_buff *skb, int attrtype) +static inline struct nlattr *nla_nest_start_noflag(struct sk_buff *skb, + int attrtype) { struct nlattr *start = (struct nlattr *)skb_tail_pointer(skb); @@ -1431,6 +1436,21 @@ static inline struct nlattr *nla_nest_start(struct sk_buff *skb, int attrtype) return start; } +/** + * nla_nest_start - Start a new level of nested attributes, with NLA_F_NESTED + * @skb: socket buffer to add attributes to + * @attrtype: attribute type of container + * + * Unlike nla_nest_start_noflag(), mark the nest attribute with NLA_F_NESTED + * flag. This is the preferred function to use in new code. + * + * Returns the container attribute or NULL on error + */ +static inline struct nlattr *nla_nest_start(struct sk_buff *skb, int attrtype) +{ + return nla_nest_start_noflag(skb, attrtype | NLA_F_NESTED); +} + /** * nla_nest_end - Finalize nesting of attributes * @skb: socket buffer the attributes are stored in -- cgit v1.2.3 From 12ad5f65f030ae7b8a2425f6f79137c4217e30d4 Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Fri, 26 Apr 2019 11:13:09 +0200 Subject: ipset: drop ipset_nest_start() and ipset_nest_end() After the previous commit, both ipset_nest_start() and ipset_nest_end() are just aliases for nla_nest_start() and nla_nest_end() so that there is no need to keep them. Signed-off-by: Michal Kubecek Acked-by: Jozsef Kadlecsik Signed-off-by: David S. Miller --- include/linux/netfilter/ipset/ip_set.h | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index 965dc6c6653e..e499d170f12d 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -401,33 +401,30 @@ ip_set_get_h16(const struct nlattr *attr) return ntohs(nla_get_be16(attr)); } -#define ipset_nest_start(skb, attr) nla_nest_start(skb, attr) -#define ipset_nest_end(skb, start) nla_nest_end(skb, start) - static inline int nla_put_ipaddr4(struct sk_buff *skb, int type, __be32 ipaddr) { - struct nlattr *__nested = ipset_nest_start(skb, type); + struct nlattr *__nested = nla_nest_start(skb, type); int ret; if (!__nested) return -EMSGSIZE; ret = nla_put_in_addr(skb, IPSET_ATTR_IPADDR_IPV4, ipaddr); if (!ret) - ipset_nest_end(skb, __nested); + nla_nest_end(skb, __nested); return ret; } static inline int nla_put_ipaddr6(struct sk_buff *skb, int type, const struct in6_addr *ipaddrptr) { - struct nlattr *__nested = ipset_nest_start(skb, type); + struct nlattr *__nested = nla_nest_start(skb, type); int ret; if (!__nested) return -EMSGSIZE; ret = nla_put_in6_addr(skb, IPSET_ATTR_IPADDR_IPV6, ipaddrptr); if (!ret) - ipset_nest_end(skb, __nested); + nla_nest_end(skb, __nested); return ret; } -- cgit v1.2.3 From 6f455f5f4e9c28aefaefbe18ce7304b499645d75 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 26 Apr 2019 14:07:27 +0200 Subject: netlink: add NLA_MIN_LEN Rather than using NLA_UNSPEC for this type of thing, use NLA_MIN_LEN so we can make NLA_UNSPEC be NLA_REJECT under certain conditions for future attributes. While at it, also use NLA_EXACT_LEN for the struct example. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- include/net/netlink.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netlink.h b/include/net/netlink.h index 1f18b47f41b4..c77ed51c18f1 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -183,6 +183,7 @@ enum { NLA_REJECT, NLA_EXACT_LEN, NLA_EXACT_LEN_WARN, + NLA_MIN_LEN, __NLA_TYPE_MAX, }; @@ -212,6 +213,7 @@ enum nla_policy_validation { * NLA_NUL_STRING Maximum length of string (excluding NUL) * NLA_FLAG Unused * NLA_BINARY Maximum length of attribute payload + * NLA_MIN_LEN Minimum length of attribute payload * NLA_NESTED, * NLA_NESTED_ARRAY Length verification is done by checking len of * nested header (or empty); len field is used if @@ -230,6 +232,7 @@ enum nla_policy_validation { * it is rejected. * NLA_EXACT_LEN_WARN Attribute should have exactly this length, a warning * is logged if it is longer, shorter is rejected. + * NLA_MIN_LEN Minimum length of attribute payload * All other Minimum length of attribute payload * * Meaning of `validation_data' field: @@ -281,7 +284,7 @@ enum nla_policy_validation { * static const struct nla_policy my_policy[ATTR_MAX+1] = { * [ATTR_FOO] = { .type = NLA_U16 }, * [ATTR_BAR] = { .type = NLA_STRING, .len = BARSIZ }, - * [ATTR_BAZ] = { .len = sizeof(struct mystruct) }, + * [ATTR_BAZ] = { .type = NLA_EXACT_LEN, .len = sizeof(struct mystruct) }, * [ATTR_GOO] = { .type = NLA_BITFIELD32, .validation_data = &myvalidflags }, * }; */ @@ -302,6 +305,7 @@ struct nla_policy { #define NLA_POLICY_EXACT_LEN(_len) { .type = NLA_EXACT_LEN, .len = _len } #define NLA_POLICY_EXACT_LEN_WARN(_len) { .type = NLA_EXACT_LEN_WARN, \ .len = _len } +#define NLA_POLICY_MIN_LEN(_len) { .type = NLA_MIN_LEN, .len = _len } #define NLA_POLICY_ETH_ADDR NLA_POLICY_EXACT_LEN(ETH_ALEN) #define NLA_POLICY_ETH_ADDR_COMPAT NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN) -- cgit v1.2.3 From 8cb081746c031fb164089322e2336a0bf5b3070c Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 26 Apr 2019 14:07:28 +0200 Subject: netlink: make validation more configurable for future strictness We currently have two levels of strict validation: 1) liberal (default) - undefined (type >= max) & NLA_UNSPEC attributes accepted - attribute length >= expected accepted - garbage at end of message accepted 2) strict (opt-in) - NLA_UNSPEC attributes accepted - attribute length >= expected accepted Split out parsing strictness into four different options: * TRAILING - check that there's no trailing data after parsing attributes (in message or nested) * MAXTYPE - reject attrs > max known type * UNSPEC - reject attributes with NLA_UNSPEC policy entries * STRICT_ATTRS - strictly validate attribute size The default for future things should be *everything*. The current *_strict() is a combination of TRAILING and MAXTYPE, and is renamed to _deprecated_strict(). The current regular parsing has none of this, and is renamed to *_parse_deprecated(). Additionally it allows us to selectively set one of the new flags even on old policies. Notably, the UNSPEC flag could be useful in this case, since it can be arranged (by filling in the policy) to not be an incompatible userspace ABI change, but would then going forward prevent forgetting attribute entries. Similar can apply to the POLICY flag. We end up with the following renames: * nla_parse -> nla_parse_deprecated * nla_parse_strict -> nla_parse_deprecated_strict * nlmsg_parse -> nlmsg_parse_deprecated * nlmsg_parse_strict -> nlmsg_parse_deprecated_strict * nla_parse_nested -> nla_parse_nested_deprecated * nla_validate_nested -> nla_validate_nested_deprecated Using spatch, of course: @@ expression TB, MAX, HEAD, LEN, POL, EXT; @@ -nla_parse(TB, MAX, HEAD, LEN, POL, EXT) +nla_parse_deprecated(TB, MAX, HEAD, LEN, POL, EXT) @@ expression NLH, HDRLEN, TB, MAX, POL, EXT; @@ -nlmsg_parse(NLH, HDRLEN, TB, MAX, POL, EXT) +nlmsg_parse_deprecated(NLH, HDRLEN, TB, MAX, POL, EXT) @@ expression NLH, HDRLEN, TB, MAX, POL, EXT; @@ -nlmsg_parse_strict(NLH, HDRLEN, TB, MAX, POL, EXT) +nlmsg_parse_deprecated_strict(NLH, HDRLEN, TB, MAX, POL, EXT) @@ expression TB, MAX, NLA, POL, EXT; @@ -nla_parse_nested(TB, MAX, NLA, POL, EXT) +nla_parse_nested_deprecated(TB, MAX, NLA, POL, EXT) @@ expression START, MAX, POL, EXT; @@ -nla_validate_nested(START, MAX, POL, EXT) +nla_validate_nested_deprecated(START, MAX, POL, EXT) @@ expression NLH, HDRLEN, MAX, POL, EXT; @@ -nlmsg_validate(NLH, HDRLEN, MAX, POL, EXT) +nlmsg_validate_deprecated(NLH, HDRLEN, MAX, POL, EXT) For this patch, don't actually add the strict, non-renamed versions yet so that it breaks compile if I get it wrong. Also, while at it, make nla_validate and nla_parse go down to a common __nla_validate_parse() function to avoid code duplication. Ultimately, this allows us to have very strict validation for every new caller of nla_parse()/nlmsg_parse() etc as re-introduced in the next patch, while existing things will continue to work as is. In effect then, this adds fully strict validation for any new command. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- include/net/genetlink.h | 16 ++-- include/net/netlink.h | 238 ++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 199 insertions(+), 55 deletions(-) (limited to 'include') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 6850c7b1a3a6..897cdba13569 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -165,7 +165,7 @@ static inline struct nlmsghdr *genlmsg_nlhdr(void *user_hdr) } /** - * genlmsg_parse - parse attributes of a genetlink message + * genlmsg_parse_deprecated - parse attributes of a genetlink message * @nlh: netlink message header * @family: genetlink message family * @tb: destination array with maxtype+1 elements @@ -173,14 +173,14 @@ static inline struct nlmsghdr *genlmsg_nlhdr(void *user_hdr) * @policy: validation policy * @extack: extended ACK report struct */ -static inline int genlmsg_parse(const struct nlmsghdr *nlh, - const struct genl_family *family, - struct nlattr *tb[], int maxtype, - const struct nla_policy *policy, - struct netlink_ext_ack *extack) +static inline int genlmsg_parse_deprecated(const struct nlmsghdr *nlh, + const struct genl_family *family, + struct nlattr *tb[], int maxtype, + const struct nla_policy *policy, + struct netlink_ext_ack *extack) { - return nlmsg_parse(nlh, family->hdrsize + GENL_HDRLEN, tb, maxtype, - policy, extack); + return __nlmsg_parse(nlh, family->hdrsize + GENL_HDRLEN, tb, maxtype, + policy, NL_VALIDATE_LIBERAL, extack); } /** diff --git a/include/net/netlink.h b/include/net/netlink.h index c77ed51c18f1..ab26a5e3558b 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -369,21 +369,48 @@ struct nl_info { bool skip_notify; }; +/** + * enum netlink_validation - netlink message/attribute validation levels + * @NL_VALIDATE_LIBERAL: Old-style "be liberal" validation, not caring about + * extra data at the end of the message, attributes being longer than + * they should be, or unknown attributes being present. + * @NL_VALIDATE_TRAILING: Reject junk data encountered after attribute parsing. + * @NL_VALIDATE_MAXTYPE: Reject attributes > max type; Together with _TRAILING + * this is equivalent to the old nla_parse_strict()/nlmsg_parse_strict(). + * @NL_VALIDATE_UNSPEC: Reject attributes with NLA_UNSPEC in the policy. + * This can safely be set by the kernel when the given policy has no + * NLA_UNSPEC anymore, and can thus be used to ensure policy entries + * are enforced going forward. + * @NL_VALIDATE_STRICT_ATTRS: strict attribute policy parsing (e.g. + * U8, U16, U32 must have exact size, etc.) + */ +enum netlink_validation { + NL_VALIDATE_LIBERAL = 0, + NL_VALIDATE_TRAILING = BIT(0), + NL_VALIDATE_MAXTYPE = BIT(1), + NL_VALIDATE_UNSPEC = BIT(2), + NL_VALIDATE_STRICT_ATTRS = BIT(3), +}; + +#define NL_VALIDATE_DEPRECATED_STRICT (NL_VALIDATE_TRAILING |\ + NL_VALIDATE_MAXTYPE) +#define NL_VALIDATE_STRICT (NL_VALIDATE_TRAILING |\ + NL_VALIDATE_MAXTYPE |\ + NL_VALIDATE_UNSPEC |\ + NL_VALIDATE_STRICT_ATTRS) + int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, struct nlmsghdr *, struct netlink_ext_ack *)); int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 portid, unsigned int group, int report, gfp_t flags); -int nla_validate(const struct nlattr *head, int len, int maxtype, - const struct nla_policy *policy, - struct netlink_ext_ack *extack); -int nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head, - int len, const struct nla_policy *policy, - struct netlink_ext_ack *extack); -int nla_parse_strict(struct nlattr **tb, int maxtype, const struct nlattr *head, - int len, const struct nla_policy *policy, - struct netlink_ext_ack *extack); +int __nla_validate(const struct nlattr *head, int len, int maxtype, + const struct nla_policy *policy, unsigned int validate, + struct netlink_ext_ack *extack); +int __nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head, + int len, const struct nla_policy *policy, unsigned int validate, + struct netlink_ext_ack *extack); int nla_policy_len(const struct nla_policy *, int); struct nlattr *nla_find(const struct nlattr *head, int len, int attrtype); size_t nla_strlcpy(char *dst, const struct nlattr *nla, size_t dstsize); @@ -512,42 +539,121 @@ nlmsg_next(const struct nlmsghdr *nlh, int *remaining) } /** - * nlmsg_parse - parse attributes of a netlink message + * nla_parse_deprecated - Parse a stream of attributes into a tb buffer + * @tb: destination array with maxtype+1 elements + * @maxtype: maximum attribute type to be expected + * @head: head of attribute stream + * @len: length of attribute stream + * @policy: validation policy + * @extack: extended ACK pointer + * + * Parses a stream of attributes and stores a pointer to each attribute in + * the tb array accessible via the attribute type. Attributes with a type + * exceeding maxtype will be ignored and attributes from the policy are not + * always strictly validated (only for new attributes). + * + * Returns 0 on success or a negative error code. + */ +static inline int nla_parse_deprecated(struct nlattr **tb, int maxtype, + const struct nlattr *head, int len, + const struct nla_policy *policy, + struct netlink_ext_ack *extack) +{ + return __nla_parse(tb, maxtype, head, len, policy, + NL_VALIDATE_LIBERAL, extack); +} + +/** + * nla_parse_deprecated_strict - Parse a stream of attributes into a tb buffer + * @tb: destination array with maxtype+1 elements + * @maxtype: maximum attribute type to be expected + * @head: head of attribute stream + * @len: length of attribute stream + * @policy: validation policy + * @extack: extended ACK pointer + * + * Parses a stream of attributes and stores a pointer to each attribute in + * the tb array accessible via the attribute type. Attributes with a type + * exceeding maxtype will be rejected as well as trailing data, but the + * policy is not completely strictly validated (only for new attributes). + * + * Returns 0 on success or a negative error code. + */ +static inline int nla_parse_deprecated_strict(struct nlattr **tb, int maxtype, + const struct nlattr *head, + int len, + const struct nla_policy *policy, + struct netlink_ext_ack *extack) +{ + return __nla_parse(tb, maxtype, head, len, policy, + NL_VALIDATE_DEPRECATED_STRICT, extack); +} + +/** + * __nlmsg_parse - parse attributes of a netlink message * @nlh: netlink message header * @hdrlen: length of family specific header * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @policy: validation policy + * @validate: validation strictness * @extack: extended ACK report struct * * See nla_parse() */ -static inline int nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen, - struct nlattr *tb[], int maxtype, - const struct nla_policy *policy, - struct netlink_ext_ack *extack) +static inline int __nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen, + struct nlattr *tb[], int maxtype, + const struct nla_policy *policy, + unsigned int validate, + struct netlink_ext_ack *extack) { if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) { NL_SET_ERR_MSG(extack, "Invalid header length"); return -EINVAL; } - return nla_parse(tb, maxtype, nlmsg_attrdata(nlh, hdrlen), - nlmsg_attrlen(nlh, hdrlen), policy, extack); + return __nla_parse(tb, maxtype, nlmsg_attrdata(nlh, hdrlen), + nlmsg_attrlen(nlh, hdrlen), policy, validate, + extack); } -static inline int nlmsg_parse_strict(const struct nlmsghdr *nlh, int hdrlen, - struct nlattr *tb[], int maxtype, - const struct nla_policy *policy, - struct netlink_ext_ack *extack) +/** + * nlmsg_parse_deprecated - parse attributes of a netlink message + * @nlh: netlink message header + * @hdrlen: length of family specific header + * @tb: destination array with maxtype+1 elements + * @maxtype: maximum attribute type to be expected + * @extack: extended ACK report struct + * + * See nla_parse_deprecated() + */ +static inline int nlmsg_parse_deprecated(const struct nlmsghdr *nlh, int hdrlen, + struct nlattr *tb[], int maxtype, + const struct nla_policy *policy, + struct netlink_ext_ack *extack) { - if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) { - NL_SET_ERR_MSG(extack, "Invalid header length"); - return -EINVAL; - } + return __nlmsg_parse(nlh, hdrlen, tb, maxtype, policy, + NL_VALIDATE_LIBERAL, extack); +} - return nla_parse_strict(tb, maxtype, nlmsg_attrdata(nlh, hdrlen), - nlmsg_attrlen(nlh, hdrlen), policy, extack); +/** + * nlmsg_parse_deprecated_strict - parse attributes of a netlink message + * @nlh: netlink message header + * @hdrlen: length of family specific header + * @tb: destination array with maxtype+1 elements + * @maxtype: maximum attribute type to be expected + * @extack: extended ACK report struct + * + * See nla_parse_deprecated_strict() + */ +static inline int +nlmsg_parse_deprecated_strict(const struct nlmsghdr *nlh, int hdrlen, + struct nlattr *tb[], int maxtype, + const struct nla_policy *policy, + struct netlink_ext_ack *extack) +{ + return __nlmsg_parse(nlh, hdrlen, tb, maxtype, policy, + NL_VALIDATE_DEPRECATED_STRICT, extack); } /** @@ -566,26 +672,53 @@ static inline struct nlattr *nlmsg_find_attr(const struct nlmsghdr *nlh, } /** - * nlmsg_validate - validate a netlink message including attributes + * nla_validate_deprecated - Validate a stream of attributes + * @head: head of attribute stream + * @len: length of attribute stream + * @maxtype: maximum attribute type to be expected + * @policy: validation policy + * @validate: validation strictness + * @extack: extended ACK report struct + * + * Validates all attributes in the specified attribute stream against the + * specified policy. Validation is done in liberal mode. + * See documenation of struct nla_policy for more details. + * + * Returns 0 on success or a negative error code. + */ +static inline int nla_validate_deprecated(const struct nlattr *head, int len, + int maxtype, + const struct nla_policy *policy, + struct netlink_ext_ack *extack) +{ + return __nla_validate(head, len, maxtype, policy, NL_VALIDATE_LIBERAL, + extack); +} + + +/** + * nlmsg_validate_deprecated - validate a netlink message including attributes * @nlh: netlinket message header * @hdrlen: length of familiy specific header * @maxtype: maximum attribute type to be expected * @policy: validation policy * @extack: extended ACK report struct */ -static inline int nlmsg_validate(const struct nlmsghdr *nlh, - int hdrlen, int maxtype, - const struct nla_policy *policy, - struct netlink_ext_ack *extack) +static inline int nlmsg_validate_deprecated(const struct nlmsghdr *nlh, + int hdrlen, int maxtype, + const struct nla_policy *policy, + struct netlink_ext_ack *extack) { if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) return -EINVAL; - return nla_validate(nlmsg_attrdata(nlh, hdrlen), - nlmsg_attrlen(nlh, hdrlen), maxtype, policy, - extack); + return __nla_validate(nlmsg_attrdata(nlh, hdrlen), + nlmsg_attrlen(nlh, hdrlen), maxtype, + policy, NL_VALIDATE_LIBERAL, extack); } + + /** * nlmsg_report - need to report back to application? * @nlh: netlink message header @@ -899,22 +1032,22 @@ nla_find_nested(const struct nlattr *nla, int attrtype) } /** - * nla_parse_nested - parse nested attributes + * nla_parse_nested_deprecated - parse nested attributes * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @nla: attribute containing the nested attributes * @policy: validation policy * @extack: extended ACK report struct * - * See nla_parse() + * See nla_parse_deprecated() */ -static inline int nla_parse_nested(struct nlattr *tb[], int maxtype, - const struct nlattr *nla, - const struct nla_policy *policy, - struct netlink_ext_ack *extack) +static inline int nla_parse_nested_deprecated(struct nlattr *tb[], int maxtype, + const struct nlattr *nla, + const struct nla_policy *policy, + struct netlink_ext_ack *extack) { - return nla_parse(tb, maxtype, nla_data(nla), nla_len(nla), policy, - extack); + return __nla_parse(tb, maxtype, nla_data(nla), nla_len(nla), policy, + NL_VALIDATE_LIBERAL, extack); } /** @@ -1489,6 +1622,7 @@ static inline void nla_nest_cancel(struct sk_buff *skb, struct nlattr *start) * @start: container attribute * @maxtype: maximum attribute type to be expected * @policy: validation policy + * @validate: validation strictness * @extack: extended ACK report struct * * Validates all attributes in the nested attribute stream against the @@ -1497,12 +1631,22 @@ static inline void nla_nest_cancel(struct sk_buff *skb, struct nlattr *start) * * Returns 0 on success or a negative error code. */ -static inline int nla_validate_nested(const struct nlattr *start, int maxtype, - const struct nla_policy *policy, - struct netlink_ext_ack *extack) +static inline int __nla_validate_nested(const struct nlattr *start, int maxtype, + const struct nla_policy *policy, + unsigned int validate, + struct netlink_ext_ack *extack) +{ + return __nla_validate(nla_data(start), nla_len(start), maxtype, policy, + validate, extack); +} + +static inline int +nla_validate_nested_deprecated(const struct nlattr *start, int maxtype, + const struct nla_policy *policy, + struct netlink_ext_ack *extack) { - return nla_validate(nla_data(start), nla_len(start), maxtype, policy, - extack); + return __nla_validate_nested(start, maxtype, policy, + NL_VALIDATE_LIBERAL, extack); } /** -- cgit v1.2.3 From 3de644035446567017e952f16da2594d6bd195fc Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 26 Apr 2019 14:07:29 +0200 Subject: netlink: re-add parse/validate functions in strict mode This re-adds the parse and validate functions like nla_parse() that are now actually strict after the previous rename and were just split out to make sure everything is converted (and if not compilation of the previous patch would fail.) Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- include/net/genetlink.h | 19 +++++++++++ include/net/netlink.h | 87 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+) (limited to 'include') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 897cdba13569..68de579cfe5e 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -183,6 +183,25 @@ static inline int genlmsg_parse_deprecated(const struct nlmsghdr *nlh, policy, NL_VALIDATE_LIBERAL, extack); } +/** + * genlmsg_parse - parse attributes of a genetlink message + * @nlh: netlink message header + * @family: genetlink message family + * @tb: destination array with maxtype+1 elements + * @maxtype: maximum attribute type to be expected + * @policy: validation policy + * @extack: extended ACK report struct + */ +static inline int genlmsg_parse(const struct nlmsghdr *nlh, + const struct genl_family *family, + struct nlattr *tb[], int maxtype, + const struct nla_policy *policy, + struct netlink_ext_ack *extack) +{ + return __nlmsg_parse(nlh, family->hdrsize + GENL_HDRLEN, tb, maxtype, + policy, NL_VALIDATE_STRICT, extack); +} + /** * genl_dump_check_consistent - check if sequence is consistent and advertise if not * @cb: netlink callback structure that stores the sequence number diff --git a/include/net/netlink.h b/include/net/netlink.h index ab26a5e3558b..e4dd874412bf 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -538,6 +538,31 @@ nlmsg_next(const struct nlmsghdr *nlh, int *remaining) return (struct nlmsghdr *) ((unsigned char *) nlh + totlen); } +/** + * nla_parse - Parse a stream of attributes into a tb buffer + * @tb: destination array with maxtype+1 elements + * @maxtype: maximum attribute type to be expected + * @head: head of attribute stream + * @len: length of attribute stream + * @policy: validation policy + * @extack: extended ACK pointer + * + * Parses a stream of attributes and stores a pointer to each attribute in + * the tb array accessible via the attribute type. Attributes with a type + * exceeding maxtype will be rejected, policy must be specified, attributes + * will be validated in the strictest way possible. + * + * Returns 0 on success or a negative error code. + */ +static inline int nla_parse(struct nlattr **tb, int maxtype, + const struct nlattr *head, int len, + const struct nla_policy *policy, + struct netlink_ext_ack *extack) +{ + return __nla_parse(tb, maxtype, head, len, policy, + NL_VALIDATE_STRICT, extack); +} + /** * nla_parse_deprecated - Parse a stream of attributes into a tb buffer * @tb: destination array with maxtype+1 elements @@ -617,6 +642,27 @@ static inline int __nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen, extack); } +/** + * nlmsg_parse - parse attributes of a netlink message + * @nlh: netlink message header + * @hdrlen: length of family specific header + * @tb: destination array with maxtype+1 elements + * @maxtype: maximum attribute type to be expected + * @validate: validation strictness + * @extack: extended ACK report struct + * + * See nla_parse() + */ +static inline int nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen, + struct nlattr *tb[], int maxtype, + const struct nla_policy *policy, + struct netlink_ext_ack *extack) +{ + return __nla_parse(tb, maxtype, nlmsg_attrdata(nlh, hdrlen), + nlmsg_attrlen(nlh, hdrlen), policy, + NL_VALIDATE_STRICT, extack); +} + /** * nlmsg_parse_deprecated - parse attributes of a netlink message * @nlh: netlink message header @@ -695,6 +741,28 @@ static inline int nla_validate_deprecated(const struct nlattr *head, int len, extack); } +/** + * nla_validate - Validate a stream of attributes + * @head: head of attribute stream + * @len: length of attribute stream + * @maxtype: maximum attribute type to be expected + * @policy: validation policy + * @validate: validation strictness + * @extack: extended ACK report struct + * + * Validates all attributes in the specified attribute stream against the + * specified policy. Validation is done in strict mode. + * See documenation of struct nla_policy for more details. + * + * Returns 0 on success or a negative error code. + */ +static inline int nla_validate(const struct nlattr *head, int len, int maxtype, + const struct nla_policy *policy, + struct netlink_ext_ack *extack) +{ + return __nla_validate(head, len, maxtype, policy, NL_VALIDATE_STRICT, + extack); +} /** * nlmsg_validate_deprecated - validate a netlink message including attributes @@ -1031,6 +1099,25 @@ nla_find_nested(const struct nlattr *nla, int attrtype) return nla_find(nla_data(nla), nla_len(nla), attrtype); } +/** + * nla_parse_nested - parse nested attributes + * @tb: destination array with maxtype+1 elements + * @maxtype: maximum attribute type to be expected + * @nla: attribute containing the nested attributes + * @policy: validation policy + * @extack: extended ACK report struct + * + * See nla_parse() + */ +static inline int nla_parse_nested(struct nlattr *tb[], int maxtype, + const struct nlattr *nla, + const struct nla_policy *policy, + struct netlink_ext_ack *extack) +{ + return __nla_parse(tb, maxtype, nla_data(nla), nla_len(nla), policy, + NL_VALIDATE_STRICT, extack); +} + /** * nla_parse_nested_deprecated - parse nested attributes * @tb: destination array with maxtype+1 elements -- cgit v1.2.3 From 56738f460841761abc70347c919d5c45f6f05a42 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 26 Apr 2019 14:07:30 +0200 Subject: netlink: add strict parsing for future attributes Unfortunately, we cannot add strict parsing for all attributes, as that would break existing userspace. We currently warn about it, but that's about all we can do. For new attributes, however, the story is better: nobody is using them, so we can reject bad sizes. Also, for new attributes, we need not accept them when the policy doesn't declare their usage. David Ahern and I went back and forth on how to best encode this, and the best way we found was to have a "boundary type", from which point on new attributes have all possible validation applied, and NLA_UNSPEC is rejected. As we didn't want to add another argument to all functions that get a netlink policy, the workaround is to encode that boundary in the first entry of the policy array (which is for type 0 and thus probably not really valid anyway). I put it into the validation union for the rare possibility that somebody is actually using attribute 0, which would continue to work fine unless they tried to use the extended validation, which isn't likely. We also didn't find any in-tree users with type 0. The reason for setting the "start strict here" attribute is that we never really need to start strict from 0, which is invalid anyway (or in legacy families where that isn't true, it cannot be set to strict), so we can thus reserve the value 0 for "don't do this check" and don't have to add the tag to all policies right now. Thus, policies can now opt in to this validation, which we should do for all existing policies, at least when adding new attributes. Note that entirely *new* policies won't need to set it, as the use of that should be using nla_parse()/nlmsg_parse() etc. which anyway do fully strict validation now, regardless of this. So in effect, this patch only covers the "existing command with new attribute" case. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- include/net/netlink.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include') diff --git a/include/net/netlink.h b/include/net/netlink.h index e4dd874412bf..679f649748d4 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -299,6 +299,24 @@ struct nla_policy { }; int (*validate)(const struct nlattr *attr, struct netlink_ext_ack *extack); + /* This entry is special, and used for the attribute at index 0 + * only, and specifies special data about the policy, namely it + * specifies the "boundary type" where strict length validation + * starts for any attribute types >= this value, also, strict + * nesting validation starts here. + * + * Additionally, it means that NLA_UNSPEC is actually NLA_REJECT + * for any types >= this, so need to use NLA_MIN_LEN to get the + * previous pure { .len = xyz } behaviour. The advantage of this + * is that types not specified in the policy will be rejected. + * + * For completely new families it should be set to 1 so that the + * validation is enforced for all attributes. For existing ones + * it should be set at least when new attributes are added to + * the enum used by the policy, and be set to the new value that + * was added to enforce strict validation from thereon. + */ + u16 strict_start_type; }; }; -- cgit v1.2.3 From ef6243acb4782df587a4d7d6c310fa5b5d82684b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 26 Apr 2019 14:07:31 +0200 Subject: genetlink: optionally validate strictly/dumps Add options to strictly validate messages and dump messages, sometimes perhaps validating dump messages non-strictly may be required, so add an option for that as well. Since none of this can really be applied to existing commands, set the options everwhere using the following spatch: @@ identifier ops; expression X; @@ struct genl_ops ops[] = { ..., { .cmd = X, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, ... }, ... }; For new commands one should just not copy the .validate 'opt-out' flags and thus get strict validation. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- include/net/genetlink.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 68de579cfe5e..9292f1c588b7 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -121,6 +121,12 @@ static inline int genl_err_attr(struct genl_info *info, int err, return err; } +enum genl_validate_flags { + GENL_DONT_VALIDATE_STRICT = BIT(0), + GENL_DONT_VALIDATE_DUMP = BIT(1), + GENL_DONT_VALIDATE_DUMP_STRICT = BIT(2), +}; + /** * struct genl_ops - generic netlink operations * @cmd: command identifier @@ -141,6 +147,7 @@ struct genl_ops { u8 cmd; u8 internal_flags; u8 flags; + u8 validate; }; int genl_register_family(struct genl_family *family); -- cgit v1.2.3 From 875138f81d71af3cfa80df57e32fe9efbc4f95bc Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sun, 28 Apr 2019 19:37:11 +0200 Subject: dsa: Move tagger name into its ops structure Rather than keep a list to map a tagger ops to a name, place the name into the ops structure. This removes the hard coded list, a step towards making the taggers more dynamic. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli v2: Move name to end of structure, keeping the hot entries at the beginning. Signed-off-by: David S. Miller --- include/net/dsa.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 0cfc2f828b87..801346e31e9b 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -56,6 +56,7 @@ struct dsa_device_ops { int (*flow_dissect)(const struct sk_buff *skb, __be16 *proto, int *offset); unsigned int overhead; + const char *name; }; struct dsa_switch_tree { -- cgit v1.2.3 From 0b42f03363706609d621c31324fae5c1250f579f Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sun, 28 Apr 2019 19:37:12 +0200 Subject: dsa: Add MODULE_ALIAS to taggers in preparation to become modules When the tag drivers become modules, we will need to dynamically load them based on what the switch drivers need. Add aliases to map between the TAG protocol and the driver. In order to do this, we need the tag protocol number as something which the C pre-processor can stringinfy. Only the compiler knows the value of an enum, CPP cannot use them. So add #defines. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 43 ++++++++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 801346e31e9b..8f3d5e0825a2 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -30,20 +30,33 @@ struct phy_device; struct fixed_phy_status; struct phylink_link_state; +#define DSA_TAG_PROTO_NONE_VALUE 0 +#define DSA_TAG_PROTO_BRCM_VALUE 1 +#define DSA_TAG_PROTO_BRCM_PREPEND_VALUE 2 +#define DSA_TAG_PROTO_DSA_VALUE 3 +#define DSA_TAG_PROTO_EDSA_VALUE 4 +#define DSA_TAG_PROTO_GSWIP_VALUE 5 +#define DSA_TAG_PROTO_KSZ9477_VALUE 6 +#define DSA_TAG_PROTO_KSZ9893_VALUE 7 +#define DSA_TAG_PROTO_LAN9303_VALUE 8 +#define DSA_TAG_PROTO_MTK_VALUE 9 +#define DSA_TAG_PROTO_QCA_VALUE 10 +#define DSA_TAG_PROTO_TRAILER_VALUE 11 + enum dsa_tag_protocol { - DSA_TAG_PROTO_NONE = 0, - DSA_TAG_PROTO_BRCM, - DSA_TAG_PROTO_BRCM_PREPEND, - DSA_TAG_PROTO_DSA, - DSA_TAG_PROTO_EDSA, - DSA_TAG_PROTO_GSWIP, - DSA_TAG_PROTO_KSZ9477, - DSA_TAG_PROTO_KSZ9893, - DSA_TAG_PROTO_LAN9303, - DSA_TAG_PROTO_MTK, - DSA_TAG_PROTO_QCA, - DSA_TAG_PROTO_TRAILER, - DSA_TAG_LAST, /* MUST BE LAST */ + DSA_TAG_PROTO_NONE = DSA_TAG_PROTO_NONE_VALUE, + DSA_TAG_PROTO_BRCM = DSA_TAG_PROTO_BRCM_VALUE, + DSA_TAG_PROTO_BRCM_PREPEND = DSA_TAG_PROTO_BRCM_PREPEND_VALUE, + DSA_TAG_PROTO_DSA = DSA_TAG_PROTO_DSA_VALUE, + DSA_TAG_PROTO_EDSA = DSA_TAG_PROTO_EDSA_VALUE, + DSA_TAG_PROTO_GSWIP = DSA_TAG_PROTO_GSWIP_VALUE, + DSA_TAG_PROTO_KSZ9477 = DSA_TAG_PROTO_KSZ9477_VALUE, + DSA_TAG_PROTO_KSZ9893 = DSA_TAG_PROTO_KSZ9893_VALUE, + DSA_TAG_PROTO_LAN9303 = DSA_TAG_PROTO_LAN9303_VALUE, + DSA_TAG_PROTO_MTK = DSA_TAG_PROTO_MTK_VALUE, + DSA_TAG_PROTO_QCA = DSA_TAG_PROTO_QCA_VALUE, + DSA_TAG_PROTO_TRAILER = DSA_TAG_PROTO_TRAILER_VALUE, + DSA_TAG_LAST, /* MUST BE LAST */ }; struct packet_type; @@ -59,6 +72,10 @@ struct dsa_device_ops { const char *name; }; +#define DSA_TAG_DRIVER_ALIAS "dsa_tag-" +#define MODULE_ALIAS_DSA_TAG_DRIVER(__proto) \ + MODULE_ALIAS(DSA_TAG_DRIVER_ALIAS __stringify(__proto##_VALUE)) + struct dsa_switch_tree { struct list_head list; -- cgit v1.2.3 From 056eed2fb071c11535527fc792bdfb985a9a3e26 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sun, 28 Apr 2019 19:37:14 +0200 Subject: dsa: Add TAG protocol to tag ops In order that we can match the tagging protocol a switch driver request to the tagger, we need to know what protocol the tagger supports. Add this information to the ops structure. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli v2 More tag protocol to end of structure to keep hot members at the beginning. Signed-off-by: David S. Miller --- include/net/dsa.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 8f3d5e0825a2..720036f48fb3 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -70,6 +70,7 @@ struct dsa_device_ops { int *offset); unsigned int overhead; const char *name; + enum dsa_tag_protocol proto; }; #define DSA_TAG_DRIVER_ALIAS "dsa_tag-" -- cgit v1.2.3 From d3b8c04988ca1685700e345a82a1396df79e6291 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sun, 28 Apr 2019 19:37:15 +0200 Subject: dsa: Add boilerplate helper to register DSA tag driver modules A DSA tag driver module will need to register the tag protocols it implements with the DSA core. Add macros containing this boiler plate. The registration/unregistration code is currently just a stub. A Later patch will add the real implementation. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli v2 Fix indent of #endif Rewrite to move list pointer into a new structure v3 Move kdoc next to macro Fix THIS_MODULE indentation Signed-off-by: David S. Miller --- include/net/dsa.h | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 720036f48fb3..08ac05c014e3 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -594,4 +594,70 @@ int dsa_port_get_ethtool_phy_stats(struct dsa_port *dp, uint64_t *data); int dsa_port_get_phy_sset_count(struct dsa_port *dp); void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up); +struct dsa_tag_driver { + const struct dsa_device_ops *ops; + struct list_head list; + struct module *owner; +}; + +void dsa_tag_drivers_register(struct dsa_tag_driver *dsa_tag_driver_array[], + unsigned int count, + struct module *owner); +void dsa_tag_drivers_unregister(struct dsa_tag_driver *dsa_tag_driver_array[], + unsigned int count); + +#define dsa_tag_driver_module_drivers(__dsa_tag_drivers_array, __count) \ +static int __init dsa_tag_driver_module_init(void) \ +{ \ + dsa_tag_drivers_register(__dsa_tag_drivers_array, __count, \ + THIS_MODULE); \ + return 0; \ +} \ +module_init(dsa_tag_driver_module_init); \ + \ +static void __exit dsa_tag_driver_module_exit(void) \ +{ \ + dsa_tag_drivers_unregister(__dsa_tag_drivers_array, __count); \ +} \ +module_exit(dsa_tag_driver_module_exit) + +/** + * module_dsa_tag_drivers() - Helper macro for registering DSA tag + * drivers + * @__ops_array: Array of tag driver strucutres + * + * Helper macro for DSA tag drivers which do not do anything special + * in module init/exit. Each module may only use this macro once, and + * calling it replaces module_init() and module_exit(). + */ +#define module_dsa_tag_drivers(__ops_array) \ +dsa_tag_driver_module_drivers(__ops_array, ARRAY_SIZE(__ops_array)) + +#define DSA_TAG_DRIVER_NAME(__ops) dsa_tag_driver ## _ ## __ops + +/* Create a static structure we can build a linked list of dsa_tag + * drivers + */ +#define DSA_TAG_DRIVER(__ops) \ +static struct dsa_tag_driver DSA_TAG_DRIVER_NAME(__ops) = { \ + .ops = &__ops, \ +} + +/** + * module_dsa_tag_driver() - Helper macro for registering a single DSA tag + * driver + * @__ops: Single tag driver structures + * + * Helper macro for DSA tag drivers which do not do anything special + * in module init/exit. Each module may only use this macro once, and + * calling it replaces module_init() and module_exit(). + */ +#define module_dsa_tag_driver(__ops) \ +DSA_TAG_DRIVER(__ops); \ + \ +static struct dsa_tag_driver *dsa_tag_driver_array[] = { \ + &DSA_TAG_DRIVER_NAME(__ops) \ +}; \ +module_dsa_tag_drivers(dsa_tag_driver_array) #endif + -- cgit v1.2.3 From f81a43e8da07ccd91c4d923a44ffffaeee39dcc8 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sun, 28 Apr 2019 19:37:21 +0200 Subject: dsa: Cleanup unneeded table and make tag structures static Now that tag drivers dynamically register, we don't need the static table. Remove it. This also means the tag driver structures can be made static. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 08ac05c014e3..b550f7bb5314 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -56,7 +56,6 @@ enum dsa_tag_protocol { DSA_TAG_PROTO_MTK = DSA_TAG_PROTO_MTK_VALUE, DSA_TAG_PROTO_QCA = DSA_TAG_PROTO_QCA_VALUE, DSA_TAG_PROTO_TRAILER = DSA_TAG_PROTO_TRAILER_VALUE, - DSA_TAG_LAST, /* MUST BE LAST */ }; struct packet_type; -- cgit v1.2.3 From 316793fb2d907d4726b4977b6be26ec653827774 Mon Sep 17 00:00:00 2001 From: Eli Britstein Date: Mon, 29 Apr 2019 18:14:01 +0000 Subject: net/mlx5: E-Switch: Introduce prio tag mode Current ConnectX HW is unable to perform VLAN pop in TX path and VLAN push on RX path. To workaround that limitation untagged packets will be tagged with VLAN ID 0x000 (priority tag) and pop/push actions will be replaced by VLAN re-write actions (which are supported by the HW). Introduce prio tag mode as a pre-step to controlling the workaround behavior. Signed-off-by: Eli Britstein Reviewed-by: Oz Shlomo Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 4b37519bd6a5..eeedf3f53ed3 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -951,7 +951,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 log_max_srq_sz[0x8]; u8 log_max_qp_sz[0x8]; - u8 reserved_at_90[0xb]; + u8 reserved_at_90[0x8]; + u8 prio_tag_required[0x1]; + u8 reserved_at_99[0x2]; u8 log_max_qp[0x5]; u8 reserved_at_a0[0xb]; -- cgit v1.2.3 From 27b942fbbd3107d4e969ece133925cd646239ef4 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Mon, 29 Apr 2019 18:14:02 +0000 Subject: net/mlx5: Get rid of storing copy of device name Currently mlx5 core stores copy of the PCI device name in a mlx5_priv structure and uses pr_warn, pr_err helpers. Get rid of the copy of this name; instead store the parent device pointer that contains name as well as dma specific parameters. This also allows to use kernel's well defined dev_warn, dev_err, dev_dbg device specific print routines. This is also a preparation patch to access non PCI parent device in future. Signed-off-by: Parav Pandit Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 6c43191c0186..582a9680b182 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -56,7 +56,6 @@ enum { MLX5_BOARD_ID_LEN = 64, - MLX5_MAX_NAME_LEN = 16, }; enum { @@ -514,7 +513,6 @@ struct mlx5_rl_table { }; struct mlx5_priv { - char name[MLX5_MAX_NAME_LEN]; struct mlx5_eq_table *eq_table; /* pages stuff */ @@ -641,6 +639,7 @@ struct mlx5_fw_tracer; struct mlx5_vxlan; struct mlx5_core_dev { + struct device *device; struct pci_dev *pdev; /* sync pci state */ struct mutex pci_status_mutex; -- cgit v1.2.3 From d83eb50e29de36ddc819863ab7b9d2da58bccbd0 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Mon, 29 Apr 2019 18:14:12 +0000 Subject: net/mlx5: Add support in RDMA RX steering Add new flow steering namespace - MLX5_FLOW_NAMESPACE_RDMA_RX. Flow steering rules in this namespace are used to filter RDMA traffic. Signed-off-by: Maor Gottlieb Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 6 ++++++ include/linux/mlx5/fs.h | 1 + include/linux/mlx5/mlx5_ifc.h | 2 +- 3 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index f93a5598b942..28ebb6c93542 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1166,6 +1166,12 @@ enum mlx5_qcam_feature_groups { #define MLX5_CAP_FLOWTABLE_SNIFFER_TX_MAX(mdev, cap) \ MLX5_CAP_FLOWTABLE_MAX(mdev, flow_table_properties_nic_transmit_sniffer.cap) +#define MLX5_CAP_FLOWTABLE_RDMA_RX(mdev, cap) \ + MLX5_CAP_FLOWTABLE(mdev, flow_table_properties_nic_receive_rdma.cap) + +#define MLX5_CAP_FLOWTABLE_RDMA_RX_MAX(mdev, cap) \ + MLX5_CAP_FLOWTABLE_MAX(mdev, flow_table_properties_nic_receive_rdma.cap) + #define MLX5_CAP_ESW_FLOWTABLE(mdev, cap) \ MLX5_GET(flow_table_eswitch_cap, \ mdev->caps.hca_cur[MLX5_CAP_ESWITCH_FLOW_TABLE], cap) diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index fd91df3a4e09..e690ba0f965c 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -73,6 +73,7 @@ enum mlx5_flow_namespace_type { MLX5_FLOW_NAMESPACE_SNIFFER_RX, MLX5_FLOW_NAMESPACE_SNIFFER_TX, MLX5_FLOW_NAMESPACE_EGRESS, + MLX5_FLOW_NAMESPACE_RDMA_RX, }; enum { diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index eeedf3f53ed3..89e7194b3d97 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -598,7 +598,7 @@ struct mlx5_ifc_flow_table_nic_cap_bits { struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_receive; - u8 reserved_at_400[0x200]; + struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_receive_rdma; struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_receive_sniffer; -- cgit v1.2.3 From f6f7d6b5bd818cc84eebb55ba6bcba38cfd3b385 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Mon, 29 Apr 2019 18:14:14 +0000 Subject: net/mlx5: Add new miss flow table action Flow table supports three types of miss action: 1. Default miss action - go to default miss table according to table. 2. Go to specific table. 3. Switch domain - go to the root table of an alternative steering table domain. New table miss action was added - switch_domain. The next domain for RDMA_RX namespace is the NIC RX domain. Signed-off-by: Maor Gottlieb Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 89e7194b3d97..7d9264b282d1 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -370,7 +370,9 @@ struct mlx5_ifc_flow_table_prop_layout_bits { u8 reformat_l3_tunnel_to_l2[0x1]; u8 reformat_l2_to_l3_tunnel[0x1]; u8 reformat_and_modify_action[0x1]; - u8 reserved_at_15[0xb]; + u8 reserved_at_15[0x2]; + u8 table_miss_action_domain[0x1]; + u8 reserved_at_18[0x8]; u8 reserved_at_20[0x2]; u8 log_max_ft_size[0x6]; u8 log_max_modify_header_context[0x8]; @@ -1284,6 +1286,12 @@ enum mlx5_flow_destination_type { MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM = 0x101, }; +enum mlx5_flow_table_miss_action { + MLX5_FLOW_TABLE_MISS_ACTION_DEF, + MLX5_FLOW_TABLE_MISS_ACTION_FWD, + MLX5_FLOW_TABLE_MISS_ACTION_SWITCH_DOMAIN, +}; + struct mlx5_ifc_dest_format_struct_bits { u8 destination_type[0x8]; u8 destination_id[0x18]; -- cgit v1.2.3 From 80f09dfc237f181e92968a72d97b7a4202baa453 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Mon, 29 Apr 2019 18:14:16 +0000 Subject: net/mlx5: Eswitch, enable RoCE loopback traffic When in switchdev mode, we would like to treat loopback RoCE traffic (on eswitch manager) as RDMA and not as regular Ethernet traffic In order to enable it we add flow steering rule that forward RoCE loopback traffic to the HW RoCE filter (by adding allow rule). In addition we add RoCE address in GID index 0, which will be set in the RoCE loopback packet. Signed-off-by: Maor Gottlieb Reviewed-by: Mark Bloch Acked-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 582a9680b182..7fa95270dd59 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -512,6 +512,12 @@ struct mlx5_rl_table { struct mlx5_rl_entry *rl_entry; }; +struct mlx5_core_roce { + struct mlx5_flow_table *ft; + struct mlx5_flow_group *fg; + struct mlx5_flow_handle *allow_rule; +}; + struct mlx5_priv { struct mlx5_eq_table *eq_table; @@ -565,6 +571,7 @@ struct mlx5_priv { struct mlx5_lag *lag; struct mlx5_devcom *devcom; unsigned long pci_dev_data; + struct mlx5_core_roce roce; struct mlx5_fc_stats fc_stats; struct mlx5_rl_table rl_table; -- cgit v1.2.3 From 75d90e7def8e20b57c1de9e2b672c3bf9249da83 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Mon, 29 Apr 2019 18:14:18 +0000 Subject: net/mlx5: Geneve, Add basic Geneve encap/decap flow table capabilities Introduce support for Geneve flow specification and allow the creation of rules that are matching on basic Geneve protocol fields: VNI, OAM bit, protocol type, options length. Reviewed-by: Oz Shlomo Signed-off-by: Yevgeny Kliteynik Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 7d9264b282d1..268ac126b3bb 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -307,7 +307,11 @@ struct mlx5_ifc_flow_table_fields_supported_bits { u8 outer_gre_protocol[0x1]; u8 outer_gre_key[0x1]; u8 outer_vxlan_vni[0x1]; - u8 reserved_at_1a[0x5]; + u8 outer_geneve_vni[0x1]; + u8 outer_geneve_oam[0x1]; + u8 outer_geneve_protocol_type[0x1]; + u8 outer_geneve_opt_len[0x1]; + u8 reserved_at_1e[0x1]; u8 source_eswitch_port[0x1]; u8 inner_dmac[0x1]; @@ -480,7 +484,9 @@ struct mlx5_ifc_fte_match_set_misc_bits { u8 vxlan_vni[0x18]; u8 reserved_at_b8[0x8]; - u8 reserved_at_c0[0x20]; + u8 geneve_vni[0x18]; + u8 reserved_at_d8[0x7]; + u8 geneve_oam[0x1]; u8 reserved_at_e0[0xc]; u8 outer_ipv6_flow_label[0x14]; @@ -488,7 +494,11 @@ struct mlx5_ifc_fte_match_set_misc_bits { u8 reserved_at_100[0xc]; u8 inner_ipv6_flow_label[0x14]; - u8 reserved_at_120[0x28]; + u8 reserved_at_120[0xa]; + u8 geneve_opt_len[0x6]; + u8 geneve_protocol_type[0x10]; + + u8 reserved_at_140[0x8]; u8 bth_dst_qp[0x18]; u8 reserved_at_160[0x20]; u8 outer_esp_spi[0x20]; -- cgit v1.2.3 From b169e64a24442e02cafee1586f17fcb713fe65a6 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Mon, 29 Apr 2019 18:14:20 +0000 Subject: net/mlx5: Geneve, Add flow table capabilities for Geneve decap with TLV options Introduce specification for Geneve decap flow with encapsulation options and allow creation of rules that are matching on Geneve TLV options. Reviewed-by: Oz Shlomo Signed-off-by: Yevgeny Kliteynik Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 4 +++- include/linux/mlx5/mlx5_ifc.h | 50 ++++++++++++++++++++++++++++++++++++++----- 2 files changed, 48 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 28ebb6c93542..4ab801040e98 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1001,7 +1001,8 @@ enum { MLX5_MATCH_OUTER_HEADERS = 1 << 0, MLX5_MATCH_MISC_PARAMETERS = 1 << 1, MLX5_MATCH_INNER_HEADERS = 1 << 2, - + MLX5_MATCH_MISC_PARAMETERS_2 = 1 << 3, + MLX5_MATCH_MISC_PARAMETERS_3 = 1 << 4, }; enum { @@ -1045,6 +1046,7 @@ enum mlx5_mpls_supported_fields { }; enum mlx5_flex_parser_protos { + MLX5_FLEX_PROTO_GENEVE = 1 << 3, MLX5_FLEX_PROTO_CW_MPLS_GRE = 1 << 4, MLX5_FLEX_PROTO_CW_MPLS_UDP = 1 << 5, }; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 268ac126b3bb..6a7fc18a9fe3 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -86,6 +86,11 @@ enum { enum { MLX5_GENERAL_OBJ_TYPES_CAP_SW_ICM = (1ULL << MLX5_OBJ_TYPE_SW_ICM), + MLX5_GENERAL_OBJ_TYPES_CAP_GENEVE_TLV_OPT = (1ULL << 11), +}; + +enum { + MLX5_OBJ_TYPE_GENEVE_TLV_OPT = 0x000b, }; enum { @@ -339,7 +344,8 @@ struct mlx5_ifc_flow_table_fields_supported_bits { u8 inner_tcp_flags[0x1]; u8 reserved_at_37[0x9]; - u8 reserved_at_40[0x5]; + u8 geneve_tlv_option_0_data[0x1]; + u8 reserved_at_41[0x4]; u8 outer_first_mpls_over_udp[0x4]; u8 outer_first_mpls_over_gre[0x4]; u8 inner_first_mpls[0x4]; @@ -528,6 +534,12 @@ struct mlx5_ifc_fte_match_set_misc2_bits { u8 reserved_at_1a0[0x60]; }; +struct mlx5_ifc_fte_match_set_misc3_bits { + u8 reserved_at_0[0x120]; + u8 geneve_tlv_option_0_data[0x20]; + u8 reserved_at_140[0xc0]; +}; + struct mlx5_ifc_cmd_pas_bits { u8 pa_h[0x20]; @@ -1247,9 +1259,13 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 num_of_uars_per_page[0x20]; u8 flex_parser_protocols[0x20]; - u8 reserved_at_560[0x20]; - u8 reserved_at_580[0x3c]; + u8 max_geneve_tlv_options[0x8]; + u8 reserved_at_568[0x3]; + u8 max_geneve_tlv_option_data_len[0x5]; + u8 reserved_at_570[0x10]; + + u8 reserved_at_580[0x1c]; u8 mini_cqe_resp_stride_index[0x1]; u8 cqe_128_always[0x1]; u8 cqe_compression_128[0x1]; @@ -1283,7 +1299,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 uctx_cap[0x20]; - u8 reserved_at_6c0[0x140]; + u8 reserved_at_6c0[0x4]; + u8 flex_parser_id_geneve_tlv_option_0[0x4]; + u8 reserved_at_6c8[0x138]; }; enum mlx5_flow_destination_type { @@ -1341,7 +1359,9 @@ struct mlx5_ifc_fte_match_param_bits { struct mlx5_ifc_fte_match_set_misc2_bits misc_parameters_2; - u8 reserved_at_800[0x800]; + struct mlx5_ifc_fte_match_set_misc3_bits misc_parameters_3; + + u8 reserved_at_a00[0x600]; }; enum { @@ -4850,6 +4870,7 @@ enum { MLX5_QUERY_FLOW_GROUP_OUT_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS = 0x1, MLX5_QUERY_FLOW_GROUP_OUT_MATCH_CRITERIA_ENABLE_INNER_HEADERS = 0x2, MLX5_QUERY_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS_2 = 0x3, + MLX5_QUERY_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS_3 = 0x4, }; struct mlx5_ifc_query_flow_group_out_bits { @@ -9545,6 +9566,20 @@ struct mlx5_ifc_sw_icm_bits { u8 sw_icm_start_addr[0x40]; u8 reserved_at_c0[0x140]; +}; + +struct mlx5_ifc_geneve_tlv_option_bits { + u8 modify_field_select[0x40]; + + u8 reserved_at_40[0x18]; + u8 geneve_option_fte_index[0x8]; + + u8 option_class[0x10]; + u8 option_type[0x8]; + u8 reserved_at_78[0x3]; + u8 option_data_length[0x5]; + + u8 reserved_at_80[0x180]; }; struct mlx5_ifc_create_umem_in_bits { @@ -9589,6 +9624,11 @@ struct mlx5_ifc_create_sw_icm_in_bits { struct mlx5_ifc_sw_icm_bits sw_icm; }; +struct mlx5_ifc_create_geneve_tlv_option_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr; + struct mlx5_ifc_geneve_tlv_option_bits geneve_tlv_opt; +}; + struct mlx5_ifc_mtrc_string_db_param_bits { u8 string_db_base_address[0x20]; -- cgit v1.2.3 From f1f86d09ca7e35fb161a47bc54ec9cb2f4fe42d8 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Mon, 15 Apr 2019 16:43:14 -0400 Subject: netfilter: nf_tables: relocate header content to consumer The nf_tables.h header is used in a lot of files, but it turns out that there is only one actual user of nft_expr_clone(). Hence we relocate that function to be with the one consumer of it and avoid having to process it with CPP for all the other files. This will also enable a reduction in the other headers that the nf_tables.h itself has to include just to be stand-alone, hence a pending further significant reduction in the CPP content that needs to get processed for each netfilter file. Note that the explicit "inline" has been dropped as part of this relocation. In similar changes to this, I believe Dave has asked this be done, so we free up gcc to make the choice of whether to inline or not. Signed-off-by: Paul Gortmaker Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 2d5a0a1a87b8..706f744f7308 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -806,23 +806,6 @@ void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr); int nft_expr_dump(struct sk_buff *skb, unsigned int attr, const struct nft_expr *expr); -static inline int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src) -{ - int err; - - if (src->ops->clone) { - dst->ops = src->ops; - err = src->ops->clone(dst, src); - if (err < 0) - return err; - } else { - memcpy(dst, src, src->ops->size); - } - - __module_get(src->ops->type->owner); - return 0; -} - /** * struct nft_rule - nf_tables rule * -- cgit v1.2.3 From a4cb98f32c9046fea28bcb4979182f2ff731a27a Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Mon, 15 Apr 2019 16:43:16 -0400 Subject: netfilter: nf_tables: drop include of module.h from nf_tables.h Ideally, header files under include/linux shouldn't be adding includes of other headers, in anticipation of their consumers, but just the headers needed for the header itself to pass parsing with CPP. The module.h is particularly bad in this sense, as it itself does include a whole bunch of other headers, due to the complexity of module support. Since nf_tables.h is not going into a module struct looking for specific fields, we can just let it know that module is a struct, just like about 60 other include/linux headers already do. Signed-off-by: Paul Gortmaker Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 706f744f7308..5b8624ae4a27 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -2,7 +2,6 @@ #ifndef _NET_NF_TABLES_H #define _NET_NF_TABLES_H -#include #include #include #include @@ -13,6 +12,8 @@ #include #include +struct module; + #define NFT_JUMP_STACK_SIZE 16 struct nft_pktinfo { -- cgit v1.2.3 From 8f14c99c7edaaba9c0bb1727d44db6ebf157cc61 Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Sun, 7 Apr 2019 08:14:20 -0700 Subject: netfilter: conntrack: limit sysctl setting for boolean options We use the zero and one to limit the boolean options setting. After this patch we only set 0 or 1 to boolean options for nf conntrack sysctl. Signed-off-by: Tonghao Zhang Signed-off-by: Pablo Neira Ayuso --- include/net/netns/conntrack.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h index f19b53130bf7..806454e767bf 100644 --- a/include/net/netns/conntrack.h +++ b/include/net/netns/conntrack.h @@ -24,9 +24,9 @@ struct nf_generic_net { struct nf_tcp_net { unsigned int timeouts[TCP_CONNTRACK_TIMEOUT_MAX]; - unsigned int tcp_loose; - unsigned int tcp_be_liberal; - unsigned int tcp_max_retrans; + int tcp_loose; + int tcp_be_liberal; + int tcp_max_retrans; }; enum udp_conntrack { -- cgit v1.2.3 From e1f172e162c0a11721f1188f12e5b4c3f9f80de6 Mon Sep 17 00:00:00 2001 From: Flavio Leitner Date: Wed, 17 Apr 2019 11:46:14 -0300 Subject: netfilter: use macros to create module aliases. Each NAT helper creates a module alias which follows a pattern. Use macros for consistency. Signed-off-by: Flavio Leitner Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_helper.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h index ec52a8dc32fd..28bd4569aa64 100644 --- a/include/net/netfilter/nf_conntrack_helper.h +++ b/include/net/netfilter/nf_conntrack_helper.h @@ -15,6 +15,10 @@ #include #include +#define NF_NAT_HELPER_NAME(name) "ip_nat_" name +#define MODULE_ALIAS_NF_NAT_HELPER(name) \ + MODULE_ALIAS(NF_NAT_HELPER_NAME(name)) + struct module; enum nf_ct_helper_flags { -- cgit v1.2.3 From 08010a21602678932894c5e87014a282af0079cf Mon Sep 17 00:00:00 2001 From: Flavio Leitner Date: Wed, 17 Apr 2019 11:46:15 -0300 Subject: netfilter: add API to manage NAT helpers. The API allows a conntrack helper to indicate its corresponding NAT helper which then can be loaded and reference counted. Signed-off-by: Flavio Leitner Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_helper.h | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h index 28bd4569aa64..44b5a00a9c64 100644 --- a/include/net/netfilter/nf_conntrack_helper.h +++ b/include/net/netfilter/nf_conntrack_helper.h @@ -15,7 +15,8 @@ #include #include -#define NF_NAT_HELPER_NAME(name) "ip_nat_" name +#define NF_NAT_HELPER_PREFIX "ip_nat_" +#define NF_NAT_HELPER_NAME(name) NF_NAT_HELPER_PREFIX name #define MODULE_ALIAS_NF_NAT_HELPER(name) \ MODULE_ALIAS(NF_NAT_HELPER_NAME(name)) @@ -58,6 +59,8 @@ struct nf_conntrack_helper { unsigned int queue_num; /* length of userspace private data stored in nf_conn_help->data */ u16 data_len; + /* name of NAT helper module */ + char nat_mod_name[NF_CT_HELPER_NAME_LEN]; }; /* Must be kept in sync with the classes defined by helpers */ @@ -157,4 +160,21 @@ nf_ct_helper_expectfn_find_by_symbol(const void *symbol); extern struct hlist_head *nf_ct_helper_hash; extern unsigned int nf_ct_helper_hsize; +struct nf_conntrack_nat_helper { + struct list_head list; + char mod_name[NF_CT_HELPER_NAME_LEN]; /* module name */ + struct module *module; /* pointer to self */ +}; + +#define NF_CT_NAT_HELPER_INIT(name) \ + { \ + .mod_name = NF_NAT_HELPER_NAME(name), \ + .module = THIS_MODULE \ + } + +void nf_nat_helper_register(struct nf_conntrack_nat_helper *nat); +void nf_nat_helper_unregister(struct nf_conntrack_nat_helper *nat); +int nf_nat_helper_try_module_get(const char *name, u16 l3num, + u8 protonum); +void nf_nat_helper_put(struct nf_conntrack_helper *helper); #endif /*_NF_CONNTRACK_HELPER_H*/ -- cgit v1.2.3 From 3087c3f7c23b9c54b956ee5519e97a42413ddf22 Mon Sep 17 00:00:00 2001 From: Brett Mastbergen Date: Wed, 24 Apr 2019 10:48:44 -0400 Subject: netfilter: nft_ct: Add ct id support The 'id' key returns the unique id of the conntrack entry as returned by nf_ct_get_id(). Signed-off-by: Brett Mastbergen Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 061bb3eb20c3..f0cf7b0f4f35 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -967,6 +967,7 @@ enum nft_socket_keys { * @NFT_CT_SRC_IP6: conntrack layer 3 protocol source (IPv6 address) * @NFT_CT_DST_IP6: conntrack layer 3 protocol destination (IPv6 address) * @NFT_CT_TIMEOUT: connection tracking timeout policy assigned to conntrack + * @NFT_CT_ID: conntrack id */ enum nft_ct_keys { NFT_CT_STATE, @@ -993,6 +994,7 @@ enum nft_ct_keys { NFT_CT_SRC_IP6, NFT_CT_DST_IP6, NFT_CT_TIMEOUT, + NFT_CT_ID, __NFT_CT_MAX }; #define NFT_CT_MAX (__NFT_CT_MAX - 1) -- cgit v1.2.3 From 33162e9a0590f16e1b21be764caae517e2bb310c Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 28 Apr 2019 21:45:43 +0300 Subject: net: dsa: Store vlan_filtering as a property of dsa_port This allows drivers to query the VLAN setting imposed by the bridge driver directly from DSA, instead of keeping their own state based on the .port_vlan_filtering callback. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index b550f7bb5314..79a87913126c 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -161,6 +161,7 @@ struct dsa_port { const char *mac; struct device_node *dn; unsigned int ageing_time; + bool vlan_filtering; u8 stp_state; struct net_device *bridge_dev; struct devlink_port devlink_port; -- cgit v1.2.3 From 8f5d16f638b9a1adf544a7f8cfd11ac1c01c6e25 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 28 Apr 2019 21:45:44 +0300 Subject: net: dsa: Be aware of switches where VLAN filtering is a global setting On some switches, the action of whether to parse VLAN frame headers and use that information for ingress admission is configurable, but not per port. Such is the case for the Broadcom BCM53xx and the NXP SJA1105 families, for example. In that case, DSA can prevent the bridge core from trying to apply different VLAN filtering settings on net devices that belong to the same switch. Signed-off-by: Vladimir Oltean Suggested-by: Florian Fainelli Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 79a87913126c..aab3c2029edd 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -228,6 +228,11 @@ struct dsa_switch { /* Number of switch port queues */ unsigned int num_tx_queues; + /* Disallow bridge core from requesting different VLAN awareness + * settings on ports if not hardware-supported + */ + bool vlan_filtering_is_global; + unsigned long *bitmap; unsigned long _bitmap; -- cgit v1.2.3 From 145746765f06a3dbc7869c81d0165b3ab96f935a Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 28 Apr 2019 21:45:48 +0300 Subject: net: dsa: Keep the vlan_filtering setting in dsa_switch if it's global The current behavior is not as obvious as one would assume (which is that, if the driver set vlan_filtering_is_global = 1, then checking any dp->vlan_filtering would yield the same result). Only the ports which are actively enslaved into a bridge would have vlan_filtering set. This makes it tricky for drivers to check what the global state is. So fix this and make the struct dsa_switch hold this global setting. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/net/dsa.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index aab3c2029edd..4e0f7e9c5aa1 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -233,6 +233,11 @@ struct dsa_switch { */ bool vlan_filtering_is_global; + /* In case vlan_filtering_is_global is set, the VLAN awareness state + * should be retrieved from here and not from the per-port settings. + */ + bool vlan_filtering; + unsigned long *bitmap; unsigned long _bitmap; -- cgit v1.2.3 From cf2d45f5ba9a730df6ec190e0345cecde80b1d8b Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 28 Apr 2019 21:45:49 +0300 Subject: net: dsa: Add helper function to retrieve VLAN awareness setting Since different types of hardware may or may not support this setting per-port, DSA keeps it either in dsa_switch or in dsa_port. While drivers may know the characteristics of their hardware and retrieve it from the correct place without the need of helpers, it is cumbersone to find out an unambigous answer from generic DSA code. Signed-off-by: Vladimir Oltean Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/net/dsa.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 4e0f7e9c5aa1..1e6b4efc80b9 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -305,6 +305,16 @@ static inline unsigned int dsa_upstream_port(struct dsa_switch *ds, int port) return dsa_towards_port(ds, cpu_dp->ds->index, cpu_dp->index); } +static inline bool dsa_port_is_vlan_filtering(const struct dsa_port *dp) +{ + const struct dsa_switch *ds = dp->ds; + + if (ds->vlan_filtering_is_global) + return ds->vlan_filtering; + else + return dp->vlan_filtering; +} + typedef int dsa_fdb_dump_cb_t(const unsigned char *addr, u16 vid, bool is_static, void *data); struct dsa_switch_ops { -- cgit v1.2.3 From 93e86b3bc842c159a60e6987444bf3952adcd4db Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sun, 28 Apr 2019 02:56:23 +0200 Subject: net: dsa: Remove legacy probing support Now that all drivers can be probed using more traditional methods, remove the legacy probe code. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/net/dsa.h | 23 ----------------------- 1 file changed, 23 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 1e6b4efc80b9..18db7b8e7a8e 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -318,15 +318,6 @@ static inline bool dsa_port_is_vlan_filtering(const struct dsa_port *dp) typedef int dsa_fdb_dump_cb_t(const unsigned char *addr, u16 vid, bool is_static, void *data); struct dsa_switch_ops { -#if IS_ENABLED(CONFIG_NET_DSA_LEGACY) - /* - * Legacy probing. - */ - const char *(*probe)(struct device *dsa_dev, - struct device *host_dev, int sw_addr, - void **priv); -#endif - enum dsa_tag_protocol (*get_tag_protocol)(struct dsa_switch *ds, int port); @@ -516,20 +507,6 @@ struct dsa_switch_driver { const struct dsa_switch_ops *ops; }; -#if IS_ENABLED(CONFIG_NET_DSA_LEGACY) -/* Legacy driver registration */ -void register_switch_driver(struct dsa_switch_driver *type); -void unregister_switch_driver(struct dsa_switch_driver *type); -struct mii_bus *dsa_host_dev_to_mii_bus(struct device *dev); - -#else -static inline void register_switch_driver(struct dsa_switch_driver *type) { } -static inline void unregister_switch_driver(struct dsa_switch_driver *type) { } -static inline struct mii_bus *dsa_host_dev_to_mii_bus(struct device *dev) -{ - return NULL; -} -#endif struct net_device *dsa_dev_to_net_device(struct device *dev); /* Keep inline for faster access in hot path */ -- cgit v1.2.3 From b587bdaf5f820cf7dac2c1b351db97bf98e1f427 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Mon, 29 Apr 2019 12:41:45 +0300 Subject: devlink: Change devlink health locking mechanism The devlink health reporters create/destroy and user commands currently use the devlink->lock as a locking mechanism. Different reporters have different rules in the driver and are being created/destroyed during different stages of driver load/unload/running. So during execution of a reporter recover the flow can go through another reporter's destroy and create. Such flow leads to deadlock trying to lock a mutex already held. With the new locking mechanism the different reporters share mutex lock only to protect access to shared reporters list. Added refcount per reporter, to protect the reporters from destroy while being used. Signed-off-by: Moshe Shemesh Signed-off-by: Jiri Pirko Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/devlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 4f5e41613503..1c4adfb4195a 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -32,6 +32,7 @@ struct devlink { struct list_head region_list; u32 snapshot_id; struct list_head reporter_list; + struct mutex reporters_lock; /* protects reporter_list */ struct devlink_dpipe_headers *dpipe_headers; const struct devlink_ops *ops; struct device *dev; -- cgit v1.2.3 From a3d43c0d56f1b94e74963a2fbadfb70126d92213 Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Mon, 29 Apr 2019 15:48:31 -0700 Subject: taprio: Add support adding an admin schedule The IEEE 802.1Q-2018 defines two "types" of schedules, the "Oper" (from operational?) and "Admin" ones. Up until now, 'taprio' only had support for the "Oper" one, added when the qdisc is created. This adds support for the "Admin" one, which allows the .change() operation to be supported. Just for clarification, some quick (and dirty) definitions, the "Oper" schedule is the currently (as in this instant) running one, and it's read-only. The "Admin" one is the one that the system configurator has installed, it can be changed, and it will be "promoted" to "Oper" when it's 'base-time' is reached. The idea behing this patch is that calling something like the below, (after taprio is already configured with an initial schedule): $ tc qdisc change taprio dev IFACE parent root \ base-time X \ sched-entry \ ... Will cause a new admin schedule to be created and programmed to be "promoted" to "Oper" at instant X. If an "Admin" schedule already exists, it will be overwritten with the new parameters. Up until now, there was some code that was added to ease the support of changing a single entry of a schedule, but was ultimately unused. Now, that we have support for "change" with more well thought semantics, updating a single entry seems to be less useful. So we remove what is in practice dead code, and return a "not supported" error if the user tries to use it. If changing a single entry would make the user's life easier we may ressurrect this idea, but at this point, removing it simplifies the code. For now, only the schedule specific bits are allowed to be added for a new schedule, that means that 'clockid', 'num_tc', 'map' and 'queues' cannot be modified. Example: $ tc qdisc change dev IFACE parent root handle 100 taprio \ base-time $BASE_TIME \ sched-entry S 00 500000 \ sched-entry S 0f 500000 \ clockid CLOCK_TAI The only change in the netlink API introduced by this change is the introduction of an "admin" type in the response to a dump request, that type allows userspace to separate the "oper" schedule from the "admin" schedule. If userspace doesn't support the "admin" type, it will only display the "oper" schedule. Signed-off-by: Vinicius Costa Gomes Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 7ee74c3474bf..d59770d0eb84 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -1148,6 +1148,16 @@ enum { #define TCA_TAPRIO_SCHED_MAX (__TCA_TAPRIO_SCHED_MAX - 1) +/* The format for the admin sched (dump only): + * [TCA_TAPRIO_SCHED_ADMIN_SCHED] + * [TCA_TAPRIO_ATTR_SCHED_BASE_TIME] + * [TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST] + * [TCA_TAPRIO_ATTR_SCHED_ENTRY] + * [TCA_TAPRIO_ATTR_SCHED_ENTRY_CMD] + * [TCA_TAPRIO_ATTR_SCHED_ENTRY_GATES] + * [TCA_TAPRIO_ATTR_SCHED_ENTRY_INTERVAL] + */ + enum { TCA_TAPRIO_ATTR_UNSPEC, TCA_TAPRIO_ATTR_PRIOMAP, /* struct tc_mqprio_qopt */ @@ -1156,6 +1166,7 @@ enum { TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY, /* single entry */ TCA_TAPRIO_ATTR_SCHED_CLOCKID, /* s32 */ TCA_TAPRIO_PAD, + TCA_TAPRIO_ATTR_ADMIN_SCHED, /* The admin sched, only used in dump */ __TCA_TAPRIO_ATTR_MAX, }; -- cgit v1.2.3 From 6ca6a6654225f3cd001304d33429c817e0c0b85f Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Mon, 29 Apr 2019 15:48:32 -0700 Subject: taprio: Add support for setting the cycle-time manually IEEE 802.1Q-2018 defines that a the cycle-time of a schedule may be overridden, so the schedule is truncated to a determined "width". Signed-off-by: Vinicius Costa Gomes Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index d59770d0eb84..7a32276838e1 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -1167,6 +1167,7 @@ enum { TCA_TAPRIO_ATTR_SCHED_CLOCKID, /* s32 */ TCA_TAPRIO_PAD, TCA_TAPRIO_ATTR_ADMIN_SCHED, /* The admin sched, only used in dump */ + TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME, /* s64 */ __TCA_TAPRIO_ATTR_MAX, }; -- cgit v1.2.3 From c25031e993440debdd530278ce2171ce477df029 Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Mon, 29 Apr 2019 15:48:33 -0700 Subject: taprio: Add support for cycle-time-extension IEEE 802.1Q-2018 defines the concept of a cycle-time-extension, so the last entry of a schedule before the start of a new schedule can be extended, so "too-short" entries can be avoided. Signed-off-by: Vinicius Costa Gomes Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 7a32276838e1..8b2f993cbb77 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -1168,6 +1168,7 @@ enum { TCA_TAPRIO_PAD, TCA_TAPRIO_ATTR_ADMIN_SCHED, /* The admin sched, only used in dump */ TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME, /* s64 */ + TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION, /* s64 */ __TCA_TAPRIO_ATTR_MAX, }; -- cgit v1.2.3 From 8425c41d1ef762cc15d9501d7117f009a79f3fe9 Mon Sep 17 00:00:00 2001 From: Esben Haabendal Date: Tue, 30 Apr 2019 09:17:49 +0200 Subject: net: ll_temac: Extend support to non-device-tree platforms Support initialization with platdata, so the driver can be used on non-device-tree platforms. For currently supported device-tree platforms, the driver should behave as before. Signed-off-by: Esben Haabendal Signed-off-by: David S. Miller --- include/linux/platform_data/xilinx-ll-temac.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 include/linux/platform_data/xilinx-ll-temac.h (limited to 'include') diff --git a/include/linux/platform_data/xilinx-ll-temac.h b/include/linux/platform_data/xilinx-ll-temac.h new file mode 100644 index 000000000000..82e2f80648b0 --- /dev/null +++ b/include/linux/platform_data/xilinx-ll-temac.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_XILINX_LL_TEMAC_H +#define __LINUX_XILINX_LL_TEMAC_H + +#include +#include + +struct ll_temac_platform_data { + bool txcsum; /* Enable/disable TX checksum */ + bool rxcsum; /* Enable/disable RX checksum */ + u8 mac_addr[ETH_ALEN]; /* MAC address (6 bytes) */ + /* Clock frequency for input to MDIO clock generator */ + u32 mdio_clk_freq; + unsigned long long mdio_bus_id; /* Unique id for MDIO bus */ + int phy_addr; /* Address of the PHY to connect to */ + phy_interface_t phy_interface; /* PHY interface mode */ +}; + +#endif /* __LINUX_XILINX_LL_TEMAC_H */ -- cgit v1.2.3 From a3246dc41aa3c9d799478ccc8dac5d19c509a923 Mon Sep 17 00:00:00 2001 From: Esben Haabendal Date: Tue, 30 Apr 2019 09:17:51 +0200 Subject: net: ll_temac: Add support for non-native register endianness Replace the powerpc specific MMIO register access functions with the generic big-endian mmio access functions, and add support for little-endian access depending on configuration. Big-endian access is maintained as the default, but little-endian can be configured in device-tree binding or in platform data. The temac_ior()/temac_iow() functions are replaced with macro wrappers to avoid modifying existing code more than necessary. Signed-off-by: Esben Haabendal Signed-off-by: David S. Miller --- include/linux/platform_data/xilinx-ll-temac.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/platform_data/xilinx-ll-temac.h b/include/linux/platform_data/xilinx-ll-temac.h index 82e2f80648b0..af87927abab3 100644 --- a/include/linux/platform_data/xilinx-ll-temac.h +++ b/include/linux/platform_data/xilinx-ll-temac.h @@ -14,6 +14,8 @@ struct ll_temac_platform_data { unsigned long long mdio_bus_id; /* Unique id for MDIO bus */ int phy_addr; /* Address of the PHY to connect to */ phy_interface_t phy_interface; /* PHY interface mode */ + bool reg_little_endian; /* Little endian TEMAC register access */ + bool dma_little_endian; /* Little endian DMA register access */ }; #endif /* __LINUX_XILINX_LL_TEMAC_H */ -- cgit v1.2.3 From f14f5c11f051ca4a41e65017d94408e5e702ba9d Mon Sep 17 00:00:00 2001 From: Esben Haabendal Date: Tue, 30 Apr 2019 09:17:54 +0200 Subject: net: ll_temac: Support indirect_mutex share within TEMAC IP Indirect register access goes through a DCR bus bridge, which allows only one outstanding transaction. And to make matters worse, each TEMAC IP block contains two Ethernet interfaces, and although they seem to have separate registers for indirect access, they actually share the registers. Or to be more specific, MSW, LSW and CTL registers are physically shared between Ethernet interfaces in same TEMAC IP, with RDY register being (almost) specificic to the Ethernet interface. The 0x10000 bit in RDY reflects combined bus ready state though. So we need to take care to synchronize not only within a single device, but also between devices in same TEMAC IP. This commit allows to do that with legacy platform devices. For OF devices, the xlnx,compound parent of the temac node should be used to find siblings, and setup a shared indirect_mutex between them. I will leave this work to somebody else, as I don't have hardware to test that. No regression is introduced by that, as before this commit using two Ethernet interfaces in same TEMAC block is simply broken. Signed-off-by: Esben Haabendal Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/platform_data/xilinx-ll-temac.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/platform_data/xilinx-ll-temac.h b/include/linux/platform_data/xilinx-ll-temac.h index af87927abab3..b0b8238a9b7d 100644 --- a/include/linux/platform_data/xilinx-ll-temac.h +++ b/include/linux/platform_data/xilinx-ll-temac.h @@ -16,6 +16,12 @@ struct ll_temac_platform_data { phy_interface_t phy_interface; /* PHY interface mode */ bool reg_little_endian; /* Little endian TEMAC register access */ bool dma_little_endian; /* Little endian DMA register access */ + /* Pre-initialized mutex to use for synchronizing indirect + * register access. When using both interfaces of a single + * TEMAC IP block, the same mutex should be passed here, as + * they share the same DCR bus bridge. + */ + struct mutex *indirect_mutex; }; #endif /* __LINUX_XILINX_LL_TEMAC_H */ -- cgit v1.2.3 From 7e97a194aca03c6ff86f84e46e196f5c9ed5c32c Mon Sep 17 00:00:00 2001 From: Esben Haabendal Date: Tue, 30 Apr 2019 09:17:58 +0200 Subject: net: ll_temac: Allow configuration of IRQ coalescing This allows custom setup of IRQ coalescing for platforms using legacy platform_device. The irq timeout and count parameters can be used for tuning cpu load vs. latency. I have maintained the 0x00000400 bit in TX_CHNL_CTRL. It is specified as unused in the documentation I have available. It does not make any difference in the hardware I have available, so it is left in to not risk breaking other platforms where it might be used. Signed-off-by: Esben Haabendal Signed-off-by: David S. Miller --- include/linux/platform_data/xilinx-ll-temac.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/platform_data/xilinx-ll-temac.h b/include/linux/platform_data/xilinx-ll-temac.h index b0b8238a9b7d..368530f98176 100644 --- a/include/linux/platform_data/xilinx-ll-temac.h +++ b/include/linux/platform_data/xilinx-ll-temac.h @@ -22,6 +22,11 @@ struct ll_temac_platform_data { * they share the same DCR bus bridge. */ struct mutex *indirect_mutex; + /* DMA channel control setup */ + u8 tx_irq_timeout; /* TX Interrupt Delay Time-out */ + u8 tx_irq_count; /* TX Interrupt Coalescing Threshold Count */ + u8 rx_irq_timeout; /* RX Interrupt Delay Time-out */ + u8 rx_irq_count; /* RX Interrupt Coalescing Threshold Count */ }; #endif /* __LINUX_XILINX_LL_TEMAC_H */ -- cgit v1.2.3 From 91a40a48d52d13fbde3239d5839335cabd9a4eae Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Wed, 1 May 2019 03:21:05 +0000 Subject: net/mlx5: Fix broken hca cap offset The cited commit broke the offsets of hca cap struct, fix it. While at it, cleanup a white space introduced by the same commit. Fixes: b169e64a2444 ("net/mlx5: Geneve, Add flow table capabilities for Geneve decap with TLV options") Reported-by: Qian Cai Cc: Yevgeny Kliteynik Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 6a7fc18a9fe3..6b2e6b710ac0 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1265,7 +1265,7 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 max_geneve_tlv_option_data_len[0x5]; u8 reserved_at_570[0x10]; - u8 reserved_at_580[0x1c]; + u8 reserved_at_580[0x3c]; u8 mini_cqe_resp_stride_index[0x1]; u8 cqe_128_always[0x1]; u8 cqe_compression_128[0x1]; @@ -9566,7 +9566,7 @@ struct mlx5_ifc_sw_icm_bits { u8 sw_icm_start_addr[0x40]; u8 reserved_at_c0[0x140]; -}; +}; struct mlx5_ifc_geneve_tlv_option_bits { u8 modify_field_select[0x40]; -- cgit v1.2.3 From 0e1a2a3e6e7d37cea9f8586f6d7745b539147d9f Mon Sep 17 00:00:00 2001 From: Erez Alfasi Date: Tue, 5 Mar 2019 15:42:23 +0200 Subject: ethtool: Add SFF-8436 and SFF-8636 max EEPROM length definitions Added max EEPROM length defines for ethtool usage: #define ETH_MODULE_SFF_8636_MAX_LEN 640 #define ETH_MODULE_SFF_8436_MAX_LEN 640 These definitions are used to determine the EEPROM data length when reading high eeprom pages. For example, SFF-8636 EEPROM data from page 03h needs to be stored at data[512] - data[639]. Signed-off-by: Erez Alfasi Signed-off-by: Saeed Mahameed --- include/uapi/linux/ethtool.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 818ad368b586..3534ce157ae9 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1712,6 +1712,9 @@ static inline int ethtool_validate_duplex(__u8 duplex) #define ETH_MODULE_SFF_8436 0x4 #define ETH_MODULE_SFF_8436_LEN 256 +#define ETH_MODULE_SFF_8636_MAX_LEN 640 +#define ETH_MODULE_SFF_8436_MAX_LEN 640 + /* Reset flags */ /* The reset() operation must clear the flags for the components which * were actually reset. On successful return, the flags indicate the -- cgit v1.2.3 From a708fb7b1f8dcc7a8ed949839958cd5d812dd939 Mon Sep 17 00:00:00 2001 From: Erez Alfasi Date: Thu, 21 Mar 2019 15:02:13 +0200 Subject: net/mlx5e: ethtool, Add support for EEPROM high pages query Add the support to read additional EEPROM information from high pages. Information for modules such as SFF-8436 and SFF-8636: 1) Application select table 2) User writable EEPROM 3) Thresholds and alarms Signed-off-by: Erez Alfasi Signed-off-by: Saeed Mahameed --- include/linux/mlx5/port.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h index 64e78394fc9c..de9a272c9f3d 100644 --- a/include/linux/mlx5/port.h +++ b/include/linux/mlx5/port.h @@ -60,6 +60,7 @@ enum mlx5_an_status { #define MLX5_I2C_ADDR_LOW 0x50 #define MLX5_I2C_ADDR_HIGH 0x51 #define MLX5_EEPROM_PAGE_LENGTH 256 +#define MLX5_EEPROM_HIGH_PAGE_LENGTH 128 enum mlx5e_link_mode { MLX5E_1000BASE_CX_SGMII = 0, -- cgit v1.2.3 From c9bbfb378bc35fcd0b51e4a8950bd50447f39832 Mon Sep 17 00:00:00 2001 From: Bodong Wang Date: Fri, 12 Apr 2019 16:14:03 -0500 Subject: net/mlx5: Remove unused mlx5_query_nic_vport_vlans mlx5_query_nic_vport_vlans() is not used anymore. Hence remove it. This patch doesn't change any functionality. Signed-off-by: Bodong Wang Reviewed-by: Parav Pandit Signed-off-by: Saeed Mahameed --- include/linux/mlx5/vport.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index 0eef548b9946..3d1c6cdbbba7 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -118,10 +118,6 @@ int mlx5_modify_nic_vport_promisc(struct mlx5_core_dev *mdev, int promisc_uc, int promisc_mc, int promisc_all); -int mlx5_query_nic_vport_vlans(struct mlx5_core_dev *dev, - u16 vport, - u16 vlans[], - int *size); int mlx5_modify_nic_vport_vlans(struct mlx5_core_dev *dev, u16 vlans[], int list_size); -- cgit v1.2.3 From 6f4e02193c9a9ea54dd3151cf97489fa787cd0e6 Mon Sep 17 00:00:00 2001 From: Bodong Wang Date: Thu, 18 Apr 2019 18:24:15 -0500 Subject: net/mlx5: E-Switch, Use atomic rep state to serialize state change When the state of rep was introduced, it was also designed to prevent duplicate unloading of the same rep. Considering the following two flows when an eswitch manager is at switchdev mode with n VF reps loaded. +--------------------------------------+--------------------------------+ | cpu-0 | cpu-1 | | -------- | -------- | | mlx5_ib_remove | mlx5_eswitch_disable_sriov | | mlx5_ib_unregister_vport_reps | esw_offloads_cleanup | | mlx5_eswitch_unregister_vport_reps | esw_offloads_unload_all_reps | | __unload_reps_all_vport | __unload_reps_all_vport | +--------------------------------------+--------------------------------+ These two flows will try to unload the same rep. Per original design, once one flow unloads the rep, the state moves to REGISTERED. The 2nd flow will no longer needs to do the unload and bails out. However, as read and write of the state is not atomic, when 1st flow is doing the unload, the state is still LOADED, 2nd flow is able to do the same unload action. Kernel crash will happen. To solve this, driver should do atomic test-and-set for the state. So that only one flow can change the rep state from LOADED to REGISTERED, and proceed to do the actual unloading. Since the state is changing to atomic type, all other read/write should be atomic action as well. Fixes: f121e0ea9586 (net/mlx5: E-Switch, Add state to eswitch vport representors) Signed-off-by: Bodong Wang Reviewed-by: Parav Pandit Reviewed-by: Vu Pham Signed-off-by: Saeed Mahameed --- include/linux/mlx5/eswitch.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h index 96d8435421de..0ca77dd1429c 100644 --- a/include/linux/mlx5/eswitch.h +++ b/include/linux/mlx5/eswitch.h @@ -35,7 +35,7 @@ struct mlx5_eswitch_rep_if { void (*unload)(struct mlx5_eswitch_rep *rep); void *(*get_proto_dev)(struct mlx5_eswitch_rep *rep); void *priv; - u8 state; + atomic_t state; }; struct mlx5_eswitch_rep { -- cgit v1.2.3 From 8b952747844526cef50fa2e0ae903f586e3cb2e4 Mon Sep 17 00:00:00 2001 From: Nicolas Ferre Date: Fri, 3 May 2019 12:36:58 +0200 Subject: net: macb: shrink macb_platform_data structure This structure was used intensively for machine specific values when DT was not used. Since the removal of AVR32 from the kernel, this structure is only used for passing clocks from PCI macb wrapper, all other fields being 0. All other known platforms use DT. Remove the leftovers but make sure that PCI macb still works as expected by using default values: - phydev->irq is set to PHY_POLL by mdiobus_alloc() - mii_bus->phy_mask is cleared while allocating it - bp->phy_interface is set to PHY_INTERFACE_MODE_MII if mode not found in DT. This simplifies driver probe path and particularly phy handling. Signed-off-by: Nicolas Ferre Signed-off-by: David S. Miller --- include/linux/platform_data/macb.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include') diff --git a/include/linux/platform_data/macb.h b/include/linux/platform_data/macb.h index 7815d50c26ff..2bc51b822956 100644 --- a/include/linux/platform_data/macb.h +++ b/include/linux/platform_data/macb.h @@ -12,19 +12,10 @@ /** * struct macb_platform_data - platform data for MACB Ethernet - * @phy_mask: phy mask passed when register the MDIO bus - * within the driver - * @phy_irq_pin: PHY IRQ - * @is_rmii: using RMII interface? - * @rev_eth_addr: reverse Ethernet address byte order * @pclk: platform clock * @hclk: AHB clock */ struct macb_platform_data { - u32 phy_mask; - int phy_irq_pin; - u8 is_rmii; - u8 rev_eth_addr; struct clk *pclk; struct clk *hclk; }; -- cgit v1.2.3 From 554aae35007e49f533d3d10e788295f7141725bc Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 2 May 2019 23:23:29 +0300 Subject: lib: Add support for generic packing operations This provides an unified API for accessing register bit fields regardless of memory layout. The basic unit of data for these API functions is the u64. The process of transforming an u64 from native CPU encoding into the peripheral's encoding is called 'pack', and transforming it from peripheral to native CPU encoding is 'unpack'. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/linux/packing.h | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 include/linux/packing.h (limited to 'include') diff --git a/include/linux/packing.h b/include/linux/packing.h new file mode 100644 index 000000000000..54667735cc67 --- /dev/null +++ b/include/linux/packing.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright (c) 2016-2018, NXP Semiconductors + * Copyright (c) 2018-2019, Vladimir Oltean + */ +#ifndef _LINUX_PACKING_H +#define _LINUX_PACKING_H + +#include +#include + +#define QUIRK_MSB_ON_THE_RIGHT BIT(0) +#define QUIRK_LITTLE_ENDIAN BIT(1) +#define QUIRK_LSW32_IS_FIRST BIT(2) + +enum packing_op { + PACK, + UNPACK, +}; + +/** + * packing - Convert numbers (currently u64) between a packed and an unpacked + * format. Unpacked means laid out in memory in the CPU's native + * understanding of integers, while packed means anything else that + * requires translation. + * + * @pbuf: Pointer to a buffer holding the packed value. + * @uval: Pointer to an u64 holding the unpacked value. + * @startbit: The index (in logical notation, compensated for quirks) where + * the packed value starts within pbuf. Must be larger than, or + * equal to, endbit. + * @endbit: The index (in logical notation, compensated for quirks) where + * the packed value ends within pbuf. Must be smaller than, or equal + * to, startbit. + * @op: If PACK, then uval will be treated as const pointer and copied (packed) + * into pbuf, between startbit and endbit. + * If UNPACK, then pbuf will be treated as const pointer and the logical + * value between startbit and endbit will be copied (unpacked) to uval. + * @quirks: A bit mask of QUIRK_LITTLE_ENDIAN, QUIRK_LSW32_IS_FIRST and + * QUIRK_MSB_ON_THE_RIGHT. + * + * Return: 0 on success, EINVAL or ERANGE if called incorrectly. Assuming + * correct usage, return code may be discarded. + * If op is PACK, pbuf is modified. + * If op is UNPACK, uval is modified. + */ +int packing(void *pbuf, u64 *uval, int startbit, int endbit, size_t pbuflen, + enum packing_op op, u8 quirks); + +#endif -- cgit v1.2.3 From 8aa9ebccae87621d997707e4f25e53fddd7e30e4 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 2 May 2019 23:23:30 +0300 Subject: net: dsa: Introduce driver for NXP SJA1105 5-port L2 switch At this moment the following is supported: * Link state management through phylib * Autonomous L2 forwarding managed through iproute2 bridge commands. IP termination must be done currently through the master netdevice, since the switch is unmanaged at this point and using DSA_TAG_PROTO_NONE. Signed-off-by: Vladimir Oltean Signed-off-by: Georg Waibel Acked-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/dsa/sja1105.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 include/linux/dsa/sja1105.h (limited to 'include') diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h new file mode 100644 index 000000000000..30559d1d0e1b --- /dev/null +++ b/include/linux/dsa/sja1105.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 + * Copyright (c) 2019, Vladimir Oltean + */ + +/* Included by drivers/net/dsa/sja1105/sja1105.h */ + +#ifndef _NET_DSA_SJA1105_H +#define _NET_DSA_SJA1105_H + +/* The switch can only be convinced to stay in unmanaged mode and not trap any + * link-local traffic by actually telling it to filter frames sent at the + * 00:00:00:00:00:00 destination MAC. + */ +#define SJA1105_LINKLOCAL_FILTER_A 0x000000000000ull +#define SJA1105_LINKLOCAL_FILTER_A_MASK 0xFFFFFFFFFFFFull +#define SJA1105_LINKLOCAL_FILTER_B 0x000000000000ull +#define SJA1105_LINKLOCAL_FILTER_B_MASK 0xFFFFFFFFFFFFull + +#endif /* _NET_DSA_SJA1105_H */ -- cgit v1.2.3 From bf5bc3ce8a8f32a0d45b6820ede8f9fc3e9c23df Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 2 May 2019 23:23:33 +0300 Subject: ether: Add dedicated Ethertype for pseudo-802.1Q DSA tagging There are two possible utilizations so far: - Switch devices that don't support a native insertion/extraction header on the CPU port may still enjoy the benefits of port isolation with a custom VLAN tag. For this, they need to have a customizable TPID in hardware and a new Ethertype to distinguish between real 802.1Q traffic and the private tags used for port separation. - Switches that don't support the deactivation of VLAN awareness, but still want to have a mode in which they accept all traffic, including frames that are tagged with a VLAN not configured on their ports, may use this as a fake to trick the hardware into thinking that the TPID for VLAN is something other than 0x8100. What follows after the ETH_P_DSA_8021Q EtherType is a regular VLAN header (TCI), however there is no other EtherType that can be used for this purpose and doesn't already have a well-defined meaning. ETH_P_8021AD, ETH_P_QINQ1, ETH_P_QINQ2 and ETH_P_QINQ3 expect that another follow-up VLAN tag is present, which is not the case here. Signed-off-by: Vladimir Oltean Suggested-by: Andrew Lunn Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/uapi/linux/if_ether.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index 3a45b4ad71a3..3158ba672b72 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -109,6 +109,7 @@ #define ETH_P_QINQ2 0x9200 /* deprecated QinQ VLAN [ NOT AN OFFICIALLY REGISTERED ID ] */ #define ETH_P_QINQ3 0x9300 /* deprecated QinQ VLAN [ NOT AN OFFICIALLY REGISTERED ID ] */ #define ETH_P_EDSA 0xDADA /* Ethertype DSA [ NOT AN OFFICIALLY REGISTERED ID ] */ +#define ETH_P_DSA_8021Q 0xDADB /* Fake VLAN Header for DSA [ NOT AN OFFICIALLY REGISTERED ID ] */ #define ETH_P_IFE 0xED3E /* ForCES inter-FE LFB type */ #define ETH_P_AF_IUCV 0xFBFB /* IBM af_iucv [ NOT AN OFFICIALLY REGISTERED ID ] */ -- cgit v1.2.3 From 6666cebc5e306f49a25bd20aa8c1cb8ef8950df5 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 2 May 2019 23:23:34 +0300 Subject: net: dsa: sja1105: Add support for VLAN operations VLAN filtering cannot be properly disabled in SJA1105. So in order to emulate the "no VLAN awareness" behavior (not dropping traffic that is tagged with a VID that isn't configured on the port), we need to hack another switch feature: programmable TPID (which is 0x8100 for 802.1Q). We are reprogramming the TPID to a bogus value which leaves the switch thinking that all traffic is untagged, and therefore accepts it. Under a vlan_filtering bridge, the proper TPID of ETH_P_8021Q is installed again, and the switch starts identifying 802.1Q-tagged traffic. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/dsa/sja1105.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h index 30559d1d0e1b..abf3977e34fd 100644 --- a/include/linux/dsa/sja1105.h +++ b/include/linux/dsa/sja1105.h @@ -7,6 +7,10 @@ #ifndef _NET_DSA_SJA1105_H #define _NET_DSA_SJA1105_H +#include + +#define ETH_P_SJA1105 ETH_P_DSA_8021Q + /* The switch can only be convinced to stay in unmanaged mode and not trap any * link-local traffic by actually telling it to filter frames sent at the * 00:00:00:00:00:00 destination MAC. -- cgit v1.2.3 From a27415decd84dac124c6185f1184b6c779d0a5ab Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 1 May 2019 00:10:50 +0200 Subject: net: dsa: mv88e6xxx: Pass interrupt number in platform data Allow an interrupt number to be passed in the platform data. The driver will then use it if not zero, otherwise it will poll for interrupts. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/platform_data/mv88e6xxx.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/platform_data/mv88e6xxx.h b/include/linux/platform_data/mv88e6xxx.h index 963730b44aea..21452a9365e1 100644 --- a/include/linux/platform_data/mv88e6xxx.h +++ b/include/linux/platform_data/mv88e6xxx.h @@ -13,6 +13,7 @@ struct dsa_mv88e6xxx_pdata { unsigned int enabled_ports; struct net_device *netdev; u32 eeprom_len; + int irq; }; #endif -- cgit v1.2.3 From 141b6b2ad75d92770240de3af98d55c41ce7cd18 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 1 May 2019 19:56:59 -0700 Subject: net: add a generic tracepoint for TX queue timeout Although devlink health report does a nice job on reporting TX timeout and other NIC errors, unfortunately it requires drivers to support it but currently only mlx5 has implemented it. Before other drivers could catch up, it is useful to have a generic tracepoint to monitor this kind of TX timeout. We have been suffering TX timeout with different drivers, we plan to start to monitor it with rasdaemon which just needs a new tracepoint. Sample output: ksoftirqd/1-16 [001] ..s2 144.043173: net_dev_xmit_timeout: dev=ens3 driver=e1000 queue=0 Cc: Eran Ben Elisha Cc: Jiri Pirko Signed-off-by: Cong Wang Acked-by: Jiri Pirko Reviewed-by: Eran Ben Elisha Signed-off-by: David S. Miller --- include/trace/events/net.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include') diff --git a/include/trace/events/net.h b/include/trace/events/net.h index 1efd7d9b25fe..2399073c3afc 100644 --- a/include/trace/events/net.h +++ b/include/trace/events/net.h @@ -95,6 +95,29 @@ TRACE_EVENT(net_dev_xmit, __get_str(name), __entry->skbaddr, __entry->len, __entry->rc) ); +TRACE_EVENT(net_dev_xmit_timeout, + + TP_PROTO(struct net_device *dev, + int queue_index), + + TP_ARGS(dev, queue_index), + + TP_STRUCT__entry( + __string( name, dev->name ) + __string( driver, netdev_drivername(dev)) + __field( int, queue_index ) + ), + + TP_fast_assign( + __assign_str(name, dev->name); + __assign_str(driver, netdev_drivername(dev)); + __entry->queue_index = queue_index; + ), + + TP_printk("dev=%s driver=%s queue=%d", + __get_str(name), __get_str(driver), __entry->queue_index) +); + DECLARE_EVENT_CLASS(net_dev_template, TP_PROTO(struct sk_buff *skb), -- cgit v1.2.3 From 22c0ef6b1475aef4765efc4aa764b8580018123c Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 1 May 2019 21:34:43 +0200 Subject: net: phy: improve pause handling When probing the phy device we set sym and asym pause in the "supported" bitmap (unless the PHY tells us otherwise). However we don't know yet whether the MAC supports pause. Simply copying phy->supported to phy->advertising will trigger advertising pause, and that's not what we want. Therefore add phy_advertise_supported() that copies all modes but doesn't touch the pause bits. In phy_support_(a)sym_pause we shouldn't set any bits in the supported bitmap because we may set a bit the PHY intentionally disabled. Effective pause support should be the AND-combined PHY and MAC pause capabilities. If the MAC supports everything, then it's only relevant what the PHY supports. If MAC supports sym pause only, then we have to clear the asym bit in phydev->supported. Copy the pause flags only and don't touch the modes, because a driver may have intentionally removed a mode from phydev->advertising. Signed-off-by: Heiner Kallweit Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/phy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 0f9552b17ee7..4a03f8a46d33 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1154,6 +1154,7 @@ void phy_request_interrupt(struct phy_device *phydev); void phy_print_status(struct phy_device *phydev); int phy_set_max_speed(struct phy_device *phydev, u32 max_speed); void phy_remove_link_mode(struct phy_device *phydev, u32 link_mode); +void phy_advertise_supported(struct phy_device *phydev); void phy_support_sym_pause(struct phy_device *phydev); void phy_support_asym_pause(struct phy_device *phydev); void phy_set_sym_pause(struct phy_device *phydev, bool rx, bool tx, -- cgit v1.2.3 From f24098f80748ea95d53603a7bb7954a41bb3ca1b Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 1 May 2019 22:14:21 +0200 Subject: net: phy: improve resuming from hibernation I got an interesting report [0] that after resuming from hibernation the link has 100Mbps instead of 1Gbps. Reason is that another OS has been used whilst Linux was hibernated. And this OS speeds down the link due to WoL. Therefore, when resuming, we shouldn't expect that what the PHY advertises is what it did when hibernating. Easiest way to do this is removing state PHY_RESUMING. Instead always go via PHY_UP that configures PHY advertisement. [0] https://bugzilla.kernel.org/show_bug.cgi?id=202851 Signed-off-by: Heiner Kallweit Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/phy.h | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 4a03f8a46d33..073fb151b5a9 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -308,13 +308,7 @@ struct phy_device *mdiobus_scan(struct mii_bus *bus, int addr); * * HALTED: PHY is up, but no polling or interrupts are done. Or * PHY is in an error state. - * - * - phy_start moves to RESUMING - * - * RESUMING: PHY was halted, but now wants to run again. - * - If we are forcing, or aneg is done, timer moves to RUNNING - * - If aneg is not done, timer moves to AN - * - phy_stop moves to HALTED + * - phy_start moves to UP */ enum phy_state { PHY_DOWN = 0, @@ -324,7 +318,6 @@ enum phy_state { PHY_RUNNING, PHY_NOLINK, PHY_FORCING, - PHY_RESUMING }; /** -- cgit v1.2.3 From b424e432e770d6dd572765459d5b6a96a19c5286 Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Thu, 2 May 2019 16:15:10 +0200 Subject: netlink: add validation of NLA_F_NESTED flag Add new validation flag NL_VALIDATE_NESTED which adds three consistency checks of NLA_F_NESTED_FLAG: - the flag is set on attributes with NLA_NESTED{,_ARRAY} policy - the flag is not set on attributes with other policies except NLA_UNSPEC - the flag is set on attribute passed to nla_parse_nested() Signed-off-by: Michal Kubecek v2: change error messages to mention NLA_F_NESTED explicitly Reviewed-by: Johannes Berg Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/net/netlink.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netlink.h b/include/net/netlink.h index 679f649748d4..395b4406f4b0 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -401,6 +401,8 @@ struct nl_info { * are enforced going forward. * @NL_VALIDATE_STRICT_ATTRS: strict attribute policy parsing (e.g. * U8, U16, U32 must have exact size, etc.) + * @NL_VALIDATE_NESTED: Check that NLA_F_NESTED is set for NLA_NESTED(_ARRAY) + * and unset for other policies. */ enum netlink_validation { NL_VALIDATE_LIBERAL = 0, @@ -408,6 +410,7 @@ enum netlink_validation { NL_VALIDATE_MAXTYPE = BIT(1), NL_VALIDATE_UNSPEC = BIT(2), NL_VALIDATE_STRICT_ATTRS = BIT(3), + NL_VALIDATE_NESTED = BIT(4), }; #define NL_VALIDATE_DEPRECATED_STRICT (NL_VALIDATE_TRAILING |\ @@ -415,7 +418,8 @@ enum netlink_validation { #define NL_VALIDATE_STRICT (NL_VALIDATE_TRAILING |\ NL_VALIDATE_MAXTYPE |\ NL_VALIDATE_UNSPEC |\ - NL_VALIDATE_STRICT_ATTRS) + NL_VALIDATE_STRICT_ATTRS |\ + NL_VALIDATE_NESTED) int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, struct nlmsghdr *, @@ -1132,6 +1136,11 @@ static inline int nla_parse_nested(struct nlattr *tb[], int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { + if (!(nla->nla_type & NLA_F_NESTED)) { + NL_SET_ERR_MSG_ATTR(extack, nla, "NLA_F_NESTED is missing"); + return -EINVAL; + } + return __nla_parse(tb, maxtype, nla_data(nla), nla_len(nla), policy, NL_VALIDATE_STRICT, extack); } -- cgit v1.2.3 From 0f457a36626fa94026e483836fbf29e451434567 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 30 Apr 2019 07:45:48 -0700 Subject: ipv4: Move cached routes to fib_nh_common While the cached routes, nh_pcpu_rth_output and nh_rth_input, are IPv4 specific, a later patch wants to make them accessible for IPv6 nexthops with IPv4 routes using a fib6_nh. Move the cached routes from fib_nh to fib_nh_common and update references. Initialization of the cached entries is moved to fib_nh_common_init, and free is moved to fib_nh_common_release. Change in location only, from fib_nh up to fib_nh_common; no functional change intended. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/ip_fib.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 772a9e61bd84..659c5081c40b 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -96,6 +96,10 @@ struct fib_nh_common { int nhc_weight; atomic_t nhc_upper_bound; + + /* v4 specific, but allows fib6_nh with v4 routes */ + struct rtable __rcu * __percpu *nhc_pcpu_rth_output; + struct rtable __rcu *nhc_rth_input; }; struct fib_nh { @@ -107,8 +111,6 @@ struct fib_nh { #endif __be32 nh_saddr; int nh_saddr_genid; - struct rtable __rcu * __percpu *nh_pcpu_rth_output; - struct rtable __rcu *nh_rth_input; struct fnhe_hash_bucket __rcu *nh_exceptions; #define fib_nh_family nh_common.nhc_family #define fib_nh_dev nh_common.nhc_dev -- cgit v1.2.3 From a5995e7107eb3d9c44744d3cf47d49fabfef01f5 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 30 Apr 2019 07:45:50 -0700 Subject: ipv4: Move exception bucket to nh_common Similar to the cached routes, make IPv4 exceptions accessible when using an IPv6 nexthop struct with IPv4 routes. Simplify the exception functions by passing in fib_nh_common since that is all it needs, and then cleanup the call sites that have extraneous fib_nh conversions. As with the cached routes this is a change in location only, from fib_nh up to fib_nh_common; no functional change intended. Signed-off-by: David Ahern Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/ip_fib.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 659c5081c40b..d0e28f4ab099 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -100,6 +100,7 @@ struct fib_nh_common { /* v4 specific, but allows fib6_nh with v4 routes */ struct rtable __rcu * __percpu *nhc_pcpu_rth_output; struct rtable __rcu *nhc_rth_input; + struct fnhe_hash_bucket __rcu *nhc_exceptions; }; struct fib_nh { @@ -111,7 +112,6 @@ struct fib_nh { #endif __be32 nh_saddr; int nh_saddr_genid; - struct fnhe_hash_bucket __rcu *nh_exceptions; #define fib_nh_family nh_common.nhc_family #define fib_nh_dev nh_common.nhc_dev #define fib_nh_oif nh_common.nhc_oif -- cgit v1.2.3 From f80c5dad7b6467b884c445ffea45985793b4b2d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Paulo=20Rechi=20Vita?= Date: Thu, 2 May 2019 10:01:52 +0800 Subject: Bluetooth: Ignore CC events not matching the last HCI command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit makes the kernel not send the next queued HCI command until a command complete arrives for the last HCI command sent to the controller. This change avoids a problem with some buggy controllers (seen on two SKUs of QCA9377) that send an extra command complete event for the previous command after the kernel had already sent a new HCI command to the controller. The problem was reproduced when starting an active scanning procedure, where an extra command complete event arrives for the LE_SET_RANDOM_ADDR command. When this happends the kernel ends up not processing the command complete for the following commmand, LE_SET_SCAN_PARAM, and ultimately behaving as if a passive scanning procedure was being performed, when in fact controller is performing an active scanning procedure. This makes it impossible to discover BLE devices as no device found events are sent to userspace. This problem is reproducible on 100% of the attempts on the affected controllers. The extra command complete event can be seen at timestamp 27.420131 on the btmon logs bellow. Bluetooth monitor ver 5.50 = Note: Linux version 5.0.0+ (x86_64) 0.352340 = Note: Bluetooth subsystem version 2.22 0.352343 = New Index: 80:C5:F2:8F:87:84 (Primary,USB,hci0) [hci0] 0.352344 = Open Index: 80:C5:F2:8F:87:84 [hci0] 0.352345 = Index Info: 80:C5:F2:8F:87:84 (Qualcomm) [hci0] 0.352346 @ MGMT Open: bluetoothd (privileged) version 1.14 {0x0001} 0.352347 @ MGMT Open: btmon (privileged) version 1.14 {0x0002} 0.352366 @ MGMT Open: btmgmt (privileged) version 1.14 {0x0003} 27.302164 @ MGMT Command: Start Discovery (0x0023) plen 1 {0x0003} [hci0] 27.302310 Address type: 0x06 LE Public LE Random < HCI Command: LE Set Random Address (0x08|0x0005) plen 6 #1 [hci0] 27.302496 Address: 15:60:F2:91:B2:24 (Non-Resolvable) > HCI Event: Command Complete (0x0e) plen 4 #2 [hci0] 27.419117 LE Set Random Address (0x08|0x0005) ncmd 1 Status: Success (0x00) < HCI Command: LE Set Scan Parameters (0x08|0x000b) plen 7 #3 [hci0] 27.419244 Type: Active (0x01) Interval: 11.250 msec (0x0012) Window: 11.250 msec (0x0012) Own address type: Random (0x01) Filter policy: Accept all advertisement (0x00) > HCI Event: Command Complete (0x0e) plen 4 #4 [hci0] 27.420131 LE Set Random Address (0x08|0x0005) ncmd 1 Status: Success (0x00) < HCI Command: LE Set Scan Enable (0x08|0x000c) plen 2 #5 [hci0] 27.420259 Scanning: Enabled (0x01) Filter duplicates: Enabled (0x01) > HCI Event: Command Complete (0x0e) plen 4 #6 [hci0] 27.420969 LE Set Scan Parameters (0x08|0x000b) ncmd 1 Status: Success (0x00) > HCI Event: Command Complete (0x0e) plen 4 #7 [hci0] 27.421983 LE Set Scan Enable (0x08|0x000c) ncmd 1 Status: Success (0x00) @ MGMT Event: Command Complete (0x0001) plen 4 {0x0003} [hci0] 27.422059 Start Discovery (0x0023) plen 1 Status: Success (0x00) Address type: 0x06 LE Public LE Random @ MGMT Event: Discovering (0x0013) plen 2 {0x0003} [hci0] 27.422067 Address type: 0x06 LE Public LE Random Discovery: Enabled (0x01) @ MGMT Event: Discovering (0x0013) plen 2 {0x0002} [hci0] 27.422067 Address type: 0x06 LE Public LE Random Discovery: Enabled (0x01) @ MGMT Event: Discovering (0x0013) plen 2 {0x0001} [hci0] 27.422067 Address type: 0x06 LE Public LE Random Discovery: Enabled (0x01) Signed-off-by: João Paulo Rechi Vita Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index fbba43e9bef5..9a5330eed794 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -282,6 +282,7 @@ enum { HCI_FORCE_BREDR_SMP, HCI_FORCE_STATIC_ADDR, HCI_LL_RPA_RESOLUTION, + HCI_CMD_PENDING, __HCI_NUM_FLAGS, }; -- cgit v1.2.3 From 47d3d7fdb10a21c223036b58bd70ffdc24a472c4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 3 May 2019 08:24:44 -0700 Subject: ip6: fix skb leak in ip6frag_expire_frag_queue() Since ip6frag_expire_frag_queue() now pulls the head skb from frag queue, we should no longer use skb_get(), since this leads to an skb leak. Stefan Bader initially reported a problem in 4.4.stable [1] caused by the skb_get(), so this patch should also fix this issue. 296583.091021] kernel BUG at /build/linux-6VmqmP/linux-4.4.0/net/core/skbuff.c:1207! [296583.091734] Call Trace: [296583.091749] [] __pskb_pull_tail+0x50/0x350 [296583.091764] [] _decode_session6+0x26a/0x400 [296583.091779] [] __xfrm_decode_session+0x39/0x50 [296583.091795] [] icmpv6_route_lookup+0xf0/0x1c0 [296583.091809] [] icmp6_send+0x5e1/0x940 [296583.091823] [] ? __netif_receive_skb+0x18/0x60 [296583.091838] [] ? netif_receive_skb_internal+0x32/0xa0 [296583.091858] [] ? ixgbe_clean_rx_irq+0x594/0xac0 [ixgbe] [296583.091876] [] ? nf_ct_net_exit+0x50/0x50 [nf_defrag_ipv6] [296583.091893] [] icmpv6_send+0x21/0x30 [296583.091906] [] ip6_expire_frag_queue+0xe0/0x120 [296583.091921] [] nf_ct_frag6_expire+0x1f/0x30 [nf_defrag_ipv6] [296583.091938] [] call_timer_fn+0x37/0x140 [296583.091951] [] ? nf_ct_net_exit+0x50/0x50 [nf_defrag_ipv6] [296583.091968] [] run_timer_softirq+0x234/0x330 [296583.091982] [] __do_softirq+0x109/0x2b0 Fixes: d4289fcc9b16 ("net: IP6 defrag: use rbtrees for IPv6 defrag") Signed-off-by: Eric Dumazet Reported-by: Stefan Bader Cc: Peter Oskolkov Cc: Florian Westphal Signed-off-by: David S. Miller --- include/net/ipv6_frag.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/ipv6_frag.h b/include/net/ipv6_frag.h index 28aa9b30aece..1f77fb4dc79d 100644 --- a/include/net/ipv6_frag.h +++ b/include/net/ipv6_frag.h @@ -94,7 +94,6 @@ ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq) goto out; head->dev = dev; - skb_get(head); spin_unlock(&fq->q.lock); icmpv6_send(head, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0); -- cgit v1.2.3 From 9b3040a6aafd7898ece7fc7efcbca71e42aa8069 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sun, 5 May 2019 11:16:20 -0700 Subject: ipv4: Define __ipv4_neigh_lookup_noref when CONFIG_INET is disabled Define __ipv4_neigh_lookup_noref to return NULL when CONFIG_INET is disabled. Fixes: 4b2a2bfeb3f0 ("neighbor: Call __ipv4_neigh_lookup_noref in neigh_xmit") Reported-by: kbuild test robot Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/arp.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/net/arp.h b/include/net/arp.h index 977aabfcdc03..c8f580a0e6b1 100644 --- a/include/net/arp.h +++ b/include/net/arp.h @@ -18,6 +18,7 @@ static inline u32 arp_hashfn(const void *pkey, const struct net_device *dev, u32 return val * hash_rnd[0]; } +#ifdef CONFIG_INET static inline struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key) { if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) @@ -25,6 +26,13 @@ static inline struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev return ___neigh_lookup_noref(&arp_tbl, neigh_key_eq32, arp_hashfn, &key, dev); } +#else +static inline +struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key) +{ + return NULL; +} +#endif static inline struct neighbour *__ipv4_neigh_lookup(struct net_device *dev, u32 key) { -- cgit v1.2.3 From 522e4077e8dcdfc5b8e96469d3bc2324bc5d6466 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Sun, 28 Apr 2019 15:12:19 +0800 Subject: netfilter: slightly optimize nf_inet_addr_mask using 64bit computation to slightly optimize nf_inet_addr_mask Signed-off-by: Li RongQing Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index a7252f3baeb0..996bc247ef6e 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -41,10 +41,19 @@ static inline void nf_inet_addr_mask(const union nf_inet_addr *a1, union nf_inet_addr *result, const union nf_inet_addr *mask) { +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 + const unsigned long *ua = (const unsigned long *)a1; + unsigned long *ur = (unsigned long *)result; + const unsigned long *um = (const unsigned long *)mask; + + ur[0] = ua[0] & um[0]; + ur[1] = ua[1] & um[1]; +#else result->all[0] = a1->all[0] & mask->all[0]; result->all[1] = a1->all[1] & mask->all[1]; result->all[2] = a1->all[2] & mask->all[2]; result->all[3] = a1->all[3] & mask->all[3]; +#endif } int netfilter_init(void); -- cgit v1.2.3 From a7a7be6087b07563490725f61f4dbf4826f099e2 Mon Sep 17 00:00:00 2001 From: Pieter Jansen van Vuuren Date: Sat, 4 May 2019 04:46:16 -0700 Subject: net/sched: add sample action to the hardware intermediate representation Add sample action to the hardware intermediate representation model which would subsequently allow it to be used by drivers for offload. Signed-off-by: Pieter Jansen van Vuuren Reviewed-by: Jakub Kicinski Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/flow_offload.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index d035183c8d03..9a6c89b2c2bb 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -118,6 +118,7 @@ enum flow_action_id { FLOW_ACTION_MARK, FLOW_ACTION_WAKE, FLOW_ACTION_QUEUE, + FLOW_ACTION_SAMPLE, }; /* This is mirroring enum pedit_header_type definition for easy mapping between @@ -157,6 +158,12 @@ struct flow_action_entry { u32 index; u8 vf; } queue; + struct { /* FLOW_ACTION_SAMPLE */ + struct psample_group *psample_group; + u32 rate; + u32 trunc_size; + bool truncate; + } sample; }; }; -- cgit v1.2.3 From f00cbf1968145afbae385a867a66c69845e30711 Mon Sep 17 00:00:00 2001 From: Pieter Jansen van Vuuren Date: Sat, 4 May 2019 04:46:17 -0700 Subject: net/sched: use the hardware intermediate representation for matchall Extends matchall offload to make use of the hardware intermediate representation. More specifically, this patch moves the native TC actions in cls_matchall offload to the newer flow_action representation. This ultimately allows us to avoid a direct dependency on native TC actions for matchall. Signed-off-by: Pieter Jansen van Vuuren Reviewed-by: Jakub Kicinski Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index d5e7a1af346f..c852ed502cc6 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -789,6 +789,7 @@ enum tc_matchall_command { struct tc_cls_matchall_offload { struct tc_cls_common_offload common; enum tc_matchall_command command; + struct flow_rule *rule; struct tcf_exts *exts; unsigned long cookie; }; -- cgit v1.2.3 From ab79af32b0a5606324ce04c0f04a0d2f90b94464 Mon Sep 17 00:00:00 2001 From: Pieter Jansen van Vuuren Date: Sat, 4 May 2019 04:46:18 -0700 Subject: mlxsw: use intermediate representation for matchall offload Updates the Mellanox spectrum driver to use the newer intermediate representation for flow actions in matchall offloads. Signed-off-by: Pieter Jansen van Vuuren Reviewed-by: Jakub Kicinski Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/flow_offload.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 9a6c89b2c2bb..3bf67dd64be5 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -177,6 +177,17 @@ static inline bool flow_action_has_entries(const struct flow_action *action) return action->num_entries; } +/** + * flow_action_has_one_action() - check if exactly one action is present + * @action: tc filter flow offload action + * + * Returns true if exactly one action is present. + */ +static inline bool flow_offload_has_one_action(const struct flow_action *action) +{ + return action->num_entries == 1; +} + #define flow_action_for_each(__i, __act, __actions) \ for (__i = 0, __act = &(__actions)->entries[0]; __i < (__actions)->num_entries; __act = &(__actions)->entries[++__i]) -- cgit v1.2.3 From dfcb19f0fae3d07f9c56f6efe2c9bbebef6826c9 Mon Sep 17 00:00:00 2001 From: Pieter Jansen van Vuuren Date: Sat, 4 May 2019 04:46:20 -0700 Subject: net/sched: remove unused functions for matchall offload Cleanup unused functions and variables after porting to the newer intermediate representation. Signed-off-by: Pieter Jansen van Vuuren Reviewed-by: Jakub Kicinski Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 25 ------------------------- 1 file changed, 25 deletions(-) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index c852ed502cc6..2d0470661277 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -371,30 +371,6 @@ static inline bool tcf_exts_has_actions(struct tcf_exts *exts) #endif } -/** - * tcf_exts_has_one_action - check if exactly one action is present - * @exts: tc filter extensions handle - * - * Returns true if exactly one action is present. - */ -static inline bool tcf_exts_has_one_action(struct tcf_exts *exts) -{ -#ifdef CONFIG_NET_CLS_ACT - return exts->nr_actions == 1; -#else - return false; -#endif -} - -static inline struct tc_action *tcf_exts_first_action(struct tcf_exts *exts) -{ -#ifdef CONFIG_NET_CLS_ACT - return exts->actions[0]; -#else - return NULL; -#endif -} - /** * tcf_exts_exec - execute tc filter extensions * @skb: socket buffer @@ -790,7 +766,6 @@ struct tc_cls_matchall_offload { struct tc_cls_common_offload common; enum tc_matchall_command command; struct flow_rule *rule; - struct tcf_exts *exts; unsigned long cookie; }; -- cgit v1.2.3 From fa762da94d9860f584c909621d1f8ccbe24c5d5e Mon Sep 17 00:00:00 2001 From: Pieter Jansen van Vuuren Date: Sat, 4 May 2019 04:46:21 -0700 Subject: net/sched: move police action structures to header Move tcf_police_params, tcf_police and tc_police_compat structures to a header. Making them usable to other code for example drivers that would offload police actions to hardware. Signed-off-by: Pieter Jansen van Vuuren Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/tc_act/tc_police.h | 70 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 include/net/tc_act/tc_police.h (limited to 'include') diff --git a/include/net/tc_act/tc_police.h b/include/net/tc_act/tc_police.h new file mode 100644 index 000000000000..8b9ef3664262 --- /dev/null +++ b/include/net/tc_act/tc_police.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __NET_TC_POLICE_H +#define __NET_TC_POLICE_H + +#include + +struct tcf_police_params { + int tcfp_result; + u32 tcfp_ewma_rate; + s64 tcfp_burst; + u32 tcfp_mtu; + s64 tcfp_mtu_ptoks; + struct psched_ratecfg rate; + bool rate_present; + struct psched_ratecfg peak; + bool peak_present; + struct rcu_head rcu; +}; + +struct tcf_police { + struct tc_action common; + struct tcf_police_params __rcu *params; + + spinlock_t tcfp_lock ____cacheline_aligned_in_smp; + s64 tcfp_toks; + s64 tcfp_ptoks; + s64 tcfp_t_c; +}; + +#define to_police(pc) ((struct tcf_police *)pc) + +/* old policer structure from before tc actions */ +struct tc_police_compat { + u32 index; + int action; + u32 limit; + u32 burst; + u32 mtu; + struct tc_ratespec rate; + struct tc_ratespec peakrate; +}; + +static inline bool is_tcf_police(const struct tc_action *act) +{ +#ifdef CONFIG_NET_CLS_ACT + if (act->ops && act->ops->id == TCA_ID_POLICE) + return true; +#endif + return false; +} + +static inline u64 tcf_police_rate_bytes_ps(const struct tc_action *act) +{ + struct tcf_police *police = to_police(act); + struct tcf_police_params *params; + + params = rcu_dereference_bh(police->params); + return params->rate.rate_bytes_ps; +} + +static inline s64 tcf_police_tcfp_burst(const struct tc_action *act) +{ + struct tcf_police *police = to_police(act); + struct tcf_police_params *params; + + params = rcu_dereference_bh(police->params); + return params->tcfp_burst; +} + +#endif /* __NET_TC_POLICE_H */ -- cgit v1.2.3 From 8c8cfc6ed274e6fb86f00b53f3e7811afce29043 Mon Sep 17 00:00:00 2001 From: Pieter Jansen van Vuuren Date: Sat, 4 May 2019 04:46:22 -0700 Subject: net/sched: add police action to the hardware intermediate representation Add police action to the hardware intermediate representation which would subsequently allow it to be used by drivers for offload. Signed-off-by: Pieter Jansen van Vuuren Reviewed-by: Jakub Kicinski Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/flow_offload.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 3bf67dd64be5..6200900434e1 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -119,6 +119,7 @@ enum flow_action_id { FLOW_ACTION_WAKE, FLOW_ACTION_QUEUE, FLOW_ACTION_SAMPLE, + FLOW_ACTION_POLICE, }; /* This is mirroring enum pedit_header_type definition for easy mapping between @@ -164,6 +165,10 @@ struct flow_action_entry { u32 trunc_size; bool truncate; } sample; + struct { /* FLOW_ACTION_POLICE */ + s64 burst; + u64 rate_bytes_ps; + } police; }; }; -- cgit v1.2.3 From b7fe4ab8a6013c3c721bed91f73e76eec8fb5d89 Mon Sep 17 00:00:00 2001 From: Pieter Jansen van Vuuren Date: Sat, 4 May 2019 04:46:23 -0700 Subject: net/sched: extend matchall offload for hardware statistics Introduce a new command for matchall classifiers that allows hardware to update statistics. Signed-off-by: Pieter Jansen van Vuuren Reviewed-by: Jakub Kicinski Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 2d0470661277..161fcf8516ac 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -760,12 +760,14 @@ tc_cls_flower_offload_flow_rule(struct tc_cls_flower_offload *tc_flow_cmd) enum tc_matchall_command { TC_CLSMATCHALL_REPLACE, TC_CLSMATCHALL_DESTROY, + TC_CLSMATCHALL_STATS, }; struct tc_cls_matchall_offload { struct tc_cls_common_offload common; enum tc_matchall_command command; struct flow_rule *rule; + struct flow_stats stats; unsigned long cookie; }; -- cgit v1.2.3 From 88c44a5200849c8182eaf36535b4ceae6b90b19d Mon Sep 17 00:00:00 2001 From: Pieter Jansen van Vuuren Date: Sat, 4 May 2019 04:46:25 -0700 Subject: net/sched: add block pointer to tc_cls_common_offload structure Some actions like the police action are stateful and could share state between devices. This is incompatible with offloading to multiple devices and drivers might want to test for shared blocks when offloading. Store a pointer to the tcf_block structure in the tc_cls_common_offload structure to allow drivers to determine when offloads apply to a shared block. Signed-off-by: Pieter Jansen van Vuuren Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 161fcf8516ac..eed98f8fcb5e 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -100,6 +100,11 @@ int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res, bool compat_mode); #else +static inline bool tcf_block_shared(struct tcf_block *block) +{ + return false; +} + static inline int tcf_block_get(struct tcf_block **p_block, struct tcf_proto __rcu **p_filter_chain, struct Qdisc *q, @@ -624,6 +629,7 @@ struct tc_cls_common_offload { u32 chain_index; __be16 protocol; u32 prio; + struct tcf_block *block; struct netlink_ext_ack *extack; }; @@ -725,11 +731,13 @@ static inline bool tc_in_hw(u32 flags) static inline void tc_cls_common_offload_init(struct tc_cls_common_offload *cls_common, const struct tcf_proto *tp, u32 flags, + struct tcf_block *block, struct netlink_ext_ack *extack) { cls_common->chain_index = tp->chain->index; cls_common->protocol = tp->protocol; cls_common->prio = tp->prio; + cls_common->block = block; if (tc_skip_sw(flags) || flags & TCA_CLS_FLAGS_VERBOSE) cls_common->extack = extack; } -- cgit v1.2.3 From f9bbe4477c30ece44296437ee26142b42ef8070b Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 5 May 2019 13:19:22 +0300 Subject: net: dsa: Optional VLAN-based port separation for switches without tagging This patch provides generic DSA code for using VLAN (802.1Q) tags for the same purpose as a dedicated switch tag for injection/extraction. It is based on the discussions and interest that has been so far expressed in https://www.spinics.net/lists/netdev/msg556125.html. Unlike all other DSA-supported tagging protocols, CONFIG_NET_DSA_TAG_8021Q does not offer a complete solution for drivers (nor can it). Instead, it provides generic code that driver can opt into calling: - dsa_8021q_xmit: Inserts a VLAN header with the specified contents. Can be called from another tagging protocol's xmit function. Currently the LAN9303 driver is inserting headers that are simply 802.1Q with custom fields, so this is an opportunity for code reuse. - dsa_8021q_rcv: Retrieves the TPID and TCI from a VLAN-tagged skb. Removing the VLAN header is left as a decision for the caller to make. - dsa_port_setup_8021q_tagging: For each user port, installs an Rx VID and a Tx VID, for proper untagged traffic identification on ingress and steering on egress. Also sets up the VLAN trunk on the upstream (CPU or DSA) port. Drivers are intentionally left to call this function explicitly, depending on the context and hardware support. The expected switch behavior and VLAN semantics should not be violated under any conditions. That is, after calling dsa_port_setup_8021q_tagging, the hardware should still pass all ingress traffic, be it tagged or untagged. For uniformity with the other tagging protocols, a module for the dsa_8021q_netdev_ops structure is registered, but the typical usage is to set up another tagging protocol which selects CONFIG_NET_DSA_TAG_8021Q, and calls the API from tag_8021q.h. Null function definitions are also provided so that a "depends on" is not forced in the Kconfig. This tagging protocol only works when switch ports are standalone, or when they are added to a VLAN-unaware bridge. It will probably remain this way for the reasons below. When added to a bridge that has vlan_filtering 1, the bridge core will install its own VLANs and reset the pvids through switchdev. For the bridge core, switchdev is a write-only pipe. All VLAN-related state is kept in the bridge core and nothing is read from DSA/switchdev or from the driver. So the bridge core will break this port separation because it will install the vlan_default_pvid into all switchdev ports. Even if we could teach the bridge driver about switchdev preference of a certain vlan_default_pvid (task difficult in itself since the current setting is per-bridge but we would need it per-port), there would still exist many other challenges. Firstly, in the DSA rcv callback, a driver would have to perform an iterative reverse lookup to find the correct switch port. That is because the port is a bridge slave, so its Rx VID (port PVID) is subject to user configuration. How would we ensure that the user doesn't reset the pvid to a different value (which would make an O(1) translation impossible), or to a non-unique value within this DSA switch tree (which would make any translation impossible)? Finally, not all switch ports are equal in DSA, and that makes it difficult for the bridge to be completely aware of this anyway. The CPU port needs to transmit tagged packets (VLAN trunk) in order for the DSA rcv code to be able to decode source information. But the bridge code has absolutely no idea which switch port is the CPU port, if nothing else then just because there is no netdevice registered by DSA for the CPU port. Also DSA does not currently allow the user to specify that they want the CPU port to do VLAN trunking anyway. VLANs are added to the CPU port using the same flags as they were added on the user port. So the VLANs installed by dsa_port_setup_8021q_tagging per driver request should remain private from the bridge's and user's perspective, and should not alter the VLAN semantics observed by the user. In the current implementation a VLAN range ending at 4095 (VLAN_N_VID) is reserved for this purpose. Each port receives a unique Rx VLAN and a unique Tx VLAN. Separate VLANs are needed for Rx and Tx because they serve different purposes: on Rx the switch must process traffic as untagged and process it with a port-based VLAN, but with care not to hinder bridging. On the other hand, the Tx VLAN is where the reachability restrictions are imposed, since by tagging frames in the xmit callback we are telling the switch onto which port to steer the frame. Some general guidance on how this support might be employed for real-life hardware (some comments made by Florian Fainelli): - If the hardware supports VLAN tag stacking, it should somehow back up its private VLAN settings when the bridge tries to override them. Then the driver could re-apply them as outer tags. Dedicating an outer tag per bridge device would allow identical inner tag VID numbers to co-exist, yet preserve broadcast domain isolation. - If the switch cannot handle VLAN tag stacking, it should disable this port separation when added as slave to a vlan_filtering bridge, in that case having reduced functionality. - Drivers for old switches that don't support the entire VLAN_N_VID range will need to rework the current range selection mechanism. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Reviewed-by: Vivien Didelot Signed-off-by: David S. Miller --- include/linux/dsa/8021q.h | 76 +++++++++++++++++++++++++++++++++++++++++++++++ include/net/dsa.h | 2 ++ 2 files changed, 78 insertions(+) create mode 100644 include/linux/dsa/8021q.h (limited to 'include') diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h new file mode 100644 index 000000000000..3911e0586478 --- /dev/null +++ b/include/linux/dsa/8021q.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0 + * Copyright (c) 2019, Vladimir Oltean + */ + +#ifndef _NET_DSA_8021Q_H +#define _NET_DSA_8021Q_H + +#include + +struct dsa_switch; +struct sk_buff; +struct net_device; +struct packet_type; + +#if IS_ENABLED(CONFIG_NET_DSA_TAG_8021Q) + +int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int index, + bool enabled); + +struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, + u16 tpid, u16 tci); + +struct sk_buff *dsa_8021q_rcv(struct sk_buff *skb, struct net_device *netdev, + struct packet_type *pt, u16 *tpid, u16 *tci); + +u16 dsa_8021q_tx_vid(struct dsa_switch *ds, int port); + +u16 dsa_8021q_rx_vid(struct dsa_switch *ds, int port); + +int dsa_8021q_rx_switch_id(u16 vid); + +int dsa_8021q_rx_source_port(u16 vid); + +#else + +int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int index, + bool enabled) +{ + return 0; +} + +struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, + u16 tpid, u16 tci) +{ + return NULL; +} + +struct sk_buff *dsa_8021q_rcv(struct sk_buff *skb, struct net_device *netdev, + struct packet_type *pt, u16 *tpid, u16 *tci) +{ + return NULL; +} + +u16 dsa_8021q_tx_vid(struct dsa_switch *ds, int port) +{ + return 0; +} + +u16 dsa_8021q_rx_vid(struct dsa_switch *ds, int port) +{ + return 0; +} + +int dsa_8021q_rx_switch_id(u16 vid) +{ + return 0; +} + +int dsa_8021q_rx_source_port(u16 vid) +{ + return 0; +} + +#endif /* IS_ENABLED(CONFIG_NET_DSA_TAG_8021Q) */ + +#endif /* _NET_DSA_8021Q_H */ diff --git a/include/net/dsa.h b/include/net/dsa.h index 18db7b8e7a8e..69f3714f42ba 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -42,6 +42,7 @@ struct phylink_link_state; #define DSA_TAG_PROTO_MTK_VALUE 9 #define DSA_TAG_PROTO_QCA_VALUE 10 #define DSA_TAG_PROTO_TRAILER_VALUE 11 +#define DSA_TAG_PROTO_8021Q_VALUE 12 enum dsa_tag_protocol { DSA_TAG_PROTO_NONE = DSA_TAG_PROTO_NONE_VALUE, @@ -56,6 +57,7 @@ enum dsa_tag_protocol { DSA_TAG_PROTO_MTK = DSA_TAG_PROTO_MTK_VALUE, DSA_TAG_PROTO_QCA = DSA_TAG_PROTO_QCA_VALUE, DSA_TAG_PROTO_TRAILER = DSA_TAG_PROTO_TRAILER_VALUE, + DSA_TAG_PROTO_8021Q = DSA_TAG_PROTO_8021Q_VALUE, }; struct packet_type; -- cgit v1.2.3 From cc1939e4b3aaf534fb2f3706820012036825731c Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 5 May 2019 13:19:23 +0300 Subject: net: dsa: Allow drivers to filter packets they can decode source port from Frames get processed by DSA and redirected to switch port net devices based on the ETH_P_XDSA multiplexed packet_type handler found by the network stack when calling eth_type_trans(). The running assumption is that once the DSA .rcv function is called, DSA is always able to decode the switch tag in order to change the skb->dev from its master. However there are tagging protocols (such as the new DSA_TAG_PROTO_SJA1105, user of DSA_TAG_PROTO_8021Q) where this assumption is not completely true, since switch tagging piggybacks on the absence of a vlan_filtering bridge. Moreover, management traffic (BPDU, PTP) for this switch doesn't rely on switch tagging, but on a different mechanism. So it would make sense to at least be able to terminate that. Having DSA receive traffic it can't decode would put it in an impossible situation: the eth_type_trans() function would invoke the DSA .rcv(), which could not change skb->dev, then eth_type_trans() would be invoked again, which again would call the DSA .rcv, and the packet would never be able to exit the DSA filter and would spiral in a loop until the whole system dies. This happens because eth_type_trans() doesn't actually look at the skb (so as to identify a potential tag) when it deems it as being ETH_P_XDSA. It just checks whether skb->dev has a DSA private pointer installed (therefore it's a DSA master) and that there exists a .rcv callback (everybody except DSA_TAG_PROTO_NONE has that). This is understandable as there are many switch tags out there, and exhaustively checking for all of them is far from ideal. The solution lies in introducing a filtering function for each tagging protocol. In the absence of a filtering function, all traffic is passed to the .rcv DSA callback. The tagging protocol should see the filtering function as a pre-validation that it can decode the incoming skb. The traffic that doesn't match the filter will bypass the DSA .rcv callback and be left on the master netdevice, which wasn't previously possible. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 69f3714f42ba..c90ceeec7d1f 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -69,6 +69,11 @@ struct dsa_device_ops { struct packet_type *pt); int (*flow_dissect)(const struct sk_buff *skb, __be16 *proto, int *offset); + /* Used to determine which traffic should match the DSA filter in + * eth_type_trans, and which, if any, should bypass it and be processed + * as regular on the master net device. + */ + bool (*filter)(const struct sk_buff *skb, struct net_device *dev); unsigned int overhead; const char *name; enum dsa_tag_protocol proto; @@ -148,6 +153,7 @@ struct dsa_port { struct dsa_switch_tree *dst; struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt); + bool (*filter)(const struct sk_buff *skb, struct net_device *dev); enum { DSA_PORT_TYPE_UNUSED = 0, @@ -520,6 +526,15 @@ static inline bool netdev_uses_dsa(struct net_device *dev) return false; } +static inline bool dsa_can_decode(const struct sk_buff *skb, + struct net_device *dev) +{ +#if IS_ENABLED(CONFIG_NET_DSA) + return !dev->dsa_ptr->filter || dev->dsa_ptr->filter(skb, dev); +#endif + return false; +} + struct dsa_switch *dsa_switch_alloc(struct device *dev, size_t n); void dsa_unregister_switch(struct dsa_switch *ds); int dsa_register_switch(struct dsa_switch *ds); -- cgit v1.2.3 From b68b0dd0fb2d91056d5241a19960cf47d4a80f05 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 5 May 2019 13:19:24 +0300 Subject: net: dsa: Keep private info in the skb->cb Map a DSA structure over the 48-byte control block that will hold skb info on transmit and receive. This is only for use within the DSA processing layer (e.g. communicating between DSA core and tagger) and not for passing info around with other layers such as the master net device. Also add a DSA_SKB_CB_PRIV() macro which retrieves a pointer to the space up to 48 bytes that the DSA structure does not use. This space can be used for drivers to add their own private info. One use is for the PTP timestamping code path. When cloning a skb, annotate the original with a pointer to the clone, which the driver can then find easily and place the timestamp to. This avoids the need of a separate queue to hold clones and a way to match an original to a cloned skb. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index c90ceeec7d1f..d628587e0bde 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -83,6 +83,37 @@ struct dsa_device_ops { #define MODULE_ALIAS_DSA_TAG_DRIVER(__proto) \ MODULE_ALIAS(DSA_TAG_DRIVER_ALIAS __stringify(__proto##_VALUE)) +struct dsa_skb_cb { + struct sk_buff *clone; +}; + +struct __dsa_skb_cb { + struct dsa_skb_cb cb; + u8 priv[48 - sizeof(struct dsa_skb_cb)]; +}; + +#define __DSA_SKB_CB(skb) ((struct __dsa_skb_cb *)((skb)->cb)) + +#define DSA_SKB_CB(skb) ((struct dsa_skb_cb *)((skb)->cb)) + +#define DSA_SKB_CB_COPY(nskb, skb) \ + { *__DSA_SKB_CB(nskb) = *__DSA_SKB_CB(skb); } + +#define DSA_SKB_CB_ZERO(skb) \ + { *__DSA_SKB_CB(skb) = (struct __dsa_skb_cb) {0}; } + +#define DSA_SKB_CB_PRIV(skb) \ + ((void *)(skb)->cb + offsetof(struct __dsa_skb_cb, priv)) + +#define DSA_SKB_CB_CLONE(_clone, _skb) \ + { \ + struct sk_buff *clone = _clone; \ + struct sk_buff *skb = _skb; \ + \ + DSA_SKB_CB_COPY(clone, skb); \ + DSA_SKB_CB(skb)->clone = clone; \ + } + struct dsa_switch_tree { struct list_head list; -- cgit v1.2.3 From 97a69a0dea9a048c6769249f1552de5f56731524 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 5 May 2019 13:19:25 +0300 Subject: net: dsa: Add support for deferred xmit Some hardware needs to take work to get convinced to receive frames on the CPU port (such as the sja1105 which takes temporary L2 forwarding rules over SPI that last for a single frame). Such work needs a sleepable context, and because the regular .ndo_start_xmit is atomic, this cannot be done in the tagger. So introduce a generic DSA mechanism that sets up a transmit skb queue and a workqueue for deferred transmission. The new driver callback (.port_deferred_xmit) is in dsa_switch and not in the tagger because the operations that require sleeping typically also involve interacting with the hardware, and not simply skb manipulations. Therefore having it there simplifies the structure a bit and makes it unnecessary to export functions from the driver to the tagger. The driver is responsible of calling dsa_enqueue_skb which transfers it to the master netdevice. This is so that it has a chance of performing some more work afterwards, such as cleanup or TX timestamping. To tell DSA that skb xmit deferral is required, I have thought about changing the return type of the tagger .xmit from struct sk_buff * into a enum dsa_tx_t that could potentially encode a DSA_XMIT_DEFER value. But the trailer tagger is reallocating every skb on xmit and therefore making a valid use of the pointer return value. So instead of reworking the API in complicated ways, right now a boolean property in the newly introduced DSA_SKB_CB is set. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index d628587e0bde..0260b73938e2 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -85,6 +85,7 @@ struct dsa_device_ops { struct dsa_skb_cb { struct sk_buff *clone; + bool deferred_xmit; }; struct __dsa_skb_cb { @@ -205,6 +206,10 @@ struct dsa_port { struct net_device *bridge_dev; struct devlink_port devlink_port; struct phylink *pl; + + struct work_struct xmit_work; + struct sk_buff_head xmit_queue; + /* * Original copy of the master netdev ethtool_ops */ @@ -539,6 +544,12 @@ struct dsa_switch_ops { struct sk_buff *clone, unsigned int type); bool (*port_rxtstamp)(struct dsa_switch *ds, int port, struct sk_buff *skb, unsigned int type); + + /* + * Deferred frame Tx + */ + netdev_tx_t (*port_deferred_xmit)(struct dsa_switch *ds, int port, + struct sk_buff *skb); }; struct dsa_switch_driver { @@ -634,6 +645,7 @@ static inline int call_dsa_notifiers(unsigned long val, struct net_device *dev, #define BRCM_TAG_GET_QUEUE(v) ((v) & 0xff) +netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev); int dsa_port_get_phy_strings(struct dsa_port *dp, uint8_t *data); int dsa_port_get_ethtool_phy_stats(struct dsa_port *dp, uint64_t *data); int dsa_port_get_phy_sset_count(struct dsa_port *dp); -- cgit v1.2.3 From c362beb072e14b929eb657dc174d83ccdd9b0eed Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 5 May 2019 13:19:26 +0300 Subject: net: dsa: Add a private structure pointer to dsa_port This is supposed to share information between the driver and the tagger, or used by the tagger to keep some state. Its use is optional. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Reviewed-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 0260b73938e2..e20be1ceb306 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -210,6 +210,12 @@ struct dsa_port { struct work_struct xmit_work; struct sk_buff_head xmit_queue; + /* + * Give the switch driver somewhere to hang its per-port private data + * structures (accessible from the tagger). + */ + void *priv; + /* * Original copy of the master netdev ethtool_ops */ -- cgit v1.2.3 From 227d07a07ef126272ea2eed97fd136cd7a803d81 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 5 May 2019 13:19:27 +0300 Subject: net: dsa: sja1105: Add support for traffic through standalone ports In order to support this, we are creating a make-shift switch tag out of a VLAN trunk configured on the CPU port. Termination of normal traffic on switch ports only works when not under a vlan_filtering bridge. Termination of management (PTP, BPDU) traffic works under all circumstances because it uses a different tagging mechanism (incl_srcpt). We are making use of the generic CONFIG_NET_DSA_TAG_8021Q code and leveraging it from our own CONFIG_NET_DSA_TAG_SJA1105. There are two types of traffic: regular and link-local. The link-local traffic received on the CPU port is trapped from the switch's regular forwarding decisions because it matched one of the two DMAC filters for management traffic. On transmission, the switch requires special massaging for these link-local frames. Due to a weird implementation of the switching IP, by default it drops link-local frames that originate on the CPU port. It needs to be told where to forward them to, through an SPI command ("management route") that is valid for only a single frame. So when we're sending link-local traffic, we are using the dsa_defer_xmit mechanism. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/dsa/sja1105.h | 35 ++++++++++++++++++++++++++--------- include/net/dsa.h | 2 ++ 2 files changed, 28 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/dsa/sja1105.h b/include/linux/dsa/sja1105.h index abf3977e34fd..603a02e5a8cb 100644 --- a/include/linux/dsa/sja1105.h +++ b/include/linux/dsa/sja1105.h @@ -2,22 +2,39 @@ * Copyright (c) 2019, Vladimir Oltean */ -/* Included by drivers/net/dsa/sja1105/sja1105.h */ +/* Included by drivers/net/dsa/sja1105/sja1105.h and net/dsa/tag_sja1105.c */ #ifndef _NET_DSA_SJA1105_H #define _NET_DSA_SJA1105_H +#include #include +#include #define ETH_P_SJA1105 ETH_P_DSA_8021Q -/* The switch can only be convinced to stay in unmanaged mode and not trap any - * link-local traffic by actually telling it to filter frames sent at the - * 00:00:00:00:00:00 destination MAC. - */ -#define SJA1105_LINKLOCAL_FILTER_A 0x000000000000ull -#define SJA1105_LINKLOCAL_FILTER_A_MASK 0xFFFFFFFFFFFFull -#define SJA1105_LINKLOCAL_FILTER_B 0x000000000000ull -#define SJA1105_LINKLOCAL_FILTER_B_MASK 0xFFFFFFFFFFFFull +/* IEEE 802.3 Annex 57A: Slow Protocols PDUs (01:80:C2:xx:xx:xx) */ +#define SJA1105_LINKLOCAL_FILTER_A 0x0180C2000000ull +#define SJA1105_LINKLOCAL_FILTER_A_MASK 0xFFFFFF000000ull +/* IEEE 1588 Annex F: Transport of PTP over Ethernet (01:1B:19:xx:xx:xx) */ +#define SJA1105_LINKLOCAL_FILTER_B 0x011B19000000ull +#define SJA1105_LINKLOCAL_FILTER_B_MASK 0xFFFFFF000000ull + +enum sja1105_frame_type { + SJA1105_FRAME_TYPE_NORMAL = 0, + SJA1105_FRAME_TYPE_LINK_LOCAL, +}; + +struct sja1105_skb_cb { + enum sja1105_frame_type type; +}; + +#define SJA1105_SKB_CB(skb) \ + ((struct sja1105_skb_cb *)DSA_SKB_CB_PRIV(skb)) + +struct sja1105_port { + struct dsa_port *dp; + int mgmt_slot; +}; #endif /* _NET_DSA_SJA1105_H */ diff --git a/include/net/dsa.h b/include/net/dsa.h index e20be1ceb306..6aaaadd6a413 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -43,6 +43,7 @@ struct phylink_link_state; #define DSA_TAG_PROTO_QCA_VALUE 10 #define DSA_TAG_PROTO_TRAILER_VALUE 11 #define DSA_TAG_PROTO_8021Q_VALUE 12 +#define DSA_TAG_PROTO_SJA1105_VALUE 13 enum dsa_tag_protocol { DSA_TAG_PROTO_NONE = DSA_TAG_PROTO_NONE_VALUE, @@ -58,6 +59,7 @@ enum dsa_tag_protocol { DSA_TAG_PROTO_QCA = DSA_TAG_PROTO_QCA_VALUE, DSA_TAG_PROTO_TRAILER = DSA_TAG_PROTO_TRAILER_VALUE, DSA_TAG_PROTO_8021Q = DSA_TAG_PROTO_8021Q_VALUE, + DSA_TAG_PROTO_SJA1105 = DSA_TAG_PROTO_SJA1105_VALUE, }; struct packet_type; -- cgit v1.2.3 From d6787147e15dffa7b7f3116a5bc3cbe0670bd74f Mon Sep 17 00:00:00 2001 From: Pieter Jansen van Vuuren Date: Mon, 6 May 2019 17:24:21 -0700 Subject: net/sched: remove block pointer from common offload structure Based on feedback from Jiri avoid carrying a pointer to the tcf_block structure in the tc_cls_common_offload structure. Instead store a flag in driver private data which indicates if offloads apply to a shared block at block binding time. Suggested-by: Jiri Pirko Signed-off-by: Pieter Jansen van Vuuren Reviewed-by: Jakub Kicinski Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index eed98f8fcb5e..514e3c80ecc1 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -629,7 +629,6 @@ struct tc_cls_common_offload { u32 chain_index; __be16 protocol; u32 prio; - struct tcf_block *block; struct netlink_ext_ack *extack; }; @@ -731,13 +730,11 @@ static inline bool tc_in_hw(u32 flags) static inline void tc_cls_common_offload_init(struct tc_cls_common_offload *cls_common, const struct tcf_proto *tp, u32 flags, - struct tcf_block *block, struct netlink_ext_ack *extack) { cls_common->chain_index = tp->chain->index; cls_common->protocol = tp->protocol; cls_common->prio = tp->prio; - cls_common->block = block; if (tc_skip_sw(flags) || flags & TCA_CLS_FLAGS_VERBOSE) cls_common->extack = extack; } -- cgit v1.2.3