From 9b2c45d479d0fb8647c9e83359df69162b5fbe5f Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 12 Feb 2018 20:00:20 +0100 Subject: net: make getname() functions return length rather than use int* parameter Changes since v1: Added changes in these files: drivers/infiniband/hw/usnic/usnic_transport.c drivers/staging/lustre/lnet/lnet/lib-socket.c drivers/target/iscsi/iscsi_target_login.c drivers/vhost/net.c fs/dlm/lowcomms.c fs/ocfs2/cluster/tcp.c security/tomoyo/network.c Before: All these functions either return a negative error indicator, or store length of sockaddr into "int *socklen" parameter and return zero on success. "int *socklen" parameter is awkward. For example, if caller does not care, it still needs to provide on-stack storage for the value it does not need. None of the many FOO_getname() functions of various protocols ever used old value of *socklen. They always just overwrite it. This change drops this parameter, and makes all these functions, on success, return length of sockaddr. It's always >= 0 and can be differentiated from an error. Tests in callers are changed from "if (err)" to "if (err < 0)", where needed. rpc_sockname() lost "int buflen" parameter, since its only use was to be passed to kernel_getsockname() as &buflen and subsequently not used in any way. Userspace API is not changed. text data bss dec hex filename 30108430 2633624 873672 33615726 200ef6e vmlinux.before.o 30108109 2633612 873672 33615393 200ee21 vmlinux.o Signed-off-by: Denys Vlasenko CC: David S. Miller CC: linux-kernel@vger.kernel.org CC: netdev@vger.kernel.org CC: linux-bluetooth@vger.kernel.org CC: linux-decnet-user@lists.sourceforge.net CC: linux-wireless@vger.kernel.org CC: linux-rdma@vger.kernel.org CC: linux-sctp@vger.kernel.org CC: linux-nfs@vger.kernel.org CC: linux-x25@vger.kernel.org Signed-off-by: David S. Miller --- include/linux/net.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/net.h b/include/linux/net.h index 91216b16feb7..000d1aada74f 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -146,7 +146,7 @@ struct proto_ops { struct socket *newsock, int flags, bool kern); int (*getname) (struct socket *sock, struct sockaddr *addr, - int *sockaddr_len, int peer); + int peer); __poll_t (*poll) (struct file *file, struct socket *sock, struct poll_table_struct *wait); int (*ioctl) (struct socket *sock, unsigned int cmd, @@ -294,10 +294,8 @@ int kernel_listen(struct socket *sock, int backlog); int kernel_accept(struct socket *sock, struct socket **newsock, int flags); int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen, int flags); -int kernel_getsockname(struct socket *sock, struct sockaddr *addr, - int *addrlen); -int kernel_getpeername(struct socket *sock, struct sockaddr *addr, - int *addrlen); +int kernel_getsockname(struct socket *sock, struct sockaddr *addr); +int kernel_getpeername(struct socket *sock, struct sockaddr *addr); int kernel_getsockopt(struct socket *sock, int level, int optname, char *optval, int *optlen); int kernel_setsockopt(struct socket *sock, int level, int optname, char *optval, -- cgit v1.2.3 From 1a57feb847c56d6193f67d0e892c24e71f9e3ab1 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 13 Feb 2018 12:26:23 +0300 Subject: net: Introduce net_sem for protection of pernet_list Currently, the mutex is mostly used to protect pernet operations list. It orders setup_net() and cleanup_net() with parallel {un,}register_pernet_operations() calls, so ->exit{,batch} methods of the same pernet operations are executed for a dying net, as were used to call ->init methods, even after the net namespace is unlinked from net_namespace_list in cleanup_net(). But there are several problems with scalability. The first one is that more than one net can't be created or destroyed at the same moment on the node. For big machines with many cpus running many containers it's very sensitive. The second one is that it's need to synchronize_rcu() after net is removed from net_namespace_list(): Destroy net_ns: cleanup_net() mutex_lock(&net_mutex) list_del_rcu(&net->list) synchronize_rcu() <--- Sleep there for ages list_for_each_entry_reverse(ops, &pernet_list, list) ops_exit_list(ops, &net_exit_list) list_for_each_entry_reverse(ops, &pernet_list, list) ops_free_list(ops, &net_exit_list) mutex_unlock(&net_mutex) This primitive is not fast, especially on the systems with many processors and/or when preemptible RCU is enabled in config. So, all the time, while cleanup_net() is waiting for RCU grace period, creation of new net namespaces is not possible, the tasks, who makes it, are sleeping on the same mutex: Create net_ns: copy_net_ns() mutex_lock_killable(&net_mutex) <--- Sleep there for ages I observed 20-30 seconds hangs of "unshare -n" on ordinary 8-cpu laptop with preemptible RCU enabled after CRIU tests round is finished. The solution is to convert net_mutex to the rw_semaphore and add fine grain locks to really small number of pernet_operations, what really need them. Then, pernet_operations::init/::exit methods, modifying the net-related data, will require down_read() locking only, while down_write() will be used for changing pernet_list (i.e., when modules are being loaded and unloaded). This gives signify performance increase, after all patch set is applied, like you may see here: %for i in {1..10000}; do unshare -n bash -c exit; done *before* real 1m40,377s user 0m9,672s sys 0m19,928s *after* real 0m17,007s user 0m5,311s sys 0m11,779 (5.8 times faster) This patch starts replacing net_mutex to net_sem. It adds rw_semaphore, describes the variables it protects, and makes to use, where appropriate. net_mutex is still present, and next patches will kick it out step-by-step. Signed-off-by: Kirill Tkhai Acked-by: Andrei Vagin Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 1fdcde96eb65..e9ee9ad0a681 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -36,6 +36,7 @@ extern int rtnl_is_locked(void); extern wait_queue_head_t netdev_unregistering_wq; extern struct mutex net_mutex; +extern struct rw_semaphore net_sem; #ifdef CONFIG_PROVE_LOCKING extern bool lockdep_rtnl_is_held(void); -- cgit v1.2.3 From 297dd12cb104151797fd649433a2157b585f1718 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 13 Feb 2018 14:15:36 +0100 Subject: net: avoid including xdp.h in filter.h If is sufficient with a forward declaration of struct xdp_rxq_info in linux/filter.h, which avoids including net/xdp.h. This was originally suggested by John Fastabend during the review phase, but wasn't included in the final patchset revision. Thus, this followup. Suggested-by: John Fastabend Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 276932d75975..fdb691b520c0 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -20,7 +20,6 @@ #include #include -#include #include #include @@ -30,6 +29,7 @@ struct sk_buff; struct sock; struct seccomp_data; struct bpf_prog_aux; +struct xdp_rxq_info; /* ArgX, context and stack frame pointer register positions. Note, * Arg1, Arg2, Arg3, etc are used as argument mappings of function -- cgit v1.2.3 From 330c7272c40e965b8ab510d1022acd6e6a32e9c8 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 13 Feb 2018 08:52:00 -0800 Subject: net: Make dn_ptr depend on CONFIG_DECNET Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5eef6c8e2741..d2ef35e00626 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1800,7 +1800,9 @@ struct net_device { #endif void *atalk_ptr; struct in_device __rcu *ip_ptr; +#if IS_ENABLED(CONFIG_DECNET) struct dn_dev __rcu *dn_ptr; +#endif struct inet6_dev __rcu *ip6_ptr; void *ax25_ptr; struct wireless_dev *ieee80211_ptr; -- cgit v1.2.3 From 19ff13f2a411d99af67d8e51867d54b86e1bf017 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 13 Feb 2018 08:52:01 -0800 Subject: net: Make ax25_ptr depend on CONFIG_AX25 Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d2ef35e00626..936dc2c9dca1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1804,7 +1804,9 @@ struct net_device { struct dn_dev __rcu *dn_ptr; #endif struct inet6_dev __rcu *ip6_ptr; +#if IS_ENABLED(CONFIG_AX25) void *ax25_ptr; +#endif struct wireless_dev *ieee80211_ptr; struct wpan_dev *ieee802154_ptr; #if IS_ENABLED(CONFIG_MPLS_ROUTING) -- cgit v1.2.3 From 89e58148fbf2e4cbd84006e263061c19a2b47adf Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 13 Feb 2018 08:52:02 -0800 Subject: net: Make atalk_ptr depend on ATALK or IRDA Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/linux/atalk.h | 2 ++ include/linux/netdevice.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/atalk.h b/include/linux/atalk.h index 4d356e168692..40373920ea58 100644 --- a/include/linux/atalk.h +++ b/include/linux/atalk.h @@ -113,10 +113,12 @@ extern void aarp_proto_init(void); /* Inter module exports */ /* Give a device find its atif control structure */ +#if IS_ENABLED(CONFIG_IRDA) || IS_ENABLED(CONFIG_ATALK) static inline struct atalk_iface *atalk_find_dev(struct net_device *dev) { return dev->atalk_ptr; } +#endif extern struct atalk_addr *atalk_find_dev_addr(struct net_device *dev); extern struct net_device *atrtr_get_dev(struct atalk_addr *sa); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 936dc2c9dca1..dbe6344b727a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1798,7 +1798,9 @@ struct net_device { #if IS_ENABLED(CONFIG_TIPC) struct tipc_bearer __rcu *tipc_ptr; #endif +#if IS_ENABLED(CONFIG_IRDA) || IS_ENABLED(CONFIG_ATALK) void *atalk_ptr; +#endif struct in_device __rcu *ip_ptr; #if IS_ENABLED(CONFIG_DECNET) struct dn_dev __rcu *dn_ptr; -- cgit v1.2.3 From eb09f1feb8e5999390a6f149307cb88354232680 Mon Sep 17 00:00:00 2001 From: Harshitha Ramamurthy Date: Tue, 23 Jan 2018 08:50:56 -0800 Subject: virtchnl: Add virtchl structures to support queue channels This patch defines new structs in support of the virtchannel message that the VF sends to the PF to create a queue channel specified by the user via tc tool. Signed-off-by: Harshitha Ramamurthy Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- include/linux/avf/virtchnl.h | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'include/linux') diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h index 3ce61342fa31..1f652ceecf35 100644 --- a/include/linux/avf/virtchnl.h +++ b/include/linux/avf/virtchnl.h @@ -136,6 +136,8 @@ enum virtchnl_ops { VIRTCHNL_OP_ENABLE_VLAN_STRIPPING = 27, VIRTCHNL_OP_DISABLE_VLAN_STRIPPING = 28, VIRTCHNL_OP_REQUEST_QUEUES = 29, + VIRTCHNL_OP_ENABLE_CHANNELS = 30, + VIRTCHNL_OP_DISABLE_CHANNELS = 31, }; /* This macro is used to generate a compilation error if a structure @@ -244,6 +246,7 @@ VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_vsi_resource); #define VIRTCHNL_VF_OFFLOAD_ENCAP 0X00100000 #define VIRTCHNL_VF_OFFLOAD_ENCAP_CSUM 0X00200000 #define VIRTCHNL_VF_OFFLOAD_RX_ENCAP_CSUM 0X00400000 +#define VIRTCHNL_VF_OFFLOAD_ADQ 0X00800000 #define VF_BASE_MODE_OFFLOADS (VIRTCHNL_VF_OFFLOAD_L2 | \ VIRTCHNL_VF_OFFLOAD_VLAN | \ @@ -496,6 +499,30 @@ struct virtchnl_rss_hena { VIRTCHNL_CHECK_STRUCT_LEN(8, virtchnl_rss_hena); +/* VIRTCHNL_OP_ENABLE_CHANNELS + * VIRTCHNL_OP_DISABLE_CHANNELS + * VF sends these messages to enable or disable channels based on + * the user specified queue count and queue offset for each traffic class. + * This struct encompasses all the information that the PF needs from + * VF to create a channel. + */ +struct virtchnl_channel_info { + u16 count; /* number of queues in a channel */ + u16 offset; /* queues in a channel start from 'offset' */ + u32 pad; + u64 max_tx_rate; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_channel_info); + +struct virtchnl_tc_info { + u32 num_tc; + u32 pad; + struct virtchnl_channel_info list[1]; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(24, virtchnl_tc_info); + /* VIRTCHNL_OP_EVENT * PF sends this message to inform the VF driver of events that may affect it. * No direct response is expected from the VF, though it may generate other @@ -711,6 +738,19 @@ virtchnl_vc_validate_vf_msg(struct virtchnl_version_info *ver, u32 v_opcode, case VIRTCHNL_OP_REQUEST_QUEUES: valid_len = sizeof(struct virtchnl_vf_res_request); break; + case VIRTCHNL_OP_ENABLE_CHANNELS: + valid_len = sizeof(struct virtchnl_tc_info); + if (msglen >= valid_len) { + struct virtchnl_tc_info *vti = + (struct virtchnl_tc_info *)msg; + valid_len += vti->num_tc * + sizeof(struct virtchnl_channel_info); + if (vti->num_tc == 0) + err_msg_format = true; + } + break; + case VIRTCHNL_OP_DISABLE_CHANNELS: + break; /* These are always errors coming from the VF. */ case VIRTCHNL_OP_EVENT: case VIRTCHNL_OP_UNKNOWN: -- cgit v1.2.3 From 0718e560a330599d15fddc37651d693c7a09e49e Mon Sep 17 00:00:00 2001 From: Harshitha Ramamurthy Date: Tue, 23 Jan 2018 08:51:03 -0800 Subject: virtchnl: Add a macro to check the size of a union This patch adds a macro to check if the size of a union is correct. It throws a divide by zero error if the union is not of the correct size. Signed-off-by: Harshitha Ramamurthy Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- include/linux/avf/virtchnl.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h index 1f652ceecf35..6fe630ebbf23 100644 --- a/include/linux/avf/virtchnl.h +++ b/include/linux/avf/virtchnl.h @@ -140,13 +140,15 @@ enum virtchnl_ops { VIRTCHNL_OP_DISABLE_CHANNELS = 31, }; -/* This macro is used to generate a compilation error if a structure +/* These macros are used to generate compilation errors if a structure/union * is not exactly the correct length. It gives a divide by zero error if the - * structure is not of the correct size, otherwise it creates an enum that is - * never used. + * structure/union is not of the correct size, otherwise it creates an enum + * that is never used. */ #define VIRTCHNL_CHECK_STRUCT_LEN(n, X) enum virtchnl_static_assert_enum_##X \ { virtchnl_static_assert_##X = (n)/((sizeof(struct X) == (n)) ? 1 : 0) } +#define VIRTCHNL_CHECK_UNION_LEN(n, X) enum virtchnl_static_asset_enum_##X \ + { virtchnl_static_assert_##X = (n)/((sizeof(union X) == (n)) ? 1 : 0) } /* Virtual channel message descriptor. This overlays the admin queue * descriptor. All other data is passed in external buffers. -- cgit v1.2.3 From 3872c8d44c2e489bcce0c743e808a4135e8da228 Mon Sep 17 00:00:00 2001 From: Harshitha Ramamurthy Date: Tue, 23 Jan 2018 08:51:04 -0800 Subject: virtchnl: Add filter data structures This patch adds infrastructure to send virtchnl messages to the PF to configure filters on the VF. The patch adds a struct called virtchnl_filter which contains information about the fields in the user-specified tc filter. Signed-off-by: Harshitha Ramamurthy Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- include/linux/avf/virtchnl.h | 59 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) (limited to 'include/linux') diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h index 6fe630ebbf23..b0a7f315bfbe 100644 --- a/include/linux/avf/virtchnl.h +++ b/include/linux/avf/virtchnl.h @@ -138,6 +138,8 @@ enum virtchnl_ops { VIRTCHNL_OP_REQUEST_QUEUES = 29, VIRTCHNL_OP_ENABLE_CHANNELS = 30, VIRTCHNL_OP_DISABLE_CHANNELS = 31, + VIRTCHNL_OP_ADD_CLOUD_FILTER = 32, + VIRTCHNL_OP_DEL_CLOUD_FILTER = 33, }; /* These macros are used to generate compilation errors if a structure/union @@ -525,6 +527,57 @@ struct virtchnl_tc_info { VIRTCHNL_CHECK_STRUCT_LEN(24, virtchnl_tc_info); +/* VIRTCHNL_ADD_CLOUD_FILTER + * VIRTCHNL_DEL_CLOUD_FILTER + * VF sends these messages to add or delete a cloud filter based on the + * user specified match and action filters. These structures encompass + * all the information that the PF needs from the VF to add/delete a + * cloud filter. + */ + +struct virtchnl_l4_spec { + u8 src_mac[ETH_ALEN]; + u8 dst_mac[ETH_ALEN]; + __be16 vlan_id; + __be16 pad; /* reserved for future use */ + __be32 src_ip[4]; + __be32 dst_ip[4]; + __be16 src_port; + __be16 dst_port; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(52, virtchnl_l4_spec); + +union virtchnl_flow_spec { + struct virtchnl_l4_spec tcp_spec; + u8 buffer[128]; /* reserved for future use */ +}; + +VIRTCHNL_CHECK_UNION_LEN(128, virtchnl_flow_spec); + +enum virtchnl_action { + /* action types */ + VIRTCHNL_ACTION_DROP = 0, + VIRTCHNL_ACTION_TC_REDIRECT, +}; + +enum virtchnl_flow_type { + /* flow types */ + VIRTCHNL_TCP_V4_FLOW = 0, + VIRTCHNL_TCP_V6_FLOW, +}; + +struct virtchnl_filter { + union virtchnl_flow_spec data; + union virtchnl_flow_spec mask; + enum virtchnl_flow_type flow_type; + enum virtchnl_action action; + u32 action_meta; + __u8 field_flags; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(272, virtchnl_filter); + /* VIRTCHNL_OP_EVENT * PF sends this message to inform the VF driver of events that may affect it. * No direct response is expected from the VF, though it may generate other @@ -753,6 +806,12 @@ virtchnl_vc_validate_vf_msg(struct virtchnl_version_info *ver, u32 v_opcode, break; case VIRTCHNL_OP_DISABLE_CHANNELS: break; + case VIRTCHNL_OP_ADD_CLOUD_FILTER: + valid_len = sizeof(struct virtchnl_filter); + break; + case VIRTCHNL_OP_DEL_CLOUD_FILTER: + valid_len = sizeof(struct virtchnl_filter); + break; /* These are always errors coming from the VF. */ case VIRTCHNL_OP_EVENT: case VIRTCHNL_OP_UNKNOWN: -- cgit v1.2.3 From 052fddfb3c4e5f3d413d3f6b8dffe1b7192026af Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 14 Feb 2018 01:07:42 +0100 Subject: net: ptp: Add stub for ptp_classify_raw() When NET_PTP_CLASSIFY is disabled, a stub function is required in order that the drivers compile. Signed-off-by: Andrew Lunn Acked-by: Richard Cochran Signed-off-by: David S. Miller --- include/linux/ptp_classify.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ptp_classify.h b/include/linux/ptp_classify.h index a079656b614c..059242030631 100644 --- a/include/linux/ptp_classify.h +++ b/include/linux/ptp_classify.h @@ -75,5 +75,9 @@ void __init ptp_classifier_init(void); static inline void ptp_classifier_init(void) { } +static inline unsigned int ptp_classify_raw(struct sk_buff *skb) +{ + return PTP_CLASS_NONE; +} #endif #endif /* _PTP_CLASSIFY_H_ */ -- cgit v1.2.3 From 02d92f7903647119e125b24f5470f96cee0d4b4b Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Fri, 19 Jan 2018 16:13:01 -0800 Subject: net/mlx5: CQ Database per EQ Before this patch the driver had one CQ database protected via one spinlock, this spinlock is meant to synchronize between CQ adding/removing and CQ IRQ interrupt handling. On a system with large number of CPUs and on a work load that requires lots of interrupts, this global spinlock becomes a very nasty hotspot and introduces a contention between the active cores, which will significantly hurt performance and becomes a bottleneck that prevents seamless cpu scaling. To solve this we simply move the CQ database and its spinlock to be per EQ (IRQ), thus per core. Tested with: system: 2 sockets, 14 cores per socket, hyperthreading, 2x14x2=56 cores netperf command: ./super_netperf 200 -P 0 -t TCP_RR -H -l 30 -- -r 300,300 -o -s 1M,1M -S 1M,1M WITHOUT THIS PATCH: Average: CPU %usr %nice %sys %iowait %irq %soft %steal %guest %gnice %idle Average: all 4.32 0.00 36.15 0.09 0.00 34.02 0.00 0.00 0.00 25.41 Samples: 2M of event 'cycles:pp', Event count (approx.): 1554616897271 Overhead Command Shared Object Symbol + 14.28% swapper [kernel.vmlinux] [k] intel_idle + 12.25% swapper [kernel.vmlinux] [k] queued_spin_lock_slowpath + 10.29% netserver [kernel.vmlinux] [k] queued_spin_lock_slowpath + 1.32% netserver [kernel.vmlinux] [k] mlx5e_xmit WITH THIS PATCH: Average: CPU %usr %nice %sys %iowait %irq %soft %steal %guest %gnice %idle Average: all 4.27 0.00 34.31 0.01 0.00 18.71 0.00 0.00 0.00 42.69 Samples: 2M of event 'cycles:pp', Event count (approx.): 1498132937483 Overhead Command Shared Object Symbol + 23.33% swapper [kernel.vmlinux] [k] intel_idle + 1.69% netserver [kernel.vmlinux] [k] mlx5e_xmit Tested-by: Song Liu Signed-off-by: Saeed Mahameed Reviewed-by: Gal Pressman --- include/linux/mlx5/cq.h | 3 +-- include/linux/mlx5/driver.h | 22 +++++++++------------- 2 files changed, 10 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h index 48c181a2acc9..06ba425a6ad7 100644 --- a/include/linux/mlx5/cq.h +++ b/include/linux/mlx5/cq.h @@ -60,6 +60,7 @@ struct mlx5_core_cq { } tasklet_ctx; int reset_notify_added; struct list_head reset_notify; + struct mlx5_eq *eq; }; @@ -171,8 +172,6 @@ static inline void mlx5_cq_arm(struct mlx5_core_cq *cq, u32 cmd, mlx5_write64(doorbell, uar_page + MLX5_CQ_DOORBELL, NULL); } -int mlx5_init_cq_table(struct mlx5_core_dev *dev); -void mlx5_cleanup_cq_table(struct mlx5_core_dev *dev); int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, u32 *in, int inlen); int mlx5_core_destroy_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 6ed79a8a8318..96e003db2bcd 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -375,8 +375,15 @@ struct mlx5_eq_pagefault { mempool_t *pool; }; +struct mlx5_cq_table { + /* protect radix tree */ + spinlock_t lock; + struct radix_tree_root tree; +}; + struct mlx5_eq { struct mlx5_core_dev *dev; + struct mlx5_cq_table cq_table; __be32 __iomem *doorbell; u32 cons_index; struct mlx5_buf buf; @@ -526,13 +533,6 @@ struct mlx5_core_health { struct delayed_work recover_work; }; -struct mlx5_cq_table { - /* protect radix tree - */ - spinlock_t lock; - struct radix_tree_root tree; -}; - struct mlx5_qp_table { /* protect radix tree */ @@ -654,10 +654,6 @@ struct mlx5_priv { struct dentry *cmdif_debugfs; /* end: qp staff */ - /* start: cq staff */ - struct mlx5_cq_table cq_table; - /* end: cq staff */ - /* start: mkey staff */ struct mlx5_mkey_table mkey_table; /* end: mkey staff */ @@ -1053,12 +1049,12 @@ int mlx5_eq_init(struct mlx5_core_dev *dev); void mlx5_eq_cleanup(struct mlx5_core_dev *dev); void mlx5_fill_page_array(struct mlx5_buf *buf, __be64 *pas); void mlx5_fill_page_frag_array(struct mlx5_frag_buf *frag_buf, __be64 *pas); -void mlx5_cq_completion(struct mlx5_core_dev *dev, u32 cqn); +void mlx5_cq_completion(struct mlx5_eq *eq, u32 cqn); void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type); void mlx5_srq_event(struct mlx5_core_dev *dev, u32 srqn, int event_type); struct mlx5_core_srq *mlx5_core_get_srq(struct mlx5_core_dev *dev, u32 srqn); void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool forced); -void mlx5_cq_event(struct mlx5_core_dev *dev, u32 cqn, int event_type); +void mlx5_cq_event(struct mlx5_eq *eq, u32 cqn, int event_type); int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx, int nent, u64 mask, const char *name, enum mlx5_eq_type type); -- cgit v1.2.3 From f105b45bf77ced96e516e1cd771c41bb7e8c830b Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Thu, 1 Feb 2018 03:32:00 -0800 Subject: net/mlx5: CQ hold/put API Now as the CQ table is per EQ, add an API to hold/put CQ to be used from eq.c in downstream patch. Signed-off-by: Saeed Mahameed Reviewed-by: Gal Pressman --- include/linux/mlx5/cq.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h index 06ba425a6ad7..445ad194e0fe 100644 --- a/include/linux/mlx5/cq.h +++ b/include/linux/mlx5/cq.h @@ -172,6 +172,17 @@ static inline void mlx5_cq_arm(struct mlx5_core_cq *cq, u32 cmd, mlx5_write64(doorbell, uar_page + MLX5_CQ_DOORBELL, NULL); } +static inline void mlx5_cq_hold(struct mlx5_core_cq *cq) +{ + refcount_inc(&cq->refcount); +} + +static inline void mlx5_cq_put(struct mlx5_core_cq *cq) +{ + if (refcount_dec_and_test(&cq->refcount)) + complete(&cq->free); +} + int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, u32 *in, int inlen); int mlx5_core_destroy_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq); -- cgit v1.2.3 From 3ac7afdbcf243d6c79c1569d9e29aef0096e4743 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Thu, 1 Feb 2018 04:37:07 -0800 Subject: net/mlx5: Move CQ completion and event forwarding logic to eq.c Since CQ tree is now per EQ, CQ completion and event forwarding became specific implementation of EQ logic, this patch moves that logic to eq.c and makes those functions static. Signed-off-by: Saeed Mahameed Reviewed-by: Gal Pressman --- include/linux/mlx5/driver.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 96e003db2bcd..09e2f3e8753c 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1049,12 +1049,10 @@ int mlx5_eq_init(struct mlx5_core_dev *dev); void mlx5_eq_cleanup(struct mlx5_core_dev *dev); void mlx5_fill_page_array(struct mlx5_buf *buf, __be64 *pas); void mlx5_fill_page_frag_array(struct mlx5_frag_buf *frag_buf, __be64 *pas); -void mlx5_cq_completion(struct mlx5_eq *eq, u32 cqn); void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type); void mlx5_srq_event(struct mlx5_core_dev *dev, u32 srqn, int event_type); struct mlx5_core_srq *mlx5_core_get_srq(struct mlx5_core_dev *dev, u32 srqn); void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool forced); -void mlx5_cq_event(struct mlx5_eq *eq, u32 cqn, int event_type); int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx, int nent, u64 mask, const char *name, enum mlx5_eq_type type); -- cgit v1.2.3 From 3ec5693b17314b58977ba3c8d720d1f9cfef39f8 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Thu, 1 Feb 2018 05:42:06 -0800 Subject: net/mlx5: Remove redundant EQ API exports EQ structure and API is private to mlx5_core driver only, external drivers should not have access or the means to manipulate EQ objects. Remove redundant exports and move API functions out of the linux/mlx5 include directory into the driver's mlx5_core.h private include file. Signed-off-by: Saeed Mahameed Reviewed-by: Gal Pressman --- include/linux/mlx5/driver.h | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 09e2f3e8753c..2860a253275b 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1045,20 +1045,11 @@ int mlx5_satisfy_startup_pages(struct mlx5_core_dev *dev, int boot); int mlx5_reclaim_startup_pages(struct mlx5_core_dev *dev); void mlx5_register_debugfs(void); void mlx5_unregister_debugfs(void); -int mlx5_eq_init(struct mlx5_core_dev *dev); -void mlx5_eq_cleanup(struct mlx5_core_dev *dev); void mlx5_fill_page_array(struct mlx5_buf *buf, __be64 *pas); void mlx5_fill_page_frag_array(struct mlx5_frag_buf *frag_buf, __be64 *pas); void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type); void mlx5_srq_event(struct mlx5_core_dev *dev, u32 srqn, int event_type); struct mlx5_core_srq *mlx5_core_get_srq(struct mlx5_core_dev *dev, u32 srqn); -void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, u64 vec, bool forced); -int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx, - int nent, u64 mask, const char *name, - enum mlx5_eq_type type); -int mlx5_destroy_unmap_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq); -int mlx5_start_eqs(struct mlx5_core_dev *dev); -void mlx5_stop_eqs(struct mlx5_core_dev *dev); int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn, unsigned int *irqn); int mlx5_core_attach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn); @@ -1070,14 +1061,6 @@ int mlx5_core_access_reg(struct mlx5_core_dev *dev, void *data_in, int size_in, void *data_out, int size_out, u16 reg_num, int arg, int write); -int mlx5_debug_eq_add(struct mlx5_core_dev *dev, struct mlx5_eq *eq); -void mlx5_debug_eq_remove(struct mlx5_core_dev *dev, struct mlx5_eq *eq); -int mlx5_core_eq_query(struct mlx5_core_dev *dev, struct mlx5_eq *eq, - u32 *out, int outlen); -int mlx5_eq_debugfs_init(struct mlx5_core_dev *dev); -void mlx5_eq_debugfs_cleanup(struct mlx5_core_dev *dev); -int mlx5_cq_debugfs_init(struct mlx5_core_dev *dev); -void mlx5_cq_debugfs_cleanup(struct mlx5_core_dev *dev); int mlx5_db_alloc(struct mlx5_core_dev *dev, struct mlx5_db *db); int mlx5_db_alloc_node(struct mlx5_core_dev *dev, struct mlx5_db *db, int node); -- cgit v1.2.3 From 388ca8be00370db132464e27f745b8a0add19fcb Mon Sep 17 00:00:00 2001 From: Yonatan Cohen Date: Tue, 2 Jan 2018 16:08:06 +0200 Subject: IB/mlx5: Implement fragmented completion queue (CQ) The current implementation of create CQ requires contiguous memory, such requirement is problematic once the memory is fragmented or the system is low in memory, it causes for failures in dma_zalloc_coherent(). This patch implements new scheme of fragmented CQ to overcome this issue by introducing new type: 'struct mlx5_frag_buf_ctrl' to allocate fragmented buffers, rather than contiguous ones. Base the Completion Queues (CQs) on this new fragmented buffer. It fixes following crashes: kworker/29:0: page allocation failure: order:6, mode:0x80d0 CPU: 29 PID: 8374 Comm: kworker/29:0 Tainted: G OE 3.10.0 Workqueue: ib_cm cm_work_handler [ib_cm] Call Trace: [<>] dump_stack+0x19/0x1b [<>] warn_alloc_failed+0x110/0x180 [<>] __alloc_pages_slowpath+0x6b7/0x725 [<>] __alloc_pages_nodemask+0x405/0x420 [<>] dma_generic_alloc_coherent+0x8f/0x140 [<>] x86_swiotlb_alloc_coherent+0x21/0x50 [<>] mlx5_dma_zalloc_coherent_node+0xad/0x110 [mlx5_core] [<>] ? mlx5_db_alloc_node+0x69/0x1b0 [mlx5_core] [<>] mlx5_buf_alloc_node+0x3e/0xa0 [mlx5_core] [<>] mlx5_buf_alloc+0x14/0x20 [mlx5_core] [<>] create_cq_kernel+0x90/0x1f0 [mlx5_ib] [<>] mlx5_ib_create_cq+0x3b0/0x4e0 [mlx5_ib] Signed-off-by: Yonatan Cohen Reviewed-by: Tariq Toukan Signed-off-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 51 ++++++++++++++++++++++++++++++++------------- 1 file changed, 37 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 2860a253275b..bfea26af6de5 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -345,13 +345,6 @@ struct mlx5_buf_list { dma_addr_t map; }; -struct mlx5_buf { - struct mlx5_buf_list direct; - int npages; - int size; - u8 page_shift; -}; - struct mlx5_frag_buf { struct mlx5_buf_list *frags; int npages; @@ -359,6 +352,15 @@ struct mlx5_frag_buf { u8 page_shift; }; +struct mlx5_frag_buf_ctrl { + struct mlx5_frag_buf frag_buf; + u32 sz_m1; + u32 frag_sz_m1; + u8 log_sz; + u8 log_stride; + u8 log_frag_strides; +}; + struct mlx5_eq_tasklet { struct list_head list; struct list_head process_list; @@ -386,7 +388,7 @@ struct mlx5_eq { struct mlx5_cq_table cq_table; __be32 __iomem *doorbell; u32 cons_index; - struct mlx5_buf buf; + struct mlx5_frag_buf buf; int size; unsigned int irqn; u8 eqn; @@ -932,9 +934,9 @@ struct mlx5_hca_vport_context { bool grh_required; }; -static inline void *mlx5_buf_offset(struct mlx5_buf *buf, int offset) +static inline void *mlx5_buf_offset(struct mlx5_frag_buf *buf, int offset) { - return buf->direct.buf + offset; + return buf->frags->buf + offset; } #define STRUCT_FIELD(header, field) \ @@ -973,6 +975,25 @@ static inline u32 mlx5_base_mkey(const u32 key) return key & 0xffffff00u; } +static inline void mlx5_core_init_cq_frag_buf(struct mlx5_frag_buf_ctrl *fbc, + void *cqc) +{ + fbc->log_stride = 6 + MLX5_GET(cqc, cqc, cqe_sz); + fbc->log_sz = MLX5_GET(cqc, cqc, log_cq_size); + fbc->sz_m1 = (1 << fbc->log_sz) - 1; + fbc->log_frag_strides = PAGE_SHIFT - fbc->log_stride; + fbc->frag_sz_m1 = (1 << fbc->log_frag_strides) - 1; +} + +static inline void *mlx5_frag_buf_get_wqe(struct mlx5_frag_buf_ctrl *fbc, + u32 ix) +{ + unsigned int frag = (ix >> fbc->log_frag_strides); + + return fbc->frag_buf.frags[frag].buf + + ((fbc->frag_sz_m1 & ix) << fbc->log_stride); +} + int mlx5_cmd_init(struct mlx5_core_dev *dev); void mlx5_cmd_cleanup(struct mlx5_core_dev *dev); void mlx5_cmd_use_events(struct mlx5_core_dev *dev); @@ -998,9 +1019,10 @@ void mlx5_drain_health_wq(struct mlx5_core_dev *dev); void mlx5_trigger_health_work(struct mlx5_core_dev *dev); void mlx5_drain_health_recovery(struct mlx5_core_dev *dev); int mlx5_buf_alloc_node(struct mlx5_core_dev *dev, int size, - struct mlx5_buf *buf, int node); -int mlx5_buf_alloc(struct mlx5_core_dev *dev, int size, struct mlx5_buf *buf); -void mlx5_buf_free(struct mlx5_core_dev *dev, struct mlx5_buf *buf); + struct mlx5_frag_buf *buf, int node); +int mlx5_buf_alloc(struct mlx5_core_dev *dev, + int size, struct mlx5_frag_buf *buf); +void mlx5_buf_free(struct mlx5_core_dev *dev, struct mlx5_frag_buf *buf); int mlx5_frag_buf_alloc_node(struct mlx5_core_dev *dev, int size, struct mlx5_frag_buf *buf, int node); void mlx5_frag_buf_free(struct mlx5_core_dev *dev, struct mlx5_frag_buf *buf); @@ -1045,7 +1067,8 @@ int mlx5_satisfy_startup_pages(struct mlx5_core_dev *dev, int boot); int mlx5_reclaim_startup_pages(struct mlx5_core_dev *dev); void mlx5_register_debugfs(void); void mlx5_unregister_debugfs(void); -void mlx5_fill_page_array(struct mlx5_buf *buf, __be64 *pas); + +void mlx5_fill_page_array(struct mlx5_frag_buf *buf, __be64 *pas); void mlx5_fill_page_frag_array(struct mlx5_frag_buf *frag_buf, __be64 *pas); void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type); void mlx5_srq_event(struct mlx5_core_dev *dev, u32 srqn, int event_type); -- cgit v1.2.3 From d8d211a2a0c37755a8660dc69f97b7c70bf210b1 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 14 Feb 2018 16:39:56 +0300 Subject: net: Make extern and export get_net_ns() This function will be used to obtain net of tun device. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- include/linux/socket.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/socket.h b/include/linux/socket.h index 9286a5a8c60c..1ce1f768a58c 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -353,4 +353,6 @@ extern int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen unsigned int flags, struct timespec *timeout); extern int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, unsigned int flags); + +extern struct ns_common *get_net_ns(struct ns_common *ns); #endif /* _LINUX_SOCKET_H */ -- cgit v1.2.3 From 6f89dbce8e1134458de8a8e376acaaca4eee602e Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Thu, 15 Feb 2018 10:49:32 -0800 Subject: skbuff: export mm_[un]account_pinned_pages for other modules RDS would like to use the helper functions for managing pinned pages added by Commit a91dbff551a6 ("sock: ulimit on MSG_ZEROCOPY pages") Signed-off-by: Sowmini Varadhan Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/skbuff.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5ebc0f869720..b1cc38af53e1 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -466,6 +466,9 @@ struct ubuf_info { #define skb_uarg(SKB) ((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg)) +int mm_account_pinned_pages(struct mmpin *mmp, size_t size); +void mm_unaccount_pinned_pages(struct mmpin *mmp); + struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size); struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size, struct ubuf_info *uarg); -- cgit v1.2.3 From 22e76844c566e474a9a3e0c2206e45766b5941b7 Mon Sep 17 00:00:00 2001 From: Srinivas Dasari Date: Tue, 6 Feb 2018 19:49:35 +0530 Subject: ieee80211: Increase PMK maximum length to 64 bytes Increase the PMK maximum length to 64 bytes to accommodate the key length used in DPP with the NIST P-521 and Brainpool 512 curves. Signed-off-by: Srinivas Dasari Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index ee6657a0ed69..e4cba332b705 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2111,7 +2111,7 @@ enum ieee80211_key_len { #define FILS_ERP_MAX_REALM_LEN 253 #define FILS_ERP_MAX_RRK_LEN 64 -#define PMK_MAX_LEN 48 +#define PMK_MAX_LEN 64 /* Public action codes (IEEE Std 802.11-2016, 9.6.8.1, Table 9-307) */ enum ieee80211_pub_actioncode { -- cgit v1.2.3 From e3f9f41757f5ce1e95ef3bc3bfb72bbcdb23ece2 Mon Sep 17 00:00:00 2001 From: Andrea Parri Date: Fri, 16 Feb 2018 12:06:13 +0100 Subject: ptr_ring: Remove now-redundant smp_read_barrier_depends() Because READ_ONCE() now implies smp_read_barrier_depends(), the smp_read_barrier_depends() in __ptr_ring_consume() is redundant; this commit removes it and updates the comments. Signed-off-by: Andrea Parri Cc: "David S. Miller" Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: John Fastabend Cc: Eric Dumazet Cc: Cc: Signed-off-by: David S. Miller --- include/linux/ptr_ring.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h index b884b7794187..ddfed1dce936 100644 --- a/include/linux/ptr_ring.h +++ b/include/linux/ptr_ring.h @@ -296,13 +296,14 @@ static inline void *__ptr_ring_consume(struct ptr_ring *r) { void *ptr; + /* The READ_ONCE in __ptr_ring_peek guarantees that anyone + * accessing data through the pointer is up to date. Pairs + * with smp_wmb in __ptr_ring_produce. + */ ptr = __ptr_ring_peek(r); if (ptr) __ptr_ring_discard_one(r); - /* Make sure anyone accessing data through the pointer is up to date. */ - /* Pairs with smp_wmb in __ptr_ring_produce. */ - smp_read_barrier_depends(); return ptr; } -- cgit v1.2.3 From 19efbd93e6fb05eab81856b4fc8d64211dd37088 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 19 Feb 2018 12:58:38 +0300 Subject: net: Kill net_mutex We take net_mutex, when there are !async pernet_operations registered, and read locking of net_sem is not enough. But we may get rid of taking the mutex, and just change the logic to write lock net_sem in such cases. This obviously reduces the number of lock operations, we do. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index e9ee9ad0a681..3573b4bf2fdf 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -35,7 +35,6 @@ extern int rtnl_trylock(void); extern int rtnl_is_locked(void); extern wait_queue_head_t netdev_unregistering_wq; -extern struct mutex net_mutex; extern struct rw_semaphore net_sem; #ifdef CONFIG_PROVE_LOCKING -- cgit v1.2.3 From a1f2ba04cc92414b6b933289365eab878b0b2bf4 Mon Sep 17 00:00:00 2001 From: Sara Sharon Date: Mon, 19 Feb 2018 14:48:40 +0200 Subject: mac80211: add get TID helper Extracting the TID from the QOS header is common enough to justify helper. Signed-off-by: Sara Sharon Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index e4cba332b705..8fe7e4306816 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -8,6 +8,7 @@ * Copyright (c) 2006, Michael Wu * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH * Copyright (c) 2016 - 2017 Intel Deutschland GmbH + * Copyright (c) 2018 Intel Corporation * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -2501,6 +2502,17 @@ static inline u8 *ieee80211_get_qos_ctl(struct ieee80211_hdr *hdr) return (u8 *)hdr + 24; } +/** + * ieee80211_get_tid - get qos TID + * @hdr: the frame + */ +static inline u8 ieee80211_get_tid(struct ieee80211_hdr *hdr) +{ + u8 *qc = ieee80211_get_qos_ctl(hdr); + + return qc[0] & IEEE80211_QOS_CTL_TID_MASK; +} + /** * ieee80211_get_SA - get pointer to SA * @hdr: the frame -- cgit v1.2.3 From ee07862f7b4594d390b978f6636a6a6191632ab3 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Fri, 23 Feb 2018 14:58:41 +0800 Subject: bpf: NULL pointer check is not needed in BPF_CGROUP_RUN_PROG_INET_SOCK sk is already allocated in inet_create/inet6_create, hence when BPF_CGROUP_RUN_PROG_INET_SOCK is executed sk will never be NULL. The logic is as bellow, sk = sk_alloc(); if (!sk) goto out; BPF_CGROUP_RUN_PROG_INET_SOCK(sk); Signed-off-by: Yafang Shao Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index a7f16e0f8d68..8a4566691c8f 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -96,7 +96,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled && sk) { \ + if (cgroup_bpf_enabled) { \ __ret = __cgroup_bpf_run_filter_sk(sk, \ BPF_CGROUP_INET_SOCK_CREATE); \ } \ -- cgit v1.2.3 From 57cbd893c4c575a24594fa6c0835247506ce26e2 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Tue, 16 Jan 2018 14:04:14 +0000 Subject: net/mlx5: E-Switch, Move representors definition to a global scope In preparation for IB representors, move representors structs to a global scope, also expose functions needed for registration, unregistration, eswitch mode and creating a flow rule to direct traffic from SQs to the right VF. Signed-off-by: Mark Bloch Reviewed-by: Or Gerlitz Signed-off-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 6 +++++ include/linux/mlx5/eswitch.h | 57 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) create mode 100644 include/linux/mlx5/eswitch.h (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index bfea26af6de5..4814cad7456e 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1224,6 +1224,12 @@ static inline int mlx5_core_is_pf(struct mlx5_core_dev *dev) return !(dev->priv.pci_dev_data & MLX5_PCI_DEV_IS_VF); } +#define MLX5_TOTAL_VPORTS(mdev) (1 + pci_sriov_get_totalvfs((mdev)->pdev)) +#define MLX5_VPORT_MANAGER(mdev) \ + (MLX5_CAP_GEN(mdev, vport_group_manager) && \ + (MLX5_CAP_GEN(mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) && \ + mlx5_core_is_pf(mdev)) + static inline int mlx5_get_gid_table_len(u16 param) { if (param > 4) { diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h new file mode 100644 index 000000000000..f62bf486c18c --- /dev/null +++ b/include/linux/mlx5/eswitch.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */ +/* + * Copyright (c) 2018 Mellanox Technologies. All rights reserved. + */ + +#ifndef _MLX5_ESWITCH_ +#define _MLX5_ESWITCH_ + +#include + +enum { + SRIOV_NONE, + SRIOV_LEGACY, + SRIOV_OFFLOADS +}; + +enum { + REP_ETH, + NUM_REP_TYPES, +}; + +struct mlx5_eswitch_rep; +struct mlx5_eswitch_rep_if { + int (*load)(struct mlx5_core_dev *dev, + struct mlx5_eswitch_rep *rep); + void (*unload)(struct mlx5_eswitch_rep *rep); + void *(*get_proto_dev)(struct mlx5_eswitch_rep *rep); + void *priv; + bool valid; +}; + +struct mlx5_eswitch_rep { + struct mlx5_eswitch_rep_if rep_if[NUM_REP_TYPES]; + u16 vport; + u8 hw_id[ETH_ALEN]; + u16 vlan; + u32 vlan_refcount; +}; + +void mlx5_eswitch_register_vport_rep(struct mlx5_eswitch *esw, + int vport_index, + struct mlx5_eswitch_rep_if *rep_if, + u8 rep_type); +void mlx5_eswitch_unregister_vport_rep(struct mlx5_eswitch *esw, + int vport_index, + u8 rep_type); +void *mlx5_eswitch_get_proto_dev(struct mlx5_eswitch *esw, + int vport, + u8 rep_type); +struct mlx5_eswitch_rep *mlx5_eswitch_vport_rep(struct mlx5_eswitch *esw, + int vport); +void *mlx5_eswitch_uplink_get_proto_dev(struct mlx5_eswitch *esw, u8 rep_type); +u8 mlx5_eswitch_mode(struct mlx5_eswitch *esw); +struct mlx5_flow_handle * +mlx5_eswitch_add_send_to_vport_rule(struct mlx5_eswitch *esw, + int vport, u32 sqn); +#endif -- cgit v1.2.3 From 5e65b02c00900155833008b7992bbbbc7f0df2ac Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Tue, 16 Jan 2018 14:13:46 +0000 Subject: net/mlx5: E-Switch, Add definition of IB representor Create a new representor type: REP_IB. which will be initialized by an IB device that is used as a logical representor of a eswitch vport (VF or uplink) just like we have a net device today in switchdev mode. Signed-off-by: Mark Bloch Reviewed-by: Or Gerlitz Signed-off-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- include/linux/mlx5/eswitch.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h index f62bf486c18c..d3c9db492b30 100644 --- a/include/linux/mlx5/eswitch.h +++ b/include/linux/mlx5/eswitch.h @@ -16,6 +16,7 @@ enum { enum { REP_ETH, + REP_IB, NUM_REP_TYPES, }; -- cgit v1.2.3 From a9c79364df324a69ba1b71accd5b8a3155e570ac Mon Sep 17 00:00:00 2001 From: Russell King Date: Tue, 27 Feb 2018 15:53:02 +0000 Subject: phylink,sfp: negotiate interface format with MAC Negotiate the interface format with the MAC rather than requiring it to be a fixed type specified solely by the SFP module. This allows modules that can work with several different interface signalling formats to select a format compatible with the MAC - for example, a Fiber module supporing Gigabit ethernet and faster connected to a Gigabit only MAC needs to select the 1000BASE-X mode. Signed-off-by: Russell King Signed-off-by: David S. Miller --- include/linux/sfp.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sfp.h b/include/linux/sfp.h index e724d5a3dd80..ebce9e24906a 100644 --- a/include/linux/sfp.h +++ b/include/linux/sfp.h @@ -422,10 +422,11 @@ struct sfp_upstream_ops { #if IS_ENABLED(CONFIG_SFP) int sfp_parse_port(struct sfp_bus *bus, const struct sfp_eeprom_id *id, unsigned long *support); -phy_interface_t sfp_parse_interface(struct sfp_bus *bus, - const struct sfp_eeprom_id *id); void sfp_parse_support(struct sfp_bus *bus, const struct sfp_eeprom_id *id, unsigned long *support); +phy_interface_t sfp_select_interface(struct sfp_bus *bus, + const struct sfp_eeprom_id *id, + unsigned long *link_modes); int sfp_get_module_info(struct sfp_bus *bus, struct ethtool_modinfo *modinfo); int sfp_get_module_eeprom(struct sfp_bus *bus, struct ethtool_eeprom *ee, @@ -444,18 +445,19 @@ static inline int sfp_parse_port(struct sfp_bus *bus, return PORT_OTHER; } -static inline phy_interface_t sfp_parse_interface(struct sfp_bus *bus, - const struct sfp_eeprom_id *id) -{ - return PHY_INTERFACE_MODE_NA; -} - static inline void sfp_parse_support(struct sfp_bus *bus, const struct sfp_eeprom_id *id, unsigned long *support) { } +static inline phy_interface_t sfp_select_interface(struct sfp_bus *bus, + const struct sfp_eeprom_id *id, + unsigned long *link_modes) +{ + return PHY_INTERFACE_MODE_NA; +} + static inline int sfp_get_module_info(struct sfp_bus *bus, struct ethtool_modinfo *modinfo) { -- cgit v1.2.3 From 6853f21f764b04e58df5e44629fec1fb8f3cbf2e Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Wed, 28 Feb 2018 23:29:29 +0200 Subject: ipmr,ipmr6: Define a uniform vif_device The two implementations have almost identical structures - vif_device and mif_device. As a step toward uniforming the mr_tables, eliminate the mif_device and relocate the vif_device definition into a new common header file. Also, introduce a common initializing function for setting most of the vif_device fields in a new common source file. This requires modifying the ipv{4,6] Kconfig and ipv4 makefile as we're introducing a new common config option - CONFIG_IP_MROUTE_COMMON. Signed-off-by: Yuval Mintz Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/mroute.h | 13 +----------- include/linux/mroute6.h | 11 +--------- include/linux/mroute_base.h | 52 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 54 insertions(+), 22 deletions(-) create mode 100644 include/linux/mroute_base.h (limited to 'include/linux') diff --git a/include/linux/mroute.h b/include/linux/mroute.h index 5396521a776a..b8aadffe6237 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -9,6 +9,7 @@ #include #include #include +#include #ifdef CONFIG_IP_MROUTE static inline int ip_mroute_opt(int opt) @@ -56,18 +57,6 @@ static inline bool ipmr_rule_default(const struct fib_rule *rule) } #endif -struct vif_device { - struct net_device *dev; /* Device we are using */ - struct netdev_phys_item_id dev_parent_id; /* Device parent ID */ - unsigned long bytes_in,bytes_out; - unsigned long pkt_in,pkt_out; /* Statistics */ - unsigned long rate_limit; /* Traffic shaping (NI) */ - unsigned char threshold; /* TTL threshold */ - unsigned short flags; /* Control flags */ - __be32 local,remote; /* Addresses(remote for tunnels)*/ - int link; /* Physical interface index */ -}; - struct vif_entry_notifier_info { struct fib_notifier_info info; struct net_device *dev; diff --git a/include/linux/mroute6.h b/include/linux/mroute6.h index 3014c52bfd86..e5e5b8282551 100644 --- a/include/linux/mroute6.h +++ b/include/linux/mroute6.h @@ -7,6 +7,7 @@ #include /* for struct sk_buff_head */ #include #include +#include #ifdef CONFIG_IPV6_MROUTE static inline int ip6_mroute_opt(int opt) @@ -62,16 +63,6 @@ static inline void ip6_mr_cleanup(void) } #endif -struct mif_device { - struct net_device *dev; /* Device we are using */ - unsigned long bytes_in,bytes_out; - unsigned long pkt_in,pkt_out; /* Statistics */ - unsigned long rate_limit; /* Traffic shaping (NI) */ - unsigned char threshold; /* TTL threshold */ - unsigned short flags; /* Control flags */ - int link; /* Physical interface index */ -}; - #define VIFF_STATIC 0x8000 struct mfc6_cache { diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h new file mode 100644 index 000000000000..0de651e15f27 --- /dev/null +++ b/include/linux/mroute_base.h @@ -0,0 +1,52 @@ +#ifndef __LINUX_MROUTE_BASE_H +#define __LINUX_MROUTE_BASE_H + +#include + +/** + * struct vif_device - interface representor for multicast routing + * @dev: network device being used + * @bytes_in: statistic; bytes ingressing + * @bytes_out: statistic; bytes egresing + * @pkt_in: statistic; packets ingressing + * @pkt_out: statistic; packets egressing + * @rate_limit: Traffic shaping (NI) + * @threshold: TTL threshold + * @flags: Control flags + * @link: Physical interface index + * @dev_parent_id: device parent id + * @local: Local address + * @remote: Remote address for tunnels + */ +struct vif_device { + struct net_device *dev; + unsigned long bytes_in, bytes_out; + unsigned long pkt_in, pkt_out; + unsigned long rate_limit; + unsigned char threshold; + unsigned short flags; + int link; + + /* Currently only used by ipmr */ + struct netdev_phys_item_id dev_parent_id; + __be32 local, remote; +}; + +#ifdef CONFIG_IP_MROUTE_COMMON +void vif_device_init(struct vif_device *v, + struct net_device *dev, + unsigned long rate_limit, + unsigned char threshold, + unsigned short flags, + unsigned short get_iflink_mask); +#else +static inline void vif_device_init(struct vif_device *v, + struct net_device *dev, + unsigned long rate_limit, + unsigned char threshold, + unsigned short flags, + unsigned short get_iflink_mask) +{ +} +#endif +#endif -- cgit v1.2.3 From 8571ab479a6e1ef46ead5ebee567e128a422767c Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Wed, 28 Feb 2018 23:29:30 +0200 Subject: ip6mr: Make mroute_sk rcu-based In ipmr the mr_table socket is handled under RCU. Introduce the same for ip6mr. Signed-off-by: Yuval Mintz Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/mroute6.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mroute6.h b/include/linux/mroute6.h index e5e5b8282551..e1b9fb06e1ea 100644 --- a/include/linux/mroute6.h +++ b/include/linux/mroute6.h @@ -111,12 +111,12 @@ extern int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm, u32 portid); #ifdef CONFIG_IPV6_MROUTE -extern struct sock *mroute6_socket(struct net *net, struct sk_buff *skb); +bool mroute6_is_socket(struct net *net, struct sk_buff *skb); extern int ip6mr_sk_done(struct sock *sk); #else -static inline struct sock *mroute6_socket(struct net *net, struct sk_buff *skb) +static inline bool mroute6_is_socket(struct net *net, struct sk_buff *skb) { - return NULL; + return false; } static inline int ip6mr_sk_done(struct sock *sk) { -- cgit v1.2.3 From 87c418bf1323d57140f4b448715f64de3fbb7e91 Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Wed, 28 Feb 2018 23:29:31 +0200 Subject: ip6mr: Align hash implementation to ipmr Since commit 8fb472c09b9d ("ipmr: improve hash scalability") ipmr has been using rhashtable as a basis for its mfc routes, but ip6mr is currently still using the old private MFC hash implementation. Align ip6mr to the current ipmr implementation. Signed-off-by: Yuval Mintz Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/mroute6.h | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mroute6.h b/include/linux/mroute6.h index e1b9fb06e1ea..e2dac199861e 100644 --- a/include/linux/mroute6.h +++ b/include/linux/mroute6.h @@ -8,6 +8,7 @@ #include #include #include +#include #ifdef CONFIG_IPV6_MROUTE static inline int ip6_mroute_opt(int opt) @@ -65,10 +66,20 @@ static inline void ip6_mr_cleanup(void) #define VIFF_STATIC 0x8000 +struct mfc6_cache_cmp_arg { + struct in6_addr mf6c_mcastgrp; + struct in6_addr mf6c_origin; +}; + struct mfc6_cache { - struct list_head list; - struct in6_addr mf6c_mcastgrp; /* Group the entry belongs to */ - struct in6_addr mf6c_origin; /* Source of packet */ + struct rhlist_head mnode; + union { + struct { + struct in6_addr mf6c_mcastgrp; + struct in6_addr mf6c_origin; + }; + struct mfc6_cache_cmp_arg cmparg; + }; mifi_t mf6c_parent; /* Source interface */ int mfc_flags; /* Flags on line */ @@ -88,22 +99,13 @@ struct mfc6_cache { unsigned char ttls[MAXMIFS]; /* TTL thresholds */ } res; } mfc_un; + struct list_head list; + struct rcu_head rcu; }; #define MFC_STATIC 1 #define MFC_NOTIFY 2 -#define MFC6_LINES 64 - -#define MFC6_HASH(a, g) (((__force u32)(a)->s6_addr32[0] ^ \ - (__force u32)(a)->s6_addr32[1] ^ \ - (__force u32)(a)->s6_addr32[2] ^ \ - (__force u32)(a)->s6_addr32[3] ^ \ - (__force u32)(g)->s6_addr32[0] ^ \ - (__force u32)(g)->s6_addr32[1] ^ \ - (__force u32)(g)->s6_addr32[2] ^ \ - (__force u32)(g)->s6_addr32[3]) % MFC6_LINES) - #define MFC_ASSERT_THRESH (3*HZ) /* Maximal freq. of asserts */ struct rtmsg; -- cgit v1.2.3 From b70432f7319eb75b24ca57dde8146c5e27244780 Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Wed, 28 Feb 2018 23:29:32 +0200 Subject: mroute*: Make mr_table a common struct Following previous changes to ip6mr, mr_table and mr6_table are basically the same [up to mr6_table having additional '6' suffixes to its variable names]. Move the common structure definition into a common header; This requires renaming all references in ip6mr to variables that had the distinct suffix. Signed-off-by: Yuval Mintz Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/mroute.h | 21 --------------------- include/linux/mroute6.h | 1 - include/linux/mroute_base.h | 46 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 46 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mroute.h b/include/linux/mroute.h index b8aadffe6237..8688c5d03a24 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -4,8 +4,6 @@ #include #include -#include -#include #include #include #include @@ -67,25 +65,6 @@ struct vif_entry_notifier_info { #define VIFF_STATIC 0x8000 -#define VIF_EXISTS(_mrt, _idx) ((_mrt)->vif_table[_idx].dev != NULL) - -struct mr_table { - struct list_head list; - possible_net_t net; - u32 id; - struct sock __rcu *mroute_sk; - struct timer_list ipmr_expire_timer; - struct list_head mfc_unres_queue; - struct vif_device vif_table[MAXVIFS]; - struct rhltable mfc_hash; - struct list_head mfc_cache_list; - int maxvif; - atomic_t cache_resolve_queue_len; - bool mroute_do_assert; - bool mroute_do_pim; - int mroute_reg_vif_num; -}; - /* mfc_flags: * MFC_STATIC - the entry was added statically (not by a routing daemon) * MFC_OFFLOAD - the entry was offloaded to the hardware diff --git a/include/linux/mroute6.h b/include/linux/mroute6.h index e2dac199861e..d5c8dc155a42 100644 --- a/include/linux/mroute6.h +++ b/include/linux/mroute6.h @@ -8,7 +8,6 @@ #include #include #include -#include #ifdef CONFIG_IPV6_MROUTE static inline int ip6_mroute_opt(int opt) diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index 0de651e15f27..1cc944a14df5 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -2,6 +2,9 @@ #define __LINUX_MROUTE_BASE_H #include +#include +#include +#include /** * struct vif_device - interface representor for multicast routing @@ -32,6 +35,49 @@ struct vif_device { __be32 local, remote; }; +#ifndef MAXVIFS +/* This one is nasty; value is defined in uapi using different symbols for + * mroute and morute6 but both map into same 32. + */ +#define MAXVIFS 32 +#endif + +#define VIF_EXISTS(_mrt, _idx) (!!((_mrt)->vif_table[_idx].dev)) + +/** + * struct mr_table - a multicast routing table + * @list: entry within a list of multicast routing tables + * @net: net where this table belongs + * @id: identifier of the table + * @mroute_sk: socket associated with the table + * @ipmr_expire_timer: timer for handling unresolved routes + * @mfc_unres_queue: list of unresolved MFC entries + * @vif_table: array containing all possible vifs + * @mfc_hash: Hash table of all resolved routes for easy lookup + * @mfc_cache_list: list of resovled routes for possible traversal + * @maxvif: Identifier of highest value vif currently in use + * @cache_resolve_queue_len: current size of unresolved queue + * @mroute_do_assert: Whether to inform userspace on wrong ingress + * @mroute_do_pim: Whether to receive IGMP PIMv1 + * @mroute_reg_vif_num: PIM-device vif index + */ +struct mr_table { + struct list_head list; + possible_net_t net; + u32 id; + struct sock __rcu *mroute_sk; + struct timer_list ipmr_expire_timer; + struct list_head mfc_unres_queue; + struct vif_device vif_table[MAXVIFS]; + struct rhltable mfc_hash; + struct list_head mfc_cache_list; + int maxvif; + atomic_t cache_resolve_queue_len; + bool mroute_do_assert; + bool mroute_do_pim; + int mroute_reg_vif_num; +}; + #ifdef CONFIG_IP_MROUTE_COMMON void vif_device_init(struct vif_device *v, struct net_device *dev, -- cgit v1.2.3 From 0bbbf0e7d0e7ea8267836986346a9b3a35b74e4e Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Wed, 28 Feb 2018 23:29:33 +0200 Subject: ipmr, ip6mr: Unite creation of new mr_table Now that both ipmr and ip6mr are using the same mr_table structure, we can have a common function to allocate & initialize a new instance. Signed-off-by: Yuval Mintz Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/mroute_base.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index 1cc944a14df5..805305722803 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -85,6 +85,13 @@ void vif_device_init(struct vif_device *v, unsigned char threshold, unsigned short flags, unsigned short get_iflink_mask); + +struct mr_table * +mr_table_alloc(struct net *net, u32 id, + const struct rhashtable_params *rht_params, + void (*expire_func)(struct timer_list *t), + void (*table_set)(struct mr_table *mrt, + struct net *net)); #else static inline void vif_device_init(struct vif_device *v, struct net_device *dev, @@ -94,5 +101,15 @@ static inline void vif_device_init(struct vif_device *v, unsigned short get_iflink_mask) { } + +static inline struct mr_table * +mr_table_alloc(struct net *net, u32 id, + const struct rhashtable_params *rht_params, + void (*expire_func)(struct timer_list *t), + void (*table_set)(struct mr_table *mrt, + struct net *net)) +{ + return NULL; +} #endif #endif -- cgit v1.2.3 From 494fff56379c4ad5b8fe36a5b7ffede4044ca7bb Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Wed, 28 Feb 2018 23:29:34 +0200 Subject: ipmr, ip6mr: Make mfc_cache a common structure mfc_cache and mfc6_cache are almost identical - the main difference is in the origin/group addresses and comparison-key. Make a common structure encapsulating most of the multicast routing logic - mr_mfc and convert both ipmr and ip6mr into using it. For easy conversion [casting, in this case] mr_mfc has to be the first field inside every multicast routing abstraction utilizing it. Signed-off-by: Yuval Mintz Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/mroute.h | 45 ++++----------------------------------------- include/linux/mroute6.h | 23 +---------------------- include/linux/mroute_base.h | 45 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 50 insertions(+), 63 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mroute.h b/include/linux/mroute.h index 8688c5d03a24..63b36e6c72a0 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -81,28 +81,13 @@ struct mfc_cache_cmp_arg { /** * struct mfc_cache - multicast routing entries - * @mnode: rhashtable list + * @_c: Common multicast routing information; has to be first [for casting] * @mfc_mcastgrp: destination multicast group address * @mfc_origin: source address * @cmparg: used for rhashtable comparisons - * @mfc_parent: source interface (iif) - * @mfc_flags: entry flags - * @expires: unresolved entry expire time - * @unresolved: unresolved cached skbs - * @last_assert: time of last assert - * @minvif: minimum VIF id - * @maxvif: maximum VIF id - * @bytes: bytes that have passed for this entry - * @pkt: packets that have passed for this entry - * @wrong_if: number of wrong source interface hits - * @lastuse: time of last use of the group (traffic or update) - * @ttls: OIF TTL threshold array - * @refcount: reference count for this entry - * @list: global entry list - * @rcu: used for entry destruction */ struct mfc_cache { - struct rhlist_head mnode; + struct mr_mfc _c; union { struct { __be32 mfc_mcastgrp; @@ -110,28 +95,6 @@ struct mfc_cache { }; struct mfc_cache_cmp_arg cmparg; }; - vifi_t mfc_parent; - int mfc_flags; - - union { - struct { - unsigned long expires; - struct sk_buff_head unresolved; - } unres; - struct { - unsigned long last_assert; - int minvif; - int maxvif; - unsigned long bytes; - unsigned long pkt; - unsigned long wrong_if; - unsigned long lastuse; - unsigned char ttls[MAXVIFS]; - refcount_t refcount; - } res; - } mfc_un; - struct list_head list; - struct rcu_head rcu; }; struct mfc_entry_notifier_info { @@ -155,12 +118,12 @@ static inline void ipmr_cache_free(struct mfc_cache *mfc_cache) static inline void ipmr_cache_put(struct mfc_cache *c) { - if (refcount_dec_and_test(&c->mfc_un.res.refcount)) + if (refcount_dec_and_test(&c->_c.mfc_un.res.refcount)) ipmr_cache_free(c); } static inline void ipmr_cache_hold(struct mfc_cache *c) { - refcount_inc(&c->mfc_un.res.refcount); + refcount_inc(&c->_c.mfc_un.res.refcount); } #endif diff --git a/include/linux/mroute6.h b/include/linux/mroute6.h index d5c8dc155a42..6acf576fc135 100644 --- a/include/linux/mroute6.h +++ b/include/linux/mroute6.h @@ -71,7 +71,7 @@ struct mfc6_cache_cmp_arg { }; struct mfc6_cache { - struct rhlist_head mnode; + struct mr_mfc _c; union { struct { struct in6_addr mf6c_mcastgrp; @@ -79,27 +79,6 @@ struct mfc6_cache { }; struct mfc6_cache_cmp_arg cmparg; }; - mifi_t mf6c_parent; /* Source interface */ - int mfc_flags; /* Flags on line */ - - union { - struct { - unsigned long expires; - struct sk_buff_head unresolved; /* Unresolved buffers */ - } unres; - struct { - unsigned long last_assert; - int minvif; - int maxvif; - unsigned long bytes; - unsigned long pkt; - unsigned long wrong_if; - unsigned long lastuse; - unsigned char ttls[MAXMIFS]; /* TTL thresholds */ - } res; - } mfc_un; - struct list_head list; - struct rcu_head rcu; }; #define MFC_STATIC 1 diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index 805305722803..2769e2f98b32 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -44,6 +44,51 @@ struct vif_device { #define VIF_EXISTS(_mrt, _idx) (!!((_mrt)->vif_table[_idx].dev)) +/** + * struct mr_mfc - common multicast routing entries + * @mnode: rhashtable list + * @mfc_parent: source interface (iif) + * @mfc_flags: entry flags + * @expires: unresolved entry expire time + * @unresolved: unresolved cached skbs + * @last_assert: time of last assert + * @minvif: minimum VIF id + * @maxvif: maximum VIF id + * @bytes: bytes that have passed for this entry + * @pkt: packets that have passed for this entry + * @wrong_if: number of wrong source interface hits + * @lastuse: time of last use of the group (traffic or update) + * @ttls: OIF TTL threshold array + * @refcount: reference count for this entry + * @list: global entry list + * @rcu: used for entry destruction + */ +struct mr_mfc { + struct rhlist_head mnode; + unsigned short mfc_parent; + int mfc_flags; + + union { + struct { + unsigned long expires; + struct sk_buff_head unresolved; + } unres; + struct { + unsigned long last_assert; + int minvif; + int maxvif; + unsigned long bytes; + unsigned long pkt; + unsigned long wrong_if; + unsigned long lastuse; + unsigned char ttls[MAXVIFS]; + refcount_t refcount; + } res; + } mfc_un; + struct list_head list; + struct rcu_head rcu; +}; + /** * struct mr_table - a multicast routing table * @list: entry within a list of multicast routing tables -- cgit v1.2.3 From 845c9a7ae7f5342ba42280c3a2f2aa92bce641d7 Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Wed, 28 Feb 2018 23:29:35 +0200 Subject: ipmr, ip6mr: Unite logic for searching in MFC cache ipmr and ip6mr utilize the exact same methods for searching the hashed resolved connections, difference being only in the construction of the hash comparison key. In order to unite the flow, introduce an mr_table operation set that would contain the protocol specific information required for common flows, in this case - the hash parameters and a comparison key representing a (*,*) route. Signed-off-by: Yuval Mintz Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/mroute_base.h | 52 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 49 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index 2769e2f98b32..46a082e25dab 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -89,10 +89,23 @@ struct mr_mfc { struct rcu_head rcu; }; +struct mr_table; + +/** + * struct mr_table_ops - callbacks and info for protocol-specific ops + * @rht_params: parameters for accessing the MFC hash + * @cmparg_any: a hash key to be used for matching on (*,*) routes + */ +struct mr_table_ops { + const struct rhashtable_params *rht_params; + void *cmparg_any; +}; + /** * struct mr_table - a multicast routing table * @list: entry within a list of multicast routing tables * @net: net where this table belongs + * @ops: protocol specific operations * @id: identifier of the table * @mroute_sk: socket associated with the table * @ipmr_expire_timer: timer for handling unresolved routes @@ -109,6 +122,7 @@ struct mr_mfc { struct mr_table { struct list_head list; possible_net_t net; + struct mr_table_ops ops; u32 id; struct sock __rcu *mroute_sk; struct timer_list ipmr_expire_timer; @@ -133,10 +147,19 @@ void vif_device_init(struct vif_device *v, struct mr_table * mr_table_alloc(struct net *net, u32 id, - const struct rhashtable_params *rht_params, + struct mr_table_ops *ops, void (*expire_func)(struct timer_list *t), void (*table_set)(struct mr_table *mrt, struct net *net)); + +/* These actually return 'struct mr_mfc *', but to avoid need for explicit + * castings they simply return void. + */ +void *mr_mfc_find_parent(struct mr_table *mrt, + void *hasharg, int parent); +void *mr_mfc_find_any_parent(struct mr_table *mrt, int vifi); +void *mr_mfc_find_any(struct mr_table *mrt, int vifi, void *hasharg); + #else static inline void vif_device_init(struct vif_device *v, struct net_device *dev, @@ -147,14 +170,37 @@ static inline void vif_device_init(struct vif_device *v, { } -static inline struct mr_table * +static inline void * mr_table_alloc(struct net *net, u32 id, - const struct rhashtable_params *rht_params, + struct mr_table_ops *ops, void (*expire_func)(struct timer_list *t), void (*table_set)(struct mr_table *mrt, struct net *net)) { return NULL; } + +static inline void *mr_mfc_find_parent(struct mr_table *mrt, + void *hasharg, int parent) +{ + return NULL; +} + +static inline void *mr_mfc_find_any_parent(struct mr_table *mrt, + int vifi) +{ + return NULL; +} + +static inline struct mr_mfc *mr_mfc_find_any(struct mr_table *mrt, + int vifi, void *hasharg) +{ + return NULL; +} #endif + +static inline void *mr_mfc_find(struct mr_table *mrt, void *hasharg) +{ + return mr_mfc_find_parent(mrt, hasharg, -1); +} #endif -- cgit v1.2.3 From c8d6196803265484f7e1cdd1b00a188dc59a5988 Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Wed, 28 Feb 2018 23:29:36 +0200 Subject: ipmr, ip6mr: Unite mfc seq logic With the exception of the final dump, ipmr and ip6mr have the exact same seq logic for traversing a given mr_table. Refactor that code and make it common. Signed-off-by: Yuval Mintz Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/mroute_base.h | 69 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index 46a082e25dab..a007c5ad0fde 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -203,4 +204,72 @@ static inline void *mr_mfc_find(struct mr_table *mrt, void *hasharg) { return mr_mfc_find_parent(mrt, hasharg, -1); } + +#ifdef CONFIG_PROC_FS +struct mr_mfc_iter { + struct seq_net_private p; + struct mr_table *mrt; + struct list_head *cache; + + /* Lock protecting the mr_table's unresolved queue */ + spinlock_t *lock; +}; + +#ifdef CONFIG_IP_MROUTE_COMMON +/* These actually return 'struct mr_mfc *', but to avoid need for explicit + * castings they simply return void. + */ +void *mr_mfc_seq_idx(struct net *net, + struct mr_mfc_iter *it, loff_t pos); +void *mr_mfc_seq_next(struct seq_file *seq, void *v, + loff_t *pos); + +static inline void *mr_mfc_seq_start(struct seq_file *seq, loff_t *pos, + struct mr_table *mrt, spinlock_t *lock) +{ + struct mr_mfc_iter *it = seq->private; + + it->mrt = mrt; + it->cache = NULL; + it->lock = lock; + + return *pos ? mr_mfc_seq_idx(seq_file_net(seq), + seq->private, *pos - 1) + : SEQ_START_TOKEN; +} + +static inline void mr_mfc_seq_stop(struct seq_file *seq, void *v) +{ + struct mr_mfc_iter *it = seq->private; + struct mr_table *mrt = it->mrt; + + if (it->cache == &mrt->mfc_unres_queue) + spin_unlock_bh(it->lock); + else if (it->cache == &mrt->mfc_cache_list) + rcu_read_unlock(); +} +#else +static inline void *mr_mfc_seq_idx(struct net *net, + struct mr_mfc_iter *it, loff_t pos) +{ + return NULL; +} + +static inline void *mr_mfc_seq_next(struct seq_file *seq, void *v, + loff_t *pos) +{ + return NULL; +} + +static inline void *mr_mfc_seq_start(struct seq_file *seq, loff_t *pos, + struct mr_table *mrt, spinlock_t *lock) +{ + return NULL; +} + +static inline void mr_mfc_seq_stop(struct seq_file *seq, void *v) +{ +} +#endif +#endif #endif -- cgit v1.2.3 From 3feda6b46f734704840685a62b645cbe4efb810c Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Wed, 28 Feb 2018 23:29:37 +0200 Subject: ipmr, ip6mr: Unite vif seq functions Same as previously done with the mfc seq, the logic for the vif seq is refactored to be shared between ipmr and ip6mr. Signed-off-by: Yuval Mintz Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/mroute_base.h | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index a007c5ad0fde..cfaec9bd2d3c 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -206,6 +206,12 @@ static inline void *mr_mfc_find(struct mr_table *mrt, void *hasharg) } #ifdef CONFIG_PROC_FS +struct mr_vif_iter { + struct seq_net_private p; + struct mr_table *mrt; + int ct; +}; + struct mr_mfc_iter { struct seq_net_private p; struct mr_table *mrt; @@ -216,6 +222,16 @@ struct mr_mfc_iter { }; #ifdef CONFIG_IP_MROUTE_COMMON +void *mr_vif_seq_idx(struct net *net, struct mr_vif_iter *iter, loff_t pos); +void *mr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos); + +static inline void *mr_vif_seq_start(struct seq_file *seq, loff_t *pos) +{ + return *pos ? mr_vif_seq_idx(seq_file_net(seq), + seq->private, *pos - 1) + : SEQ_START_TOKEN; +} + /* These actually return 'struct mr_mfc *', but to avoid need for explicit * castings they simply return void. */ @@ -249,6 +265,23 @@ static inline void mr_mfc_seq_stop(struct seq_file *seq, void *v) rcu_read_unlock(); } #else +static inline void *mr_vif_seq_idx(struct net *net, struct mr_vif_iter *iter, + loff_t pos) +{ + return NULL; +} + +static inline void *mr_vif_seq_next(struct seq_file *seq, + void *v, loff_t *pos) +{ + return NULL; +} + +static inline void *mr_vif_seq_start(struct seq_file *seq, loff_t *pos) +{ + return NULL; +} + static inline void *mr_mfc_seq_idx(struct net *net, struct mr_mfc_iter *it, loff_t pos) { -- cgit v1.2.3 From 889cd83cbe411dda854429f3223ab2d31a860a4a Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Wed, 28 Feb 2018 23:29:38 +0200 Subject: ip6mr: Remove MFC_NOTIFY and refactor flags MFC_NOTIFY exists in ip6mr, probably as some legacy code [was already removed for ipmr in commit 06bd6c0370bb ("net: ipmr: remove unused MFC_NOTIFY flag and make the flags enum"). Remove it from ip6mr as well, and move the enum into a common file; Notice MFC_OFFLOAD is currently only used by ipmr. Signed-off-by: Yuval Mintz Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/mroute.h | 9 --------- include/linux/mroute6.h | 3 --- include/linux/mroute_base.h | 9 +++++++++ 3 files changed, 9 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mroute.h b/include/linux/mroute.h index 63b36e6c72a0..7ed82e4f11b3 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -65,15 +65,6 @@ struct vif_entry_notifier_info { #define VIFF_STATIC 0x8000 -/* mfc_flags: - * MFC_STATIC - the entry was added statically (not by a routing daemon) - * MFC_OFFLOAD - the entry was offloaded to the hardware - */ -enum { - MFC_STATIC = BIT(0), - MFC_OFFLOAD = BIT(1), -}; - struct mfc_cache_cmp_arg { __be32 mfc_mcastgrp; __be32 mfc_origin; diff --git a/include/linux/mroute6.h b/include/linux/mroute6.h index 6acf576fc135..1ac38e6819f5 100644 --- a/include/linux/mroute6.h +++ b/include/linux/mroute6.h @@ -81,9 +81,6 @@ struct mfc6_cache { }; }; -#define MFC_STATIC 1 -#define MFC_NOTIFY 2 - #define MFC_ASSERT_THRESH (3*HZ) /* Maximal freq. of asserts */ struct rtmsg; diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index cfaec9bd2d3c..f40202b16dae 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -45,6 +45,15 @@ struct vif_device { #define VIF_EXISTS(_mrt, _idx) (!!((_mrt)->vif_table[_idx].dev)) +/* mfc_flags: + * MFC_STATIC - the entry was added statically (not by a routing daemon) + * MFC_OFFLOAD - the entry was offloaded to the hardware + */ +enum { + MFC_STATIC = BIT(0), + MFC_OFFLOAD = BIT(1), +}; + /** * struct mr_mfc - common multicast routing entries * @mnode: rhashtable list -- cgit v1.2.3 From 7b0db85737db3f4d76b2a412e4f19eae59b8b494 Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Wed, 28 Feb 2018 23:29:39 +0200 Subject: ipmr, ip6mr: Unite dumproute flows The various MFC entries are being held in the same kind of mr_tables for both ipmr and ip6mr, and their traversal logic is identical. Also, with the exception of the addresses [and other small tidbits] the major bulk of the nla setting is identical. Unite as much of the dumping as possible between the two. Notice this requires creating an mr_table iterator for each, as the for-each preprocessor macro can't be used by the common logic. Signed-off-by: Yuval Mintz Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/mroute_base.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index f40202b16dae..c2560cb50f1d 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -170,6 +170,16 @@ void *mr_mfc_find_parent(struct mr_table *mrt, void *mr_mfc_find_any_parent(struct mr_table *mrt, int vifi); void *mr_mfc_find_any(struct mr_table *mrt, int vifi, void *hasharg); +int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, + struct mr_mfc *c, struct rtmsg *rtm); +int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb, + struct mr_table *(*iter)(struct net *net, + struct mr_table *mrt), + int (*fill)(struct mr_table *mrt, + struct sk_buff *skb, + u32 portid, u32 seq, struct mr_mfc *c, + int cmd, int flags), + spinlock_t *lock); #else static inline void vif_device_init(struct vif_device *v, struct net_device *dev, @@ -207,6 +217,25 @@ static inline struct mr_mfc *mr_mfc_find_any(struct mr_table *mrt, { return NULL; } + +static inline int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, + struct mr_mfc *c, struct rtmsg *rtm) +{ + return -EINVAL; +} + +static inline int +mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb, + struct mr_table *(*iter)(struct net *net, + struct mr_table *mrt), + int (*fill)(struct mr_table *mrt, + struct sk_buff *skb, + u32 portid, u32 seq, struct mr_mfc *c, + int cmd, int flags), + spinlock_t *lock) +{ + return -EINVAL; +} #endif static inline void *mr_mfc_find(struct mr_table *mrt, void *hasharg) -- cgit v1.2.3 From e8a714e086e42972fd0e2d59e90c28eb2d839429 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 1 Mar 2018 16:08:56 -0800 Subject: net: phy: Export gen10g_* functions In order to remove a fair amount of duplication in the different 10G PHY drivers, export all gen10g_* functions to be able to make use of those. While we are at it, rename gen10g_soft_reset() to gen10g_no_soft_reset() to illustrate what it does. Signed-off-by: Florian Fainelli --- include/linux/phy.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 5a0c3e53e7c2..6e38c699b753 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -994,6 +994,14 @@ int genphy_c45_pma_setup_forced(struct phy_device *phydev); int genphy_c45_an_disable_aneg(struct phy_device *phydev); int genphy_c45_read_mdix(struct phy_device *phydev); +/* The gen10g_* functions are the old Clause 45 stub */ +int gen10g_config_aneg(struct phy_device *phydev); +int gen10g_read_status(struct phy_device *phydev); +int gen10g_no_soft_reset(struct phy_device *phydev); +int gen10g_config_init(struct phy_device *phydev); +int gen10g_suspend(struct phy_device *phydev); +int gen10g_resume(struct phy_device *phydev); + static inline int phy_read_status(struct phy_device *phydev) { if (!phydev->drv) -- cgit v1.2.3 From cceae76ef3a1181242e4f7b559a7bfc904a9855c Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sun, 11 Feb 2018 19:17:20 +0900 Subject: netfilter: nfnetlink_acct: remove useless parameter parameter skb in nfnl_acct_overquota is not used anywhere. Signed-off-by: Taehee Yoo Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nfnetlink_acct.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/nfnetlink_acct.h b/include/linux/netfilter/nfnetlink_acct.h index b4d741195c28..beee8bffe49e 100644 --- a/include/linux/netfilter/nfnetlink_acct.h +++ b/include/linux/netfilter/nfnetlink_acct.h @@ -16,6 +16,5 @@ struct nf_acct; struct nf_acct *nfnl_acct_find_get(struct net *net, const char *filter_name); void nfnl_acct_put(struct nf_acct *acct); void nfnl_acct_update(const struct sk_buff *skb, struct nf_acct *nfacct); -int nfnl_acct_overquota(struct net *net, const struct sk_buff *skb, - struct nf_acct *nfacct); +int nfnl_acct_overquota(struct net *net, struct nf_acct *nfacct); #endif /* _NFNL_ACCT_H */ -- cgit v1.2.3 From 1b293e30f759b03f246baae862bdf35e57b2c39e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 27 Feb 2018 19:42:29 +0100 Subject: netfilter: x_tables: move hook entry checks into core Allow followup patch to change on location instead of three. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 1313b35c3ab7..fa0c19c328f1 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -281,6 +281,8 @@ int xt_check_entry_offsets(const void *base, const char *elems, unsigned int target_offset, unsigned int next_offset); +int xt_check_table_hooks(const struct xt_table_info *info, unsigned int valid_hooks); + unsigned int *xt_alloc_entry_offsets(unsigned int size); bool xt_find_jump_offset(const unsigned int *offsets, unsigned int target, unsigned int size); -- cgit v1.2.3 From c84ca954ac9fa67a6ce27f91f01e4451c74fd8f6 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 27 Feb 2018 19:42:33 +0100 Subject: netfilter: x_tables: add counters allocation wrapper allows to have size checks in a single spot. This is supposed to reduce oom situations when fuzz-testing xtables. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index fa0c19c328f1..0bd93c589a8c 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -301,6 +301,7 @@ int xt_data_to_user(void __user *dst, const void *src, void *xt_copy_counters_from_user(const void __user *user, unsigned int len, struct xt_counters_info *info, bool compat); +struct xt_counters *xt_counters_alloc(unsigned int counters); struct xt_table *xt_register_table(struct net *net, const struct xt_table *table, -- cgit v1.2.3 From 9782a11efc072faaf91d4aa60e9d23553f918029 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 27 Feb 2018 19:42:34 +0100 Subject: netfilter: compat: prepare xt_compat_init_offsets to return errors should have no impact, function still always returns 0. This patch is only to ease review. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 0bd93c589a8c..7bd896dc78df 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -510,7 +510,7 @@ void xt_compat_unlock(u_int8_t af); int xt_compat_add_offset(u_int8_t af, unsigned int offset, int delta); void xt_compat_flush_offsets(u_int8_t af); -void xt_compat_init_offsets(u_int8_t af, unsigned int number); +int xt_compat_init_offsets(u8 af, unsigned int number); int xt_compat_calc_jump(u_int8_t af, unsigned int offset); int xt_compat_match_offset(const struct xt_match *match); -- cgit v1.2.3 From a9db0ecf1578894ea3405f3eb5a441508840d479 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Wed, 16 Aug 2017 09:43:48 +0300 Subject: {net,IB}/mlx5: Add has_tag to mlx5_flow_act The has_tag member will indicate whether a tag action was specified in flow specification. A flow tag 0 = MLX5_FS_DEFAULT_FLOW_TAG is assumed a valid flow tag that is currently used by mlx5 RDMA driver, whereas in HW flow_tag = 0 means that the user doesn't care about flow_tag. HW always provide a flow_tag = 0 if all flow tags requested on a specific flow are 0. So we need a way (in the driver) to differentiate between a user really requesting flow_tag = 0 and a user who does not care, in order to be able to report conflicting flow tags on a specific flow. Signed-off-by: Matan Barak Reviewed-by: Aviad Yehezkel Signed-off-by: Saeed Mahameed --- include/linux/mlx5/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index a0b48afcb422..f580bc4c2443 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -141,6 +141,7 @@ void mlx5_destroy_flow_group(struct mlx5_flow_group *fg); struct mlx5_flow_act { u32 action; + bool has_flow_tag; u32 flow_tag; u32 encap_id; u32 modify_id; -- cgit v1.2.3 From 5f4183781a303da5ab6731b8c19328c5b9df89fa Mon Sep 17 00:00:00 2001 From: Aviad Yehezkel Date: Sun, 18 Feb 2018 13:17:17 +0200 Subject: net/mlx5: Add empty egress namespace to flow steering core Currently, we don't support egress flow steering namespace in mlx5 flow steering core implementation. However, when we want to encrypt a packet, we model it as a flow steering rule in the egress path. To overcome this, we add an empty egress namespace to flow steering. This namespace is initialized only when ipsec support exists. In the future, this will grow to a full blown full steering implementation, resembling the ingress path. Signed-off-by: Matan Barak Signed-off-by: Aviad Yehezkel Signed-off-by: Saeed Mahameed --- include/linux/mlx5/fs.h | 1 + include/linux/mlx5/mlx5_ifc.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index f580bc4c2443..744ea228acea 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -69,6 +69,7 @@ enum mlx5_flow_namespace_type { MLX5_FLOW_NAMESPACE_ESW_INGRESS, MLX5_FLOW_NAMESPACE_SNIFFER_RX, MLX5_FLOW_NAMESPACE_SNIFFER_TX, + MLX5_FLOW_NAMESPACE_EGRESS, }; struct mlx5_flow_table; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index f4e417686f62..9bc4ea0cf5a9 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1091,6 +1091,7 @@ enum mlx5_flow_destination_type { MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE = 0x1, MLX5_FLOW_DESTINATION_TYPE_TIR = 0x2, + MLX5_FLOW_DESTINATION_TYPE_PORT = 0x99, MLX5_FLOW_DESTINATION_TYPE_COUNTER = 0x100, }; -- cgit v1.2.3 From 3346c4873733a109bea29467308a754038b886a9 Mon Sep 17 00:00:00 2001 From: Boris Pismenny Date: Sun, 20 Aug 2017 15:13:08 +0300 Subject: {net,IB}/mlx5: Add flow steering helpers Add helper functions that check if a protocol is part of a flow steering match criteria. Signed-off-by: Boris Pismenny Signed-off-by: Matan Barak Signed-off-by: Aviad Yehezkel Signed-off-by: Saeed Mahameed --- include/linux/mlx5/fs_helpers.h | 134 ++++++++++++++++++++++++++++++++++++++++ include/linux/mlx5/mlx5_ifc.h | 8 ++- 2 files changed, 140 insertions(+), 2 deletions(-) create mode 100644 include/linux/mlx5/fs_helpers.h (limited to 'include/linux') diff --git a/include/linux/mlx5/fs_helpers.h b/include/linux/mlx5/fs_helpers.h new file mode 100644 index 000000000000..7b476bbae731 --- /dev/null +++ b/include/linux/mlx5/fs_helpers.h @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2018, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _MLX5_FS_HELPERS_ +#define _MLX5_FS_HELPERS_ + +#include + +#define MLX5_FS_IPV4_VERSION 4 +#define MLX5_FS_IPV6_VERSION 6 + +static inline bool _mlx5_fs_is_outer_ipproto_flow(const u32 *match_c, + const u32 *match_v, u8 match) +{ + const void *headers_c = MLX5_ADDR_OF(fte_match_param, match_c, + outer_headers); + const void *headers_v = MLX5_ADDR_OF(fte_match_param, match_v, + outer_headers); + + return MLX5_GET(fte_match_set_lyr_2_4, headers_c, ip_protocol) == 0xff && + MLX5_GET(fte_match_set_lyr_2_4, headers_v, ip_protocol) == match; +} + +static inline bool mlx5_fs_is_outer_tcp_flow(const u32 *match_c, + const u32 *match_v) +{ + return _mlx5_fs_is_outer_ipproto_flow(match_c, match_v, IPPROTO_TCP); +} + +static inline bool mlx5_fs_is_outer_udp_flow(const u32 *match_c, + const u32 *match_v) +{ + return _mlx5_fs_is_outer_ipproto_flow(match_c, match_v, IPPROTO_UDP); +} + +static inline bool mlx5_fs_is_vxlan_flow(const u32 *match_c) +{ + void *misc_params_c = MLX5_ADDR_OF(fte_match_param, match_c, + misc_parameters); + + return MLX5_GET(fte_match_set_misc, misc_params_c, vxlan_vni); +} + +static inline bool _mlx5_fs_is_outer_ipv_flow(struct mlx5_core_dev *mdev, + const u32 *match_c, + const u32 *match_v, int version) +{ + int match_ipv = MLX5_CAP_FLOWTABLE_NIC_RX(mdev, + ft_field_support.outer_ip_version); + const void *headers_c = MLX5_ADDR_OF(fte_match_param, match_c, + outer_headers); + const void *headers_v = MLX5_ADDR_OF(fte_match_param, match_v, + outer_headers); + + if (!match_ipv) { + u16 ethertype; + + switch (version) { + case MLX5_FS_IPV4_VERSION: + ethertype = ETH_P_IP; + break; + case MLX5_FS_IPV6_VERSION: + ethertype = ETH_P_IPV6; + break; + default: + return false; + } + + return MLX5_GET(fte_match_set_lyr_2_4, headers_c, + ethertype) == 0xffff && + MLX5_GET(fte_match_set_lyr_2_4, headers_v, + ethertype) == ethertype; + } + + return MLX5_GET(fte_match_set_lyr_2_4, headers_c, + ip_version) == 0xf && + MLX5_GET(fte_match_set_lyr_2_4, headers_v, + ip_version) == version; +} + +static inline bool +mlx5_fs_is_outer_ipv4_flow(struct mlx5_core_dev *mdev, const u32 *match_c, + const u32 *match_v) +{ + return _mlx5_fs_is_outer_ipv_flow(mdev, match_c, match_v, + MLX5_FS_IPV4_VERSION); +} + +static inline bool +mlx5_fs_is_outer_ipv6_flow(struct mlx5_core_dev *mdev, const u32 *match_c, + const u32 *match_v) +{ + return _mlx5_fs_is_outer_ipv_flow(mdev, match_c, match_v, + MLX5_FS_IPV6_VERSION); +} + +static inline bool mlx5_fs_is_outer_ipsec_flow(const u32 *match_c) +{ + void *misc_params_c = + MLX5_ADDR_OF(fte_match_param, match_c, misc_parameters); + + return MLX5_GET(fte_match_set_misc, misc_params_c, outer_esp_spi); +} + +#endif diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 9bc4ea0cf5a9..14ad84afe8ba 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -295,7 +295,9 @@ struct mlx5_ifc_flow_table_fields_supported_bits { u8 inner_tcp_dport[0x1]; u8 inner_tcp_flags[0x1]; u8 reserved_at_37[0x9]; - u8 reserved_at_40[0x1a]; + u8 reserved_at_40[0x17]; + u8 outer_esp_spi[0x1]; + u8 reserved_at_58[0x2]; u8 bth_dst_qp[0x1]; u8 reserved_at_5b[0x25]; @@ -437,7 +439,9 @@ struct mlx5_ifc_fte_match_set_misc_bits { u8 reserved_at_120[0x28]; u8 bth_dst_qp[0x18]; - u8 reserved_at_160[0xa0]; + u8 reserved_at_160[0x20]; + u8 outer_esp_spi[0x20]; + u8 reserved_at_1a0[0x60]; }; struct mlx5_ifc_cmd_pas_bits { -- cgit v1.2.3 From 1ec54cb44e6731c3cb251bcf9251d65a4b4f6306 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 6 Mar 2018 10:56:31 +0100 Subject: net: unpollute priv_flags space the ipvlan device driver defines and uses 2 bits inside the priv_flags net_device field. Such bits and the related helper are used only inside the ipvlan device driver, and the core networking does not need to be aware of them. This change moves netif_is_ipvlan* helper in the ipvlan driver and re-implement them looking for ipvlan specific symbols instead of using priv_flags. Overall this frees two bits inside priv_flags - and move the following ones to avoid gaps - without any intended functional change. Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/linux/netdevice.h | 32 ++++++++------------------------ 1 file changed, 8 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index dbe6344b727a..95a613a7cc1c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1381,8 +1381,6 @@ struct net_device_ops { * @IFF_MACVLAN: Macvlan device * @IFF_XMIT_DST_RELEASE_PERM: IFF_XMIT_DST_RELEASE not taking into account * underlying stacked devices - * @IFF_IPVLAN_MASTER: IPvlan master device - * @IFF_IPVLAN_SLAVE: IPvlan slave device * @IFF_L3MDEV_MASTER: device is an L3 master device * @IFF_NO_QUEUE: device can run without qdisc attached * @IFF_OPENVSWITCH: device is a Open vSwitch master @@ -1412,16 +1410,14 @@ enum netdev_priv_flags { IFF_LIVE_ADDR_CHANGE = 1<<15, IFF_MACVLAN = 1<<16, IFF_XMIT_DST_RELEASE_PERM = 1<<17, - IFF_IPVLAN_MASTER = 1<<18, - IFF_IPVLAN_SLAVE = 1<<19, - IFF_L3MDEV_MASTER = 1<<20, - IFF_NO_QUEUE = 1<<21, - IFF_OPENVSWITCH = 1<<22, - IFF_L3MDEV_SLAVE = 1<<23, - IFF_TEAM = 1<<24, - IFF_RXFH_CONFIGURED = 1<<25, - IFF_PHONY_HEADROOM = 1<<26, - IFF_MACSEC = 1<<27, + IFF_L3MDEV_MASTER = 1<<18, + IFF_NO_QUEUE = 1<<19, + IFF_OPENVSWITCH = 1<<20, + IFF_L3MDEV_SLAVE = 1<<21, + IFF_TEAM = 1<<22, + IFF_RXFH_CONFIGURED = 1<<23, + IFF_PHONY_HEADROOM = 1<<24, + IFF_MACSEC = 1<<25, }; #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN @@ -1442,8 +1438,6 @@ enum netdev_priv_flags { #define IFF_LIVE_ADDR_CHANGE IFF_LIVE_ADDR_CHANGE #define IFF_MACVLAN IFF_MACVLAN #define IFF_XMIT_DST_RELEASE_PERM IFF_XMIT_DST_RELEASE_PERM -#define IFF_IPVLAN_MASTER IFF_IPVLAN_MASTER -#define IFF_IPVLAN_SLAVE IFF_IPVLAN_SLAVE #define IFF_L3MDEV_MASTER IFF_L3MDEV_MASTER #define IFF_NO_QUEUE IFF_NO_QUEUE #define IFF_OPENVSWITCH IFF_OPENVSWITCH @@ -4223,16 +4217,6 @@ static inline bool netif_is_macvlan_port(const struct net_device *dev) return dev->priv_flags & IFF_MACVLAN_PORT; } -static inline bool netif_is_ipvlan(const struct net_device *dev) -{ - return dev->priv_flags & IFF_IPVLAN_SLAVE; -} - -static inline bool netif_is_ipvlan_port(const struct net_device *dev) -{ - return dev->priv_flags & IFF_IPVLAN_MASTER; -} - static inline bool netif_is_bond_master(const struct net_device *dev) { return dev->flags & IFF_MASTER && dev->priv_flags & IFF_BONDING; -- cgit v1.2.3 From 581fdddee420cebe2cb781cb3c84c82676a86949 Mon Sep 17 00:00:00 2001 From: Yossi Kuperman Date: Sun, 22 Oct 2017 19:43:58 +0300 Subject: net/mlx5: IPSec, Generalize sandbox QP commands The current code assume only SA QP commands. Refactor in order to pave the way for new QP commands: 1. Generic cmd response format. 2. SA cmd checks are in dedicated functions. 3. Aligned debug prints. Signed-off-by: Yossi Kuperman Signed-off-by: Aviad Yehezkel Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc_fpga.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc_fpga.h b/include/linux/mlx5/mlx5_ifc_fpga.h index 255a88d08078..7283fe780f93 100644 --- a/include/linux/mlx5/mlx5_ifc_fpga.h +++ b/include/linux/mlx5/mlx5_ifc_fpga.h @@ -429,4 +429,20 @@ struct mlx5_ifc_ipsec_counters_bits { u8 dropped_cmd[0x40]; }; +enum mlx5_ifc_fpga_ipsec_response_syndrome { + MLX5_FPGA_IPSEC_RESPONSE_SUCCESS = 0, + MLX5_FPGA_IPSEC_RESPONSE_ILLEGAL_REQUEST = 1, + MLX5_FPGA_IPSEC_RESPONSE_SADB_ISSUE = 2, + MLX5_FPGA_IPSEC_RESPONSE_WRITE_RESPONSE_ISSUE = 3, +}; + +struct mlx5_ifc_fpga_ipsec_cmd_resp { + __be32 syndrome; + union { + __be32 sw_sa_handle; + __be32 flags; + }; + u8 reserved[24]; +} __packed; + #endif /* MLX5_IFC_FPGA_H */ -- cgit v1.2.3 From 788a8210764ce2977095010931959c87b60c2f51 Mon Sep 17 00:00:00 2001 From: Yossi Kuperman Date: Sun, 22 Oct 2017 19:45:45 +0300 Subject: net/mlx5e: IPSec, Add support for ESP trailer removal by hardware Current hardware decrypts and authenticates incoming ESP packets. Subsequently, the software extracts the nexthdr field, truncates the trailer and adjusts csum accordingly. With this patch and a capable device, the trailer is being removed by the hardware and the nexthdr field is conveyed via PET. This way we avoid both the need to access the trailer (cache miss) and to compute its relative checksum, which significantly improve the performance. Experiment shows that trailer removal improves the performance by 2Gbps, (netperf). Both forwarding and host-to-host configurations. Signed-off-by: Yossi Kuperman Signed-off-by: Aviad Yehezkel Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc_fpga.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc_fpga.h b/include/linux/mlx5/mlx5_ifc_fpga.h index 7283fe780f93..643544db180b 100644 --- a/include/linux/mlx5/mlx5_ifc_fpga.h +++ b/include/linux/mlx5/mlx5_ifc_fpga.h @@ -373,7 +373,8 @@ struct mlx5_ifc_fpga_destroy_qp_out_bits { struct mlx5_ifc_ipsec_extended_cap_bits { u8 encapsulation[0x20]; - u8 reserved_0[0x15]; + u8 reserved_0[0x14]; + u8 rx_no_trailer[0x1]; u8 ipv4_fragment[0x1]; u8 ipv6[0x1]; u8 esn[0x1]; @@ -445,4 +446,14 @@ struct mlx5_ifc_fpga_ipsec_cmd_resp { u8 reserved[24]; } __packed; +enum mlx5_ifc_fpga_ipsec_cap { + MLX5_FPGA_IPSEC_CAP_NO_TRAILER = BIT(0), +}; + +struct mlx5_ifc_fpga_ipsec_cmd_cap { + __be32 cmd; + __be32 flags; + u8 reserved[24]; +} __packed; + #endif /* MLX5_IFC_FPGA_H */ -- cgit v1.2.3 From 65802f480008066636a43173b12388bb3fb7bd3a Mon Sep 17 00:00:00 2001 From: Aviad Yehezkel Date: Tue, 16 Jan 2018 16:12:22 +0200 Subject: net/mlx5: IPSec, Add command V2 support This patch adds V2 command support. New fpga devices support extended features (udp encap, esn etc...), this features require new hardware sadb format therefore we have a new version of commands to manipulate it. Signed-off-by: Yossef Efraim Signed-off-by: Aviad Yehezkel Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc_fpga.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc_fpga.h b/include/linux/mlx5/mlx5_ifc_fpga.h index 643544db180b..dd7e4538159c 100644 --- a/include/linux/mlx5/mlx5_ifc_fpga.h +++ b/include/linux/mlx5/mlx5_ifc_fpga.h @@ -373,7 +373,9 @@ struct mlx5_ifc_fpga_destroy_qp_out_bits { struct mlx5_ifc_ipsec_extended_cap_bits { u8 encapsulation[0x20]; - u8 reserved_0[0x14]; + u8 reserved_0[0x12]; + u8 v2_command[0x1]; + u8 udp_encap[0x1]; u8 rx_no_trailer[0x1]; u8 ipv4_fragment[0x1]; u8 ipv6[0x1]; -- cgit v1.2.3 From 1d2005e2040b95af4c861e40cf806ff44cd7c883 Mon Sep 17 00:00:00 2001 From: Aviad Yehezkel Date: Mon, 29 Jan 2018 15:05:50 +0200 Subject: net/mlx5: Export ipsec capabilities We will need that for ipsec verbs. Signed-off-by: Aviad Yehezkel Signed-off-by: Saeed Mahameed --- include/linux/mlx5/accel.h | 57 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 include/linux/mlx5/accel.h (limited to 'include/linux') diff --git a/include/linux/mlx5/accel.h b/include/linux/mlx5/accel.h new file mode 100644 index 000000000000..601280c782d3 --- /dev/null +++ b/include/linux/mlx5/accel.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2018 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#ifndef __MLX5_ACCEL_H__ +#define __MLX5_ACCEL_H__ + +#include + +enum mlx5_accel_ipsec_caps { + MLX5_ACCEL_IPSEC_CAP_DEVICE = 1 << 0, + MLX5_ACCEL_IPSEC_CAP_ESP = 1 << 2, + MLX5_ACCEL_IPSEC_CAP_IPV6 = 1 << 3, + MLX5_ACCEL_IPSEC_CAP_LSO = 1 << 4, + MLX5_ACCEL_IPSEC_CAP_RX_NO_TRAILER = 1 << 5, + MLX5_ACCEL_IPSEC_CAP_V2_CMD = 1 << 6, +}; + +#ifdef CONFIG_MLX5_ACCEL + +u32 mlx5_accel_ipsec_device_caps(struct mlx5_core_dev *mdev); + +#else + +static inline u32 mlx5_accel_ipsec_device_caps(struct mlx5_core_dev *mdev) { return 0; } + +#endif +#endif -- cgit v1.2.3 From af9fe19d660e333ca9b0a6e1506e684a1126b9e7 Mon Sep 17 00:00:00 2001 From: Aviad Yehezkel Date: Wed, 17 Jan 2018 11:20:33 +0200 Subject: net/mlx5: Added required metadata capability for ipsec Currently our device requires additional metadata in packet to perform ipsec crypto offload. Signed-off-by: Aviad Yehezkel Signed-off-by: Saeed Mahameed --- include/linux/mlx5/accel.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/accel.h b/include/linux/mlx5/accel.h index 601280c782d3..b674af63689b 100644 --- a/include/linux/mlx5/accel.h +++ b/include/linux/mlx5/accel.h @@ -38,6 +38,7 @@ enum mlx5_accel_ipsec_caps { MLX5_ACCEL_IPSEC_CAP_DEVICE = 1 << 0, + MLX5_ACCEL_IPSEC_CAP_REQUIRED_METADATA = 1 << 1, MLX5_ACCEL_IPSEC_CAP_ESP = 1 << 2, MLX5_ACCEL_IPSEC_CAP_IPV6 = 1 << 3, MLX5_ACCEL_IPSEC_CAP_LSO = 1 << 4, -- cgit v1.2.3 From d6c4f0298cec8c4c88d33aca17c066995e92fe91 Mon Sep 17 00:00:00 2001 From: Aviad Yehezkel Date: Thu, 18 Jan 2018 13:05:48 +0200 Subject: net/mlx5: Refactor accel IPSec code The current code has one layer that executed FPGA commands and the Ethernet part directly used this code. Since downstream patches introduces support for IPSec in mlx5_ib, we need to provide some abstractions. This patch refactors the accel code into one layer that creates a software IPSec transformation and another one which creates the actual hardware context. The internal command implementation is now hidden in the FPGA core layer. The code also adds the ability to share FPGA hardware contexts. If two contexts are the same, only a reference count is taken. Signed-off-by: Aviad Yehezkel Signed-off-by: Saeed Mahameed --- include/linux/mlx5/accel.h | 83 +++++++++++++++++++++++++++++++++++++- include/linux/mlx5/mlx5_ifc_fpga.h | 59 +++++++++++++++++++++++++++ 2 files changed, 140 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/accel.h b/include/linux/mlx5/accel.h index b674af63689b..da6de465ea6d 100644 --- a/include/linux/mlx5/accel.h +++ b/include/linux/mlx5/accel.h @@ -36,23 +36,102 @@ #include -enum mlx5_accel_ipsec_caps { +enum mlx5_accel_esp_aes_gcm_keymat_iv_algo { + MLX5_ACCEL_ESP_AES_GCM_IV_ALGO_SEQ, +}; + +enum mlx5_accel_esp_flags { + MLX5_ACCEL_ESP_FLAGS_TUNNEL = 0, /* Default */ + MLX5_ACCEL_ESP_FLAGS_TRANSPORT = 1UL << 0, + MLX5_ACCEL_ESP_FLAGS_ESN_TRIGGERED = 1UL << 1, + MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP = 1UL << 2, +}; + +enum mlx5_accel_esp_action { + MLX5_ACCEL_ESP_ACTION_DECRYPT, + MLX5_ACCEL_ESP_ACTION_ENCRYPT, +}; + +enum mlx5_accel_esp_keymats { + MLX5_ACCEL_ESP_KEYMAT_AES_NONE, + MLX5_ACCEL_ESP_KEYMAT_AES_GCM, +}; + +enum mlx5_accel_esp_replay { + MLX5_ACCEL_ESP_REPLAY_NONE, + MLX5_ACCEL_ESP_REPLAY_BMP, +}; + +struct aes_gcm_keymat { + u64 seq_iv; + enum mlx5_accel_esp_aes_gcm_keymat_iv_algo iv_algo; + + u32 salt; + u32 icv_len; + + u32 key_len; + u32 aes_key[256 / 32]; +}; + +struct mlx5_accel_esp_xfrm_attrs { + enum mlx5_accel_esp_action action; + u32 esn; + u32 spi; + u32 seq; + u32 tfc_pad; + u32 flags; + u32 sa_handle; + enum mlx5_accel_esp_replay replay_type; + union { + struct { + u32 size; + + } bmp; + } replay; + enum mlx5_accel_esp_keymats keymat_type; + union { + struct aes_gcm_keymat aes_gcm; + } keymat; +}; + +struct mlx5_accel_esp_xfrm { + struct mlx5_core_dev *mdev; + struct mlx5_accel_esp_xfrm_attrs attrs; +}; + +enum { + MLX5_ACCEL_XFRM_FLAG_REQUIRE_METADATA = 1UL << 0, +}; + +enum mlx5_accel_ipsec_cap { MLX5_ACCEL_IPSEC_CAP_DEVICE = 1 << 0, MLX5_ACCEL_IPSEC_CAP_REQUIRED_METADATA = 1 << 1, MLX5_ACCEL_IPSEC_CAP_ESP = 1 << 2, MLX5_ACCEL_IPSEC_CAP_IPV6 = 1 << 3, MLX5_ACCEL_IPSEC_CAP_LSO = 1 << 4, MLX5_ACCEL_IPSEC_CAP_RX_NO_TRAILER = 1 << 5, - MLX5_ACCEL_IPSEC_CAP_V2_CMD = 1 << 6, }; #ifdef CONFIG_MLX5_ACCEL u32 mlx5_accel_ipsec_device_caps(struct mlx5_core_dev *mdev); +struct mlx5_accel_esp_xfrm * +mlx5_accel_esp_create_xfrm(struct mlx5_core_dev *mdev, + const struct mlx5_accel_esp_xfrm_attrs *attrs, + u32 flags); +void mlx5_accel_esp_destroy_xfrm(struct mlx5_accel_esp_xfrm *xfrm); + #else static inline u32 mlx5_accel_ipsec_device_caps(struct mlx5_core_dev *mdev) { return 0; } +static inline struct mlx5_accel_esp_xfrm * +mlx5_accel_esp_create_xfrm(struct mlx5_core_dev *mdev, + const struct mlx5_accel_esp_xfrm_attrs *attrs, + u32 flags) { return ERR_PTR(-EOPNOTSUPP); } +static inline void +mlx5_accel_esp_destroy_xfrm(struct mlx5_accel_esp_xfrm *xfrm) {} + #endif #endif diff --git a/include/linux/mlx5/mlx5_ifc_fpga.h b/include/linux/mlx5/mlx5_ifc_fpga.h index dd7e4538159c..debcc57de43a 100644 --- a/include/linux/mlx5/mlx5_ifc_fpga.h +++ b/include/linux/mlx5/mlx5_ifc_fpga.h @@ -448,6 +448,15 @@ struct mlx5_ifc_fpga_ipsec_cmd_resp { u8 reserved[24]; } __packed; +enum mlx5_ifc_fpga_ipsec_cmd_opcode { + MLX5_FPGA_IPSEC_CMD_OP_ADD_SA = 0, + MLX5_FPGA_IPSEC_CMD_OP_DEL_SA = 1, + MLX5_FPGA_IPSEC_CMD_OP_ADD_SA_V2 = 2, + MLX5_FPGA_IPSEC_CMD_OP_DEL_SA_V2 = 3, + MLX5_FPGA_IPSEC_CMD_OP_MOD_SA_V2 = 4, + MLX5_FPGA_IPSEC_CMD_OP_SET_CAP = 5, +}; + enum mlx5_ifc_fpga_ipsec_cap { MLX5_FPGA_IPSEC_CAP_NO_TRAILER = BIT(0), }; @@ -458,4 +467,54 @@ struct mlx5_ifc_fpga_ipsec_cmd_cap { u8 reserved[24]; } __packed; +enum mlx5_ifc_fpga_ipsec_sa_flags { + MLX5_FPGA_IPSEC_SA_IPV6 = BIT(2), + MLX5_FPGA_IPSEC_SA_DIR_SX = BIT(3), + MLX5_FPGA_IPSEC_SA_SPI_EN = BIT(4), + MLX5_FPGA_IPSEC_SA_SA_VALID = BIT(5), + MLX5_FPGA_IPSEC_SA_IP_ESP = BIT(6), + MLX5_FPGA_IPSEC_SA_IP_AH = BIT(7), +}; + +enum mlx5_ifc_fpga_ipsec_sa_enc_mode { + MLX5_FPGA_IPSEC_SA_ENC_MODE_NONE = 0, + MLX5_FPGA_IPSEC_SA_ENC_MODE_AES_GCM_128_AUTH_128 = 1, + MLX5_FPGA_IPSEC_SA_ENC_MODE_AES_GCM_256_AUTH_128 = 3, +}; + +struct mlx5_ifc_fpga_ipsec_sa_v1 { + __be32 cmd; + u8 key_enc[32]; + u8 key_auth[32]; + __be32 sip[4]; + __be32 dip[4]; + union { + struct { + __be32 reserved; + u8 salt_iv[8]; + __be32 salt; + } __packed gcm; + struct { + u8 salt[16]; + } __packed cbc; + }; + __be32 spi; + __be32 sw_sa_handle; + __be16 tfclen; + u8 enc_mode; + u8 reserved1[2]; + u8 flags; + u8 reserved2[2]; +}; + +struct mlx5_ifc_fpga_ipsec_sa { + struct mlx5_ifc_fpga_ipsec_sa_v1 ipsec_sa_v1; + __be16 udp_sp; + __be16 udp_dp; + u8 reserved1[4]; + __be32 esn; + __be16 vid; /* only 12 bits, rest is reserved */ + __be16 reserved2; +} __packed; + #endif /* MLX5_IFC_FPGA_H */ -- cgit v1.2.3 From 05564d0ae075b7a73339eaa05296c3034e439c32 Mon Sep 17 00:00:00 2001 From: Aviad Yehezkel Date: Sun, 18 Feb 2018 15:07:20 +0200 Subject: net/mlx5: Add flow-steering commands for FPGA IPSec implementation In order to add a context to the FPGA, we need to get both the software transform context (which includes the keys, etc) and the source/destination IPs (which are included in the steering rule). Therefore, we register new set of firmware like commands for the FPGA. Each time a rule is added, the steering core infrastructure calls the FPGA command layer. If the rule is intended for the FPGA, it combines the IPs information with the software transformation context and creates the respective hardware transform. Afterwards, it calls the standard steering command layer. Signed-off-by: Aviad Yehezkel Signed-off-by: Saeed Mahameed --- include/linux/mlx5/accel.h | 5 +++++ include/linux/mlx5/fs.h | 3 +++ 2 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/accel.h b/include/linux/mlx5/accel.h index da6de465ea6d..6c694709b0a2 100644 --- a/include/linux/mlx5/accel.h +++ b/include/linux/mlx5/accel.h @@ -121,6 +121,8 @@ mlx5_accel_esp_create_xfrm(struct mlx5_core_dev *mdev, const struct mlx5_accel_esp_xfrm_attrs *attrs, u32 flags); void mlx5_accel_esp_destroy_xfrm(struct mlx5_accel_esp_xfrm *xfrm); +int mlx5_accel_esp_modify_xfrm(struct mlx5_accel_esp_xfrm *xfrm, + const struct mlx5_accel_esp_xfrm_attrs *attrs); #else @@ -132,6 +134,9 @@ mlx5_accel_esp_create_xfrm(struct mlx5_core_dev *mdev, u32 flags) { return ERR_PTR(-EOPNOTSUPP); } static inline void mlx5_accel_esp_destroy_xfrm(struct mlx5_accel_esp_xfrm *xfrm) {} +static inline int +mlx5_accel_esp_modify_xfrm(struct mlx5_accel_esp_xfrm *xfrm, + const struct mlx5_accel_esp_xfrm_attrs *attrs) { return -EOPNOTSUPP; } #endif #endif diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 744ea228acea..b957e52434f8 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -40,6 +40,8 @@ enum { MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO = 1 << 16, + MLX5_FLOW_CONTEXT_ACTION_ENCRYPT = 1 << 17, + MLX5_FLOW_CONTEXT_ACTION_DECRYPT = 1 << 18, }; enum { @@ -146,6 +148,7 @@ struct mlx5_flow_act { u32 flow_tag; u32 encap_id; u32 modify_id; + uintptr_t esp_id; }; #define MLX5_DECLARE_FLOW_ACT(name) \ -- cgit v1.2.3 From cb01008390bb0645d4728c7f8825e32d4b540a30 Mon Sep 17 00:00:00 2001 From: Aviad Yehezkel Date: Thu, 18 Jan 2018 16:02:17 +0200 Subject: net/mlx5: IPSec, Add support for ESN Currently ESN is not supported with IPSec device offload. This patch adds ESN support to IPsec device offload. Implementing new xfrm device operation to synchronize offloading device ESN with xfrm received SN. New QP command to update SA state at the following: ESN 1 ESN 2 ESN 3 |-----------*-----------|-----------*-----------|-----------* ^ ^ ^ ^ ^ ^ ^ - marks where QP command invoked to update the SA ESN state machine. | - marks the start of the ESN scope (0-2^32-1). At this point move SA ESN overlap bit to zero and increment ESN. * - marks the middle of the ESN scope (2^31). At this point move SA ESN overlap bit to one. Signed-off-by: Aviad Yehezkel Signed-off-by: Yossef Efraim Signed-off-by: Saeed Mahameed --- include/linux/mlx5/accel.h | 2 ++ include/linux/mlx5/mlx5_ifc_fpga.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/accel.h b/include/linux/mlx5/accel.h index 6c694709b0a2..70e7e5673ce9 100644 --- a/include/linux/mlx5/accel.h +++ b/include/linux/mlx5/accel.h @@ -110,6 +110,8 @@ enum mlx5_accel_ipsec_cap { MLX5_ACCEL_IPSEC_CAP_IPV6 = 1 << 3, MLX5_ACCEL_IPSEC_CAP_LSO = 1 << 4, MLX5_ACCEL_IPSEC_CAP_RX_NO_TRAILER = 1 << 5, + MLX5_ACCEL_IPSEC_CAP_ESN = 1 << 6, + MLX5_ACCEL_IPSEC_CAP_TX_IV_IS_ESN = 1 << 7, }; #ifdef CONFIG_MLX5_ACCEL diff --git a/include/linux/mlx5/mlx5_ifc_fpga.h b/include/linux/mlx5/mlx5_ifc_fpga.h index debcc57de43a..ec052491ba3d 100644 --- a/include/linux/mlx5/mlx5_ifc_fpga.h +++ b/include/linux/mlx5/mlx5_ifc_fpga.h @@ -468,6 +468,8 @@ struct mlx5_ifc_fpga_ipsec_cmd_cap { } __packed; enum mlx5_ifc_fpga_ipsec_sa_flags { + MLX5_FPGA_IPSEC_SA_ESN_EN = BIT(0), + MLX5_FPGA_IPSEC_SA_ESN_OVERLAP = BIT(1), MLX5_FPGA_IPSEC_SA_IPV6 = BIT(2), MLX5_FPGA_IPSEC_SA_DIR_SX = BIT(3), MLX5_FPGA_IPSEC_SA_SPI_EN = BIT(4), -- cgit v1.2.3 From 84a1d9c4820080bebcbd413a845076dcb62f45fa Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Thu, 8 Mar 2018 15:45:03 +0000 Subject: net: ethtool: extend RXNFC API to support RSS spreading of filter matches We use a two-step process to configure a filter with RSS spreading. First, the RSS context is allocated and configured using ETHTOOL_SRSSH; this returns an identifier (rss_context) which can then be passed to subsequent invocations of ETHTOOL_SRXCLSRLINS to specify that the offset from the RSS indirection table lookup should be added to the queue number (ring_cookie) when delivering the packet. Drivers for devices which can only use the indirection table entry directly (not add it to a base queue number) should reject rule insertions combining RSS with a nonzero ring_cookie. Signed-off-by: Edward Cree Signed-off-by: David S. Miller --- include/linux/ethtool.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 2ec41a7eb54f..ebe41811ed34 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -371,6 +371,11 @@ struct ethtool_ops { u8 *hfunc); int (*set_rxfh)(struct net_device *, const u32 *indir, const u8 *key, const u8 hfunc); + int (*get_rxfh_context)(struct net_device *, u32 *indir, u8 *key, + u8 *hfunc, u32 rss_context); + int (*set_rxfh_context)(struct net_device *, const u32 *indir, + const u8 *key, const u8 hfunc, + u32 *rss_context, bool delete); void (*get_channels)(struct net_device *, struct ethtool_channels *); int (*set_channels)(struct net_device *, struct ethtool_channels *); int (*get_dump_flag)(struct net_device *, struct ethtool_dump *); -- cgit v1.2.3 From 79134e6ce2c9d1a00eab4d98cb48f975dd2474cb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 8 Mar 2018 12:51:41 -0800 Subject: net: do not create fallback tunnels for non-default namespaces fallback tunnels (like tunl0, gre0, gretap0, erspan0, sit0, ip6tnl0, ip6gre0) are automatically created when the corresponding module is loaded. These tunnels are also automatically created when a new network namespace is created, at a great cost. In many cases, netns are used for isolation purposes, and these extra network devices are a waste of resources. We are using thousands of netns per host, and hit the netns creation/delete bottleneck a lot. (Many thanks to Kirill for recent work on this) Add a new sysctl so that we can opt-out from this automatic creation. Note that these tunnels are still created for the initial namespace, to be the least intrusive for typical setups. Tested: lpk43:~# cat add_del_unshare.sh for i in `seq 1 40` do (for j in `seq 1 100` ; do unshare -n /bin/true >/dev/null ; done) & done wait lpk43:~# echo 0 >/proc/sys/net/core/fb_tunnels_only_for_init_net lpk43:~# time ./add_del_unshare.sh real 0m37.521s user 0m0.886s sys 7m7.084s lpk43:~# echo 1 >/proc/sys/net/core/fb_tunnels_only_for_init_net lpk43:~# time ./add_del_unshare.sh real 0m4.761s user 0m0.851s sys 1m8.343s lpk43:~# Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 95a613a7cc1c..9711108c3916 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -585,6 +585,13 @@ struct netdev_queue { #endif } ____cacheline_aligned_in_smp; +extern int sysctl_fb_tunnels_only_for_init_net; + +static inline bool net_has_fallback_tunnels(const struct net *net) +{ + return net == &init_net || !sysctl_fb_tunnels_only_for_init_net; +} + static inline int netdev_queue_numa_node_read(const struct netdev_queue *q) { #if defined(CONFIG_XPS) && defined(CONFIG_NUMA) -- cgit v1.2.3 From f5426250a6ecfd1e9b2d5e0daf07565f664aa67d Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 9 Mar 2018 10:39:24 +0100 Subject: net: introduce IFF_NO_RX_HANDLER Some network devices - notably ipvlan slave - are not compatible with any kind of rx_handler. Currently the hook can be installed but any configuration (bridge, bond, macsec, ...) is nonfunctional. This change allocates a priv_flag bit to mark such devices and explicitly forbid installing a rx_handler if such bit is set. The new bit is used by ipvlan slave device. Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9711108c3916..5fbb9f1da7fd 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1397,6 +1397,7 @@ struct net_device_ops { * @IFF_PHONY_HEADROOM: the headroom value is controlled by an external * entity (i.e. the master device for bridged veth) * @IFF_MACSEC: device is a MACsec device + * @IFF_NO_RX_HANDLER: device doesn't support the rx_handler hook */ enum netdev_priv_flags { IFF_802_1Q_VLAN = 1<<0, @@ -1425,6 +1426,7 @@ enum netdev_priv_flags { IFF_RXFH_CONFIGURED = 1<<23, IFF_PHONY_HEADROOM = 1<<24, IFF_MACSEC = 1<<25, + IFF_NO_RX_HANDLER = 1<<26, }; #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN @@ -1452,6 +1454,7 @@ enum netdev_priv_flags { #define IFF_TEAM IFF_TEAM #define IFF_RXFH_CONFIGURED IFF_RXFH_CONFIGURED #define IFF_MACSEC IFF_MACSEC +#define IFF_NO_RX_HANDLER IFF_NO_RX_HANDLER /** * struct net_device - The DEVICE structure. -- cgit v1.2.3 From be9fc0971a5c27b791608cf9705a04fe96dbd395 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 13 Mar 2018 12:44:53 +0100 Subject: net: fix sysctl_fb_tunnels_only_for_init_net link error The new variable is only available when CONFIG_SYSCTL is enabled, otherwise we get a link error: net/ipv4/ip_tunnel.o: In function `ip_tunnel_init_net': ip_tunnel.c:(.text+0x278b): undefined reference to `sysctl_fb_tunnels_only_for_init_net' net/ipv6/sit.o: In function `sit_init_net': sit.c:(.init.text+0x4c): undefined reference to `sysctl_fb_tunnels_only_for_init_net' net/ipv6/ip6_tunnel.o: In function `ip6_tnl_init_net': ip6_tunnel.c:(.init.text+0x39): undefined reference to `sysctl_fb_tunnels_only_for_init_net' This adds an extra condition, keeping the traditional behavior when CONFIG_SYSCTL is disabled. Fixes: 79134e6ce2c9 ("net: do not create fallback tunnels for non-default namespaces") Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5fbb9f1da7fd..913b1cc882cf 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -589,7 +589,9 @@ extern int sysctl_fb_tunnels_only_for_init_net; static inline bool net_has_fallback_tunnels(const struct net *net) { - return net == &init_net || !sysctl_fb_tunnels_only_for_init_net; + return net == &init_net || + !IS_ENABLED(CONFIG_SYSCTL) || + !sysctl_fb_tunnels_only_for_init_net; } static inline int netdev_queue_numa_node_read(const struct netdev_queue *q) -- cgit v1.2.3 From 79ffdfc6522ae33d8a33e971070c08ee5f27439b Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 14 Mar 2018 22:17:20 +0300 Subject: net: Add rtnl_lock_killable() rtnl_lock() is widely used mutex in kernel. Some of kernel code does memory allocations under it. In case of memory deficit this may invoke OOM killer, but the problem is a killed task can't exit if it's waiting for the mutex. This may be a reason of deadlock and panic. This patch adds a new primitive, which responds on SIGKILL, and it allows to use it in the places, where we don't want to sleep forever. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 3573b4bf2fdf..562a175c35a9 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -33,6 +33,7 @@ extern void rtnl_lock(void); extern void rtnl_unlock(void); extern int rtnl_trylock(void); extern int rtnl_is_locked(void); +extern int rtnl_lock_killable(void); extern wait_queue_head_t netdev_unregistering_wq; extern struct rw_semaphore net_sem; -- cgit v1.2.3 From 312fc2b4c82e96a48cb2d0da2bd4816eb253c499 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 18 Mar 2018 12:57:00 -0700 Subject: net: do_tcp_sendpages flag to avoid SKBTX_SHARED_FRAG When calling do_tcp_sendpages() from in kernel and we know the data has no references from user side we can omit SKBTX_SHARED_FRAG flag. This patch adds an internal flag, NO_SKBTX_SHARED_FRAG that can be used to omit setting SKBTX_SHARED_FRAG. The flag is not exposed to userspace because the sendpage call from the splice logic masks out all bits except MSG_MORE. Signed-off-by: John Fastabend Acked-by: David S. Miller Signed-off-by: Daniel Borkmann --- include/linux/socket.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/socket.h b/include/linux/socket.h index 1ce1f768a58c..60e01482a9c4 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -287,6 +287,7 @@ struct ucred { #define MSG_SENDPAGE_NOTLAST 0x20000 /* sendpage() internal : not the last page */ #define MSG_BATCH 0x40000 /* sendmmsg(): more messages coming */ #define MSG_EOF MSG_FIN +#define MSG_NO_SHARED_FRAGS 0x80000 /* sendpage() internal : page frags are not shared */ #define MSG_ZEROCOPY 0x4000000 /* Use user data in kernel path */ #define MSG_FASTOPEN 0x20000000 /* Send data in TCP SYN */ -- cgit v1.2.3 From 4f738adba30a7cfc006f605707e7aee847ffefa0 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 18 Mar 2018 12:57:10 -0700 Subject: bpf: create tcp_bpf_ulp allowing BPF to monitor socket TX/RX data This implements a BPF ULP layer to allow policy enforcement and monitoring at the socket layer. In order to support this a new program type BPF_PROG_TYPE_SK_MSG is used to run the policy at the sendmsg/sendpage hook. To attach the policy to sockets a sockmap is used with a new program attach type BPF_SK_MSG_VERDICT. Similar to previous sockmap usages when a sock is added to a sockmap, via a map update, if the map contains a BPF_SK_MSG_VERDICT program type attached then the BPF ULP layer is created on the socket and the attached BPF_PROG_TYPE_SK_MSG program is run for every msg in sendmsg case and page/offset in sendpage case. BPF_PROG_TYPE_SK_MSG Semantics/API: BPF_PROG_TYPE_SK_MSG supports only two return codes SK_PASS and SK_DROP. Returning SK_DROP free's the copied data in the sendmsg case and in the sendpage case leaves the data untouched. Both cases return -EACESS to the user. Returning SK_PASS will allow the msg to be sent. In the sendmsg case data is copied into kernel space buffers before running the BPF program. The kernel space buffers are stored in a scatterlist object where each element is a kernel memory buffer. Some effort is made to coalesce data from the sendmsg call here. For example a sendmsg call with many one byte iov entries will likely be pushed into a single entry. The BPF program is run with data pointers (start/end) pointing to the first sg element. In the sendpage case data is not copied. We opt not to copy the data by default here, because the BPF infrastructure does not know what bytes will be needed nor when they will be needed. So copying all bytes may be wasteful. Because of this the initial start/end data pointers are (0,0). Meaning no data can be read or written. This avoids reading data that may be modified by the user. A new helper is added later in this series if reading and writing the data is needed. The helper call will do a copy by default so that the page is exclusively owned by the BPF call. The verdict from the BPF_PROG_TYPE_SK_MSG applies to the entire msg in the sendmsg() case and the entire page/offset in the sendpage case. This avoids ambiguity on how to handle mixed return codes in the sendmsg case. Again a helper is added later in the series if a verdict needs to apply to multiple system calls and/or only a subpart of the currently being processed message. The helper msg_redirect_map() can be used to select the socket to send the data on. This is used similar to existing redirect use cases. This allows policy to redirect msgs. Pseudo code simple example: The basic logic to attach a program to a socket is as follows, // load the programs bpf_prog_load(SOCKMAP_TCP_MSG_PROG, BPF_PROG_TYPE_SK_MSG, &obj, &msg_prog); // lookup the sockmap bpf_map_msg = bpf_object__find_map_by_name(obj, "my_sock_map"); // get fd for sockmap map_fd_msg = bpf_map__fd(bpf_map_msg); // attach program to sockmap bpf_prog_attach(msg_prog, map_fd_msg, BPF_SK_MSG_VERDICT, 0); Adding sockets to the map is done in the normal way, // Add a socket 'fd' to sockmap at location 'i' bpf_map_update_elem(map_fd_msg, &i, fd, BPF_ANY); After the above any socket attached to "my_sock_map", in this case 'fd', will run the BPF msg verdict program (msg_prog) on every sendmsg and sendpage system call. For a complete example see BPF selftests or sockmap samples. Implementation notes: It seemed the simplest, to me at least, to use a refcnt to ensure psock is not lost across the sendmsg copy into the sg, the bpf program running on the data in sg_data, and the final pass to the TCP stack. Some performance testing may show a better method to do this and avoid the refcnt cost, but for now use the simpler method. Another item that will come after basic support is in place is supporting MSG_MORE flag. At the moment we call sendpages even if the MSG_MORE flag is set. An enhancement would be to collect the pages into a larger scatterlist and pass down the stack. Notice that bpf_tcp_sendmsg() could support this with some additional state saved across sendmsg calls. I built the code to support this without having to do refactoring work. Other features TBD include ZEROCOPY and the TCP_RECV_QUEUE/TCP_NO_QUEUE support. This will follow initial series shortly. Future work could improve size limits on the scatterlist rings used here. Currently, we use MAX_SKB_FRAGS simply because this was being used already in the TLS case. Future work could extend the kernel sk APIs to tune this depending on workload. This is a trade-off between memory usage and throughput performance. Signed-off-by: John Fastabend Acked-by: David S. Miller Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 1 + include/linux/bpf_types.h | 1 + include/linux/filter.h | 17 +++++++++++++++++ 3 files changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 66df387106de..819229c80eca 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -21,6 +21,7 @@ struct bpf_verifier_env; struct perf_event; struct bpf_prog; struct bpf_map; +struct sock; /* map is generic key/value storage optionally accesible by eBPF programs */ struct bpf_map_ops { diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 19b8349a3809..5e2e8a49fb21 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -13,6 +13,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit) BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops) BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb) +BPF_PROG_TYPE(BPF_PROG_TYPE_SK_MSG, sk_msg) #endif #ifdef CONFIG_BPF_EVENTS BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe) diff --git a/include/linux/filter.h b/include/linux/filter.h index fdb691b520c0..109d05ccea9a 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -507,6 +507,22 @@ struct xdp_buff { struct xdp_rxq_info *rxq; }; +struct sk_msg_buff { + void *data; + void *data_end; + __u32 apply_bytes; + __u32 cork_bytes; + int sg_copybreak; + int sg_start; + int sg_curr; + int sg_end; + struct scatterlist sg_data[MAX_SKB_FRAGS]; + bool sg_copy[MAX_SKB_FRAGS]; + __u32 key; + __u32 flags; + struct bpf_map *map; +}; + /* Compute the linear packet data range [data, data_end) which * will be accessed by various program types (cls_bpf, act_bpf, * lwt, ...). Subsystems allowing direct data access must (!) @@ -771,6 +787,7 @@ xdp_data_meta_unsupported(const struct xdp_buff *xdp) void bpf_warn_invalid_xdp_action(u32 act); struct sock *do_sk_redirect_map(struct sk_buff *skb); +struct sock *do_msg_redirect_map(struct sk_msg_buff *md); #ifdef CONFIG_BPF_JIT extern int bpf_jit_enable; -- cgit v1.2.3 From b9193c1b61ddb97da4713155b0d580e41fb544ac Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Sat, 24 Mar 2018 11:44:22 -0700 Subject: bpf: Rename bpf_verifer_log bpf_verifer_log => bpf_verifier_log Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 6b66cd1aa0b9..c30668414b22 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -153,7 +153,7 @@ struct bpf_insn_aux_data { #define BPF_VERIFIER_TMP_LOG_SIZE 1024 -struct bpf_verifer_log { +struct bpf_verifier_log { u32 level; char kbuf[BPF_VERIFIER_TMP_LOG_SIZE]; char __user *ubuf; @@ -161,7 +161,7 @@ struct bpf_verifer_log { u32 len_total; }; -static inline bool bpf_verifier_log_full(const struct bpf_verifer_log *log) +static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log) { return log->len_used >= log->len_total - 1; } @@ -185,7 +185,7 @@ struct bpf_verifier_env { bool allow_ptr_leaks; bool seen_direct_write; struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ - struct bpf_verifer_log log; + struct bpf_verifier_log log; u32 subprog_starts[BPF_MAX_SUBPROGS]; /* computes the stack depth of each bpf function */ u16 subprog_stack_depth[BPF_MAX_SUBPROGS + 1]; -- cgit v1.2.3 From 77d2e05abd45886dcad2b632c738cf46b9f7c19e Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Sat, 24 Mar 2018 11:44:23 -0700 Subject: bpf: Add bpf_verifier_vlog() and bpf_verifier_log_needed() The BTF (BPF Type Format) verifier needs to reuse the current BPF verifier log. Hence, it requires the following changes: (1) Expose log_write() in verifier.c for other users. Its name is renamed to bpf_verifier_vlog(). (2) The BTF verifier also needs to check 'log->level && log->ubuf && !bpf_verifier_log_full(log);' independently outside of the current log_write(). It is because the BTF verifier will do one-check before making multiple calls to btf_verifier_vlog to log the details of a type. Hence, this check is also re-factored to a new function bpf_verifier_log_needed(). Since it is re-factored, we can check it before va_start() in the current bpf_verifier_log_write() and verbose(). Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c30668414b22..7e61c395fddf 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -166,6 +166,11 @@ static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log) return log->len_used >= log->len_total - 1; } +static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) +{ + return log->level && log->ubuf && !bpf_verifier_log_full(log); +} + #define BPF_MAX_SUBPROGS 256 /* single container for all structs @@ -192,6 +197,8 @@ struct bpf_verifier_env { u32 subprog_cnt; }; +void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, + va_list args); __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, const char *fmt, ...); -- cgit v1.2.3 From ede2762d93ff16e0974f7446516b46b1022db213 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 23 Mar 2018 19:47:19 +0300 Subject: net: Make NETDEV_XXX commands enum { } This patch is preparation to drop NETDEV_UNREGISTER_FINAL. Since the cmd is used in usnic_ib_netdev_event_to_string() to get cmd name, after plain removing NETDEV_UNREGISTER_FINAL from everywhere, we'd have holes in event2str[] in this function. Instead of that, let's make NETDEV_XXX commands names available for everyone, and to define netdev_cmd_to_name() in the way we won't have to shaffle names after their numbers are changed. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- include/linux/netdevice.h | 69 ++++++++++++++++++++++++----------------------- 1 file changed, 36 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 913b1cc882cf..dd5a04c971d5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2312,43 +2312,46 @@ struct netdev_lag_lower_state_info { #include -/* netdevice notifier chain. Please remember to update the rtnetlink - * notification exclusion list in rtnetlink_event() when adding new - * types. +/* netdevice notifier chain. Please remember to update netdev_cmd_to_name() + * and the rtnetlink notification exclusion list in rtnetlink_event() when + * adding new types. */ -#define NETDEV_UP 0x0001 /* For now you can't veto a device up/down */ -#define NETDEV_DOWN 0x0002 -#define NETDEV_REBOOT 0x0003 /* Tell a protocol stack a network interface +enum netdev_cmd { + NETDEV_UP = 1, /* For now you can't veto a device up/down */ + NETDEV_DOWN, + NETDEV_REBOOT, /* Tell a protocol stack a network interface detected a hardware crash and restarted - we can use this eg to kick tcp sessions once done */ -#define NETDEV_CHANGE 0x0004 /* Notify device state change */ -#define NETDEV_REGISTER 0x0005 -#define NETDEV_UNREGISTER 0x0006 -#define NETDEV_CHANGEMTU 0x0007 /* notify after mtu change happened */ -#define NETDEV_CHANGEADDR 0x0008 -#define NETDEV_GOING_DOWN 0x0009 -#define NETDEV_CHANGENAME 0x000A -#define NETDEV_FEAT_CHANGE 0x000B -#define NETDEV_BONDING_FAILOVER 0x000C -#define NETDEV_PRE_UP 0x000D -#define NETDEV_PRE_TYPE_CHANGE 0x000E -#define NETDEV_POST_TYPE_CHANGE 0x000F -#define NETDEV_POST_INIT 0x0010 -#define NETDEV_UNREGISTER_FINAL 0x0011 -#define NETDEV_RELEASE 0x0012 -#define NETDEV_NOTIFY_PEERS 0x0013 -#define NETDEV_JOIN 0x0014 -#define NETDEV_CHANGEUPPER 0x0015 -#define NETDEV_RESEND_IGMP 0x0016 -#define NETDEV_PRECHANGEMTU 0x0017 /* notify before mtu change happened */ -#define NETDEV_CHANGEINFODATA 0x0018 -#define NETDEV_BONDING_INFO 0x0019 -#define NETDEV_PRECHANGEUPPER 0x001A -#define NETDEV_CHANGELOWERSTATE 0x001B -#define NETDEV_UDP_TUNNEL_PUSH_INFO 0x001C -#define NETDEV_UDP_TUNNEL_DROP_INFO 0x001D -#define NETDEV_CHANGE_TX_QUEUE_LEN 0x001E + NETDEV_CHANGE, /* Notify device state change */ + NETDEV_REGISTER, + NETDEV_UNREGISTER, + NETDEV_CHANGEMTU, /* notify after mtu change happened */ + NETDEV_CHANGEADDR, + NETDEV_GOING_DOWN, + NETDEV_CHANGENAME, + NETDEV_FEAT_CHANGE, + NETDEV_BONDING_FAILOVER, + NETDEV_PRE_UP, + NETDEV_PRE_TYPE_CHANGE, + NETDEV_POST_TYPE_CHANGE, + NETDEV_POST_INIT, + NETDEV_UNREGISTER_FINAL, + NETDEV_RELEASE, + NETDEV_NOTIFY_PEERS, + NETDEV_JOIN, + NETDEV_CHANGEUPPER, + NETDEV_RESEND_IGMP, + NETDEV_PRECHANGEMTU, /* notify before mtu change happened */ + NETDEV_CHANGEINFODATA, + NETDEV_BONDING_INFO, + NETDEV_PRECHANGEUPPER, + NETDEV_CHANGELOWERSTATE, + NETDEV_UDP_TUNNEL_PUSH_INFO, + NETDEV_UDP_TUNNEL_DROP_INFO, + NETDEV_CHANGE_TX_QUEUE_LEN, +}; +const char *netdev_cmd_to_name(enum netdev_cmd cmd); int register_netdevice_notifier(struct notifier_block *nb); int unregister_netdevice_notifier(struct notifier_block *nb); -- cgit v1.2.3 From 070f2d7e264acd6316fc24092b7f51a18c75ac9c Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 23 Mar 2018 19:47:39 +0300 Subject: net: Drop NETDEV_UNREGISTER_FINAL Last user is gone after bdf5bd7f2132 "rds: tcp: remove register_netdevice_notifier infrastructure.", so we can remove this netdevice command. This allows to delete rtnl_lock() in netdev_run_todo(), which is hot path for net namespace unregistration. dev_change_net_namespace() and netdev_wait_allrefs() have rcu_barrier() before NETDEV_UNREGISTER_FINAL call, and the source commits say they were introduced to delemit the call with NETDEV_UNREGISTER, but this patch leaves them on the places, since they require additional analysis, whether we need in them for something else. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index dd5a04c971d5..2a2d9cf50aa2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2336,7 +2336,6 @@ enum netdev_cmd { NETDEV_PRE_TYPE_CHANGE, NETDEV_POST_TYPE_CHANGE, NETDEV_POST_INIT, - NETDEV_UNREGISTER_FINAL, NETDEV_RELEASE, NETDEV_NOTIFY_PEERS, NETDEV_JOIN, -- cgit v1.2.3 From bc67a0daf8f3bc6fa8fcb68090f3c444de7f951c Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Mon, 26 Mar 2018 15:01:31 +0300 Subject: ipmr: Make vif fib notifiers common The fib-notifiers are tightly coupled with the vif_device which is already common. Move the notifier struct definition and helpers to the common file; Currently they're only used by ipmr. Signed-off-by: Yuval Mintz Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/linux/mroute.h | 8 ------- include/linux/mroute_base.h | 53 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mroute.h b/include/linux/mroute.h index 7ed82e4f11b3..3f70a04a5879 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -55,14 +55,6 @@ static inline bool ipmr_rule_default(const struct fib_rule *rule) } #endif -struct vif_entry_notifier_info { - struct fib_notifier_info info; - struct net_device *dev; - vifi_t vif_index; - unsigned short vif_flags; - u32 tb_id; -}; - #define VIFF_STATIC 0x8000 struct mfc_cache_cmp_arg { diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index c2560cb50f1d..23326f5402f3 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -6,6 +6,7 @@ #include #include #include +#include /** * struct vif_device - interface representor for multicast routing @@ -36,6 +37,58 @@ struct vif_device { __be32 local, remote; }; +struct vif_entry_notifier_info { + struct fib_notifier_info info; + struct net_device *dev; + unsigned short vif_index; + unsigned short vif_flags; + u32 tb_id; +}; + +static inline int mr_call_vif_notifier(struct notifier_block *nb, + struct net *net, + unsigned short family, + enum fib_event_type event_type, + struct vif_device *vif, + unsigned short vif_index, u32 tb_id) +{ + struct vif_entry_notifier_info info = { + .info = { + .family = family, + .net = net, + }, + .dev = vif->dev, + .vif_index = vif_index, + .vif_flags = vif->flags, + .tb_id = tb_id, + }; + + return call_fib_notifier(nb, net, event_type, &info.info); +} + +static inline int mr_call_vif_notifiers(struct net *net, + unsigned short family, + enum fib_event_type event_type, + struct vif_device *vif, + unsigned short vif_index, u32 tb_id, + unsigned int *ipmr_seq) +{ + struct vif_entry_notifier_info info = { + .info = { + .family = family, + .net = net, + }, + .dev = vif->dev, + .vif_index = vif_index, + .vif_flags = vif->flags, + .tb_id = tb_id, + }; + + ASSERT_RTNL(); + (*ipmr_seq)++; + return call_fib_notifiers(net, event_type, &info.info); +} + #ifndef MAXVIFS /* This one is nasty; value is defined in uapi using different symbols for * mroute and morute6 but both map into same 32. -- cgit v1.2.3 From 54c4cad97b8fd414909b78d4274a6797baa52b3b Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Mon, 26 Mar 2018 15:01:32 +0300 Subject: ipmr: Make MFC fib notifiers common Like vif notifications, move the notifier struct for MFC as well as its helpers into a common file; Currently they're only used by ipmr. Signed-off-by: Yuval Mintz Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/linux/mroute.h | 6 ------ include/linux/mroute_base.h | 44 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mroute.h b/include/linux/mroute.h index 3f70a04a5879..c855d80b51f7 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -80,12 +80,6 @@ struct mfc_cache { }; }; -struct mfc_entry_notifier_info { - struct fib_notifier_info info; - struct mfc_cache *mfc; - u32 tb_id; -}; - struct rtmsg; int ipmr_get_route(struct net *net, struct sk_buff *skb, __be32 saddr, __be32 daddr, diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index 23326f5402f3..2c594686c05e 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -152,6 +152,50 @@ struct mr_mfc { struct rcu_head rcu; }; +struct mfc_entry_notifier_info { + struct fib_notifier_info info; + struct mr_mfc *mfc; + u32 tb_id; +}; + +static inline int mr_call_mfc_notifier(struct notifier_block *nb, + struct net *net, + unsigned short family, + enum fib_event_type event_type, + struct mr_mfc *mfc, u32 tb_id) +{ + struct mfc_entry_notifier_info info = { + .info = { + .family = family, + .net = net, + }, + .mfc = mfc, + .tb_id = tb_id + }; + + return call_fib_notifier(nb, net, event_type, &info.info); +} + +static inline int mr_call_mfc_notifiers(struct net *net, + unsigned short family, + enum fib_event_type event_type, + struct mr_mfc *mfc, u32 tb_id, + unsigned int *ipmr_seq) +{ + struct mfc_entry_notifier_info info = { + .info = { + .family = family, + .net = net, + }, + .mfc = mfc, + .tb_id = tb_id + }; + + ASSERT_RTNL(); + (*ipmr_seq)++; + return call_fib_notifiers(net, event_type, &info.info); +} + struct mr_table; /** -- cgit v1.2.3 From cdc9f9443b5c3a61c7cec807965054ee1fd29acf Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Mon, 26 Mar 2018 15:01:33 +0300 Subject: ipmr: Make ipmr_dump() common Since all the primitive elements used for the notification done by ipmr are now common [mr_table, mr_mfc, vif_device] we can refactor the logic for dumping them to a common file. Signed-off-by: Yuval Mintz Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/linux/mroute_base.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index 2c594686c05e..289eb5aa7b5d 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -277,6 +277,13 @@ int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb, u32 portid, u32 seq, struct mr_mfc *c, int cmd, int flags), spinlock_t *lock); + +int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family, + int (*rules_dump)(struct net *net, + struct notifier_block *nb), + struct mr_table *(*mr_iter)(struct net *net, + struct mr_table *mrt), + rwlock_t *mrt_lock); #else static inline void vif_device_init(struct vif_device *v, struct net_device *dev, @@ -333,6 +340,17 @@ mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb, { return -EINVAL; } + +static inline int mr_dump(struct net *net, struct notifier_block *nb, + unsigned short family, + int (*rules_dump)(struct net *net, + struct notifier_block *nb), + struct mr_table *(*mr_iter)(struct net *net, + struct mr_table *mrt), + rwlock_t *mrt_lock) +{ + return -EINVAL; +} #endif static inline void *mr_mfc_find(struct mr_table *mrt, void *hasharg) -- cgit v1.2.3 From d3c07e5b9939a055fa017f200e535ae947eb22ab Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Mon, 26 Mar 2018 15:01:35 +0300 Subject: ip6mr: Add API for default_rule fib Add the ability to discern whether a given FIB rule notification relates to the default rule inserted when registering ip6mr or a different one. Would later be used by drivers wishing to offload ipv6 multicast routes but unable to offload rules other than the default one. Signed-off-by: Yuval Mintz Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/linux/mroute6.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mroute6.h b/include/linux/mroute6.h index 1ac38e6819f5..c4a45859f586 100644 --- a/include/linux/mroute6.h +++ b/include/linux/mroute6.h @@ -8,6 +8,7 @@ #include #include #include +#include #ifdef CONFIG_IPV6_MROUTE static inline int ip6_mroute_opt(int opt) @@ -63,6 +64,15 @@ static inline void ip6_mr_cleanup(void) } #endif +#ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES +bool ip6mr_rule_default(const struct fib_rule *rule); +#else +static inline bool ip6mr_rule_default(const struct fib_rule *rule) +{ + return true; +} +#endif + #define VIFF_STATIC 0x8000 struct mfc6_cache_cmp_arg { -- cgit v1.2.3 From 8c13af2a219c6498071b30ea558438c74267ae4d Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Mon, 26 Mar 2018 15:01:36 +0300 Subject: ip6mr: Add refcounting to mfc Since ipmr and ip6mr are using the same mr_mfc struct at their core, we can now refactor the ipmr_cache_{hold,put} logic and apply refcounting to both ipmr and ip6mr. Signed-off-by: Yuval Mintz Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/linux/mroute.h | 19 ------------------- include/linux/mroute_base.h | 13 +++++++++++++ 2 files changed, 13 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mroute.h b/include/linux/mroute.h index c855d80b51f7..9a36fad9e068 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -84,23 +84,4 @@ struct rtmsg; int ipmr_get_route(struct net *net, struct sk_buff *skb, __be32 saddr, __be32 daddr, struct rtmsg *rtm, u32 portid); - -#ifdef CONFIG_IP_MROUTE -void ipmr_cache_free(struct mfc_cache *mfc_cache); -#else -static inline void ipmr_cache_free(struct mfc_cache *mfc_cache) -{ -} -#endif - -static inline void ipmr_cache_put(struct mfc_cache *c) -{ - if (refcount_dec_and_test(&c->_c.mfc_un.res.refcount)) - ipmr_cache_free(c); -} -static inline void ipmr_cache_hold(struct mfc_cache *c) -{ - refcount_inc(&c->_c.mfc_un.res.refcount); -} - #endif diff --git a/include/linux/mroute_base.h b/include/linux/mroute_base.h index 289eb5aa7b5d..d617fe45543e 100644 --- a/include/linux/mroute_base.h +++ b/include/linux/mroute_base.h @@ -125,6 +125,7 @@ enum { * @refcount: reference count for this entry * @list: global entry list * @rcu: used for entry destruction + * @free: Operation used for freeing an entry under RCU */ struct mr_mfc { struct rhlist_head mnode; @@ -150,8 +151,20 @@ struct mr_mfc { } mfc_un; struct list_head list; struct rcu_head rcu; + void (*free)(struct rcu_head *head); }; +static inline void mr_cache_put(struct mr_mfc *c) +{ + if (refcount_dec_and_test(&c->mfc_un.res.refcount)) + call_rcu(&c->rcu, c->free); +} + +static inline void mr_cache_hold(struct mr_mfc *c) +{ + refcount_inc(&c->mfc_un.res.refcount); +} + struct mfc_entry_notifier_info { struct fib_notifier_info info; struct mr_mfc *mfc; -- cgit v1.2.3 From 2fcb12df7d2fa5a004fc3e7f589e58a08f7ed8c9 Mon Sep 17 00:00:00 2001 From: Inbar Karmy Date: Thu, 17 Aug 2017 16:39:47 +0300 Subject: net/mlx5e: Expose PFC stall prevention counters Add the needed capability bit and counters to device spec description. Expose the following two counters in ethtool: tx_pause_storm_warning_events: when the device is stalled for a period longer than a pre-configured watermark, the counter increase, allowing the debug utility an insight into current device status. tx_pause_storm_error_events: when the device is stalled for a period longer than a pre-configured timeout, the pause transmission is disabled, and the counter increase. Signed-off-by: Inbar Karmy Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 4 ++++ include/linux/mlx5/mlx5_ifc.h | 28 +++++++++++++++++++++++++--- 2 files changed, 29 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index e5258ee4e38b..4b5939c78cdd 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1013,6 +1013,7 @@ enum mlx5_cap_type { MLX5_CAP_RESERVED, MLX5_CAP_VECTOR_CALC, MLX5_CAP_QOS, + MLX5_CAP_DEBUG, /* NUM OF CAP Types */ MLX5_CAP_NUM }; @@ -1140,6 +1141,9 @@ enum mlx5_qcam_feature_groups { #define MLX5_CAP_QOS(mdev, cap)\ MLX5_GET(qos_cap, mdev->caps.hca_cur[MLX5_CAP_QOS], cap) +#define MLX5_CAP_DEBUG(mdev, cap)\ + MLX5_GET(debug_cap, mdev->caps.hca_cur[MLX5_CAP_DEBUG], cap) + #define MLX5_CAP_PCAM_FEATURE(mdev, fld) \ MLX5_GET(pcam_reg, (mdev)->caps.pcam, feature_cap_mask.enhanced_features.fld) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 14ad84afe8ba..c7d50eccff9e 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -593,6 +593,16 @@ struct mlx5_ifc_qos_cap_bits { u8 reserved_at_100[0x700]; }; +struct mlx5_ifc_debug_cap_bits { + u8 reserved_at_0[0x20]; + + u8 reserved_at_20[0x2]; + u8 stall_detect[0x1]; + u8 reserved_at_23[0x1d]; + + u8 reserved_at_40[0x7c0]; +}; + struct mlx5_ifc_per_protocol_networking_offload_caps_bits { u8 csum_cap[0x1]; u8 vlan_cap[0x1]; @@ -855,7 +865,7 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 out_of_seq_cnt[0x1]; u8 vport_counters[0x1]; u8 retransmission_q_counters[0x1]; - u8 reserved_at_183[0x1]; + u8 debug[0x1]; u8 modify_rq_counter_set_id[0x1]; u8 rq_delay_drop[0x1]; u8 max_qp_cnt[0xa]; @@ -1572,7 +1582,17 @@ struct mlx5_ifc_eth_per_prio_grp_data_layout_bits { u8 rx_pause_transition_low[0x20]; - u8 reserved_at_3c0[0x400]; + u8 reserved_at_3c0[0x40]; + + u8 device_stall_minor_watermark_cnt_high[0x20]; + + u8 device_stall_minor_watermark_cnt_low[0x20]; + + u8 device_stall_critical_watermark_cnt_high[0x20]; + + u8 device_stall_critical_watermark_cnt_low[0x20]; + + u8 reserved_at_480[0x340]; }; struct mlx5_ifc_eth_extended_cntrs_grp_data_layout_bits { @@ -7874,8 +7894,10 @@ struct mlx5_ifc_peir_reg_bits { }; struct mlx5_ifc_pcam_enhanced_features_bits { - u8 reserved_at_0[0x7b]; + u8 reserved_at_0[0x76]; + u8 pfcc_mask[0x1]; + u8 reserved_at_77[0x4]; u8 rx_buffer_fullness_counters[0x1]; u8 ptys_connector_type[0x1]; u8 reserved_at_7d[0x1]; -- cgit v1.2.3 From 2afa609f5c970185a8cae73f6a4caadf97fbea54 Mon Sep 17 00:00:00 2001 From: Inbar Karmy Date: Mon, 20 Nov 2017 18:06:20 +0200 Subject: net/mlx5e: PFC stall prevention support Implement set/get functions to configure PFC stall prevention timeout by tunables api through ethtool. By default the stall prevention timeout is configured to 8 sec. Timeout range is: 80-8000 msec. Enabling stall prevention with the auto timeout will set the timeout to 100 msec. Signed-off-by: Inbar Karmy Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 17 +++++++++++++---- include/linux/mlx5/port.h | 6 ++++++ 2 files changed, 19 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index c7d50eccff9e..f3200a9696d6 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -7833,7 +7833,11 @@ struct mlx5_ifc_pifr_reg_bits { struct mlx5_ifc_pfcc_reg_bits { u8 reserved_at_0[0x8]; u8 local_port[0x8]; - u8 reserved_at_10[0x10]; + u8 reserved_at_10[0xb]; + u8 ppan_mask_n[0x1]; + u8 minor_stall_mask[0x1]; + u8 critical_stall_mask[0x1]; + u8 reserved_at_1e[0x2]; u8 ppan[0x4]; u8 reserved_at_24[0x4]; @@ -7843,17 +7847,22 @@ struct mlx5_ifc_pfcc_reg_bits { u8 pptx[0x1]; u8 aptx[0x1]; - u8 reserved_at_42[0x6]; + u8 pptx_mask_n[0x1]; + u8 reserved_at_43[0x5]; u8 pfctx[0x8]; u8 reserved_at_50[0x10]; u8 pprx[0x1]; u8 aprx[0x1]; - u8 reserved_at_62[0x6]; + u8 pprx_mask_n[0x1]; + u8 reserved_at_63[0x5]; u8 pfcrx[0x8]; u8 reserved_at_70[0x10]; - u8 reserved_at_80[0x80]; + u8 device_stall_minor_watermark[0x10]; + u8 device_stall_critical_watermark[0x10]; + + u8 reserved_at_a0[0x60]; }; struct mlx5_ifc_pelc_reg_bits { diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h index 035f0d4dc9fe..34aed6032f86 100644 --- a/include/linux/mlx5/port.h +++ b/include/linux/mlx5/port.h @@ -151,6 +151,12 @@ int mlx5_set_port_pfc(struct mlx5_core_dev *dev, u8 pfc_en_tx, u8 pfc_en_rx); int mlx5_query_port_pfc(struct mlx5_core_dev *dev, u8 *pfc_en_tx, u8 *pfc_en_rx); +int mlx5_set_port_stall_watermark(struct mlx5_core_dev *dev, + u16 stall_critical_watermark, + u16 stall_minor_watermark); +int mlx5_query_port_stall_watermark(struct mlx5_core_dev *dev, + u16 *stall_critical_watermark, u16 *stall_minor_watermark); + int mlx5_max_tc(struct mlx5_core_dev *mdev); int mlx5_set_port_prio_tc(struct mlx5_core_dev *mdev, u8 *prio_tc); -- cgit v1.2.3 From 61c5b5c9178288a4caa3e39095aafb391c5100f6 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Sun, 7 Jan 2018 16:45:27 +0200 Subject: net/mlx5: Add support for QUERY_VNIC_ENV command Add support for new FW command QUERY_VNIC_ENV. The command is used by the driver to query vnic diagnostic statistics from FW. Signed-off-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 50 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index f3200a9696d6..52e373dd2679 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -143,6 +143,7 @@ enum { MLX5_CMD_OP_MODIFY_HCA_VPORT_CONTEXT = 0x763, MLX5_CMD_OP_QUERY_HCA_VPORT_GID = 0x764, MLX5_CMD_OP_QUERY_HCA_VPORT_PKEY = 0x765, + MLX5_CMD_OP_QUERY_VNIC_ENV = 0x76f, MLX5_CMD_OP_QUERY_VPORT_COUNTER = 0x770, MLX5_CMD_OP_ALLOC_Q_COUNTER = 0x771, MLX5_CMD_OP_DEALLOC_Q_COUNTER = 0x772, @@ -875,7 +876,7 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 vhca_group_manager[0x1]; u8 ib_virt[0x1]; u8 eth_virt[0x1]; - u8 reserved_at_1a4[0x1]; + u8 vnic_env_queue_counters[0x1]; u8 ets[0x1]; u8 nic_flow_table[0x1]; u8 eswitch_flow_table[0x1]; @@ -2386,6 +2387,24 @@ struct mlx5_ifc_xrc_srqc_bits { u8 reserved_at_180[0x80]; }; +struct mlx5_ifc_vnic_diagnostic_statistics_bits { + u8 counter_error_queues[0x20]; + + u8 total_error_queues[0x20]; + + u8 send_queue_priority_update_flow[0x20]; + + u8 reserved_at_60[0x20]; + + u8 nic_receive_steering_discard[0x40]; + + u8 receive_discard_vport_down[0x40]; + + u8 transmit_discard_vport_down[0x40]; + + u8 reserved_at_140[0xec0]; +}; + struct mlx5_ifc_traffic_counter_bits { u8 packets[0x40]; @@ -3661,6 +3680,35 @@ struct mlx5_ifc_query_vport_state_in_bits { u8 reserved_at_60[0x20]; }; +struct mlx5_ifc_query_vnic_env_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + struct mlx5_ifc_vnic_diagnostic_statistics_bits vport_env; +}; + +enum { + MLX5_QUERY_VNIC_ENV_IN_OP_MOD_VPORT_DIAG_STATISTICS = 0x0, +}; + +struct mlx5_ifc_query_vnic_env_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 other_vport[0x1]; + u8 reserved_at_41[0xf]; + u8 vport_number[0x10]; + + u8 reserved_at_60[0x20]; +}; + struct mlx5_ifc_query_vport_counter_out_bits { u8 status[0x8]; u8 reserved_at_8[0x18]; -- cgit v1.2.3 From 5c298143be17f5100656b9c140af672c644116d9 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Tue, 26 Dec 2017 16:46:29 +0200 Subject: net/mlx5e: Add vnic steering drop statistics Added the following packets drop counter: Rx steering missed dropped packets - counts packets which were dropped due to miss on NIC rx steering rules. This counter will be shown on ethtool as a new counter called rx_steer_missed_packets. Signed-off-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 52e373dd2679..9202113f552c 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1008,7 +1008,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_330[0xb]; u8 log_max_xrcd[0x5]; - u8 reserved_at_340[0x8]; + u8 nic_receive_steering_discard[0x1]; + u8 reserved_at_341[0x7]; u8 log_max_flow_counter_bulk[0x8]; u8 max_flow_counter_15_0[0x10]; -- cgit v1.2.3 From aaabd0783b1cf46569f5fc1c60d79709815497fc Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Sun, 14 Jan 2018 00:56:25 +0200 Subject: net/mlx5: Add packet dropped while vport down statistics Added the following packets dropped while vport down statistics: Rx dropped while vport down - counts packets which were steered by e-switch to a vport, but dropped since the vport was down. This counter will be shown on ip link tool as part of the vport rx_dropped counter. Tx dropped while vport down - counts packets which were transmitted by a vport, but dropped due to vport logical link down. This counter will be shown on ip link tool as part of the vport tx_dropped counter. The counters are read from FW by command QUERY_VNIC_ENV. Signed-off-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 4 +++- include/linux/mlx5/vport.h | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 9202113f552c..1f3483d40055 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1009,7 +1009,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 log_max_xrcd[0x5]; u8 nic_receive_steering_discard[0x1]; - u8 reserved_at_341[0x7]; + u8 receive_discard_vport_down[0x1]; + u8 transmit_discard_vport_down[0x1]; + u8 reserved_at_343[0x5]; u8 log_max_flow_counter_bulk[0x8]; u8 max_flow_counter_15_0[0x10]; diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index 64e193e87394..9208cb8809ac 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -107,6 +107,9 @@ int mlx5_modify_nic_vport_vlans(struct mlx5_core_dev *dev, int mlx5_nic_vport_enable_roce(struct mlx5_core_dev *mdev); int mlx5_nic_vport_disable_roce(struct mlx5_core_dev *mdev); +int mlx5_query_vport_down_stats(struct mlx5_core_dev *mdev, u16 vport, + u64 *rx_discard_vport_down, + u64 *tx_discard_vport_down); int mlx5_core_query_vport_counter(struct mlx5_core_dev *dev, u8 other_vport, int vf, u8 port_num, void *out, size_t out_sz); -- cgit v1.2.3 From 0c06897a9ac7e2db9ad2df15bc6511e8ab88378f Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Sun, 28 Jan 2018 20:14:20 +0200 Subject: net/mlx5: Add core support for vlan push/pop steering action Newer NICs (ConnectX-5 and onward) can apply vlan pop or push as an action taking place during flow steering. Add the core bits for that. Signed-off-by: Or Gerlitz Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed --- include/linux/mlx5/fs.h | 7 +++++++ include/linux/mlx5/mlx5_ifc.h | 16 ++++++++++++++-- 2 files changed, 21 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index b957e52434f8..47aecc4fa8c2 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -142,6 +142,12 @@ struct mlx5_flow_group * mlx5_create_flow_group(struct mlx5_flow_table *ft, u32 *in); void mlx5_destroy_flow_group(struct mlx5_flow_group *fg); +struct mlx5_fs_vlan { + u16 ethtype; + u16 vid; + u8 prio; +}; + struct mlx5_flow_act { u32 action; bool has_flow_tag; @@ -149,6 +155,7 @@ struct mlx5_flow_act { u32 encap_id; u32 modify_id; uintptr_t esp_id; + struct mlx5_fs_vlan vlan; }; #define MLX5_DECLARE_FLOW_ACT(name) \ diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 1f3483d40055..c19e611d2782 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -314,7 +314,10 @@ struct mlx5_ifc_flow_table_prop_layout_bits { u8 flow_table_modify[0x1]; u8 encap[0x1]; u8 decap[0x1]; - u8 reserved_at_9[0x17]; + u8 reserved_at_9[0x1]; + u8 pop_vlan[0x1]; + u8 push_vlan[0x1]; + u8 reserved_at_c[0x14]; u8 reserved_at_20[0x2]; u8 log_max_ft_size[0x6]; @@ -2311,10 +2314,19 @@ enum { MLX5_FLOW_CONTEXT_ACTION_ENCAP = 0x10, MLX5_FLOW_CONTEXT_ACTION_DECAP = 0x20, MLX5_FLOW_CONTEXT_ACTION_MOD_HDR = 0x40, + MLX5_FLOW_CONTEXT_ACTION_VLAN_POP = 0x80, + MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH = 0x100, +}; + +struct mlx5_ifc_vlan_bits { + u8 ethtype[0x10]; + u8 prio[0x3]; + u8 cfi[0x1]; + u8 vid[0xc]; }; struct mlx5_ifc_flow_context_bits { - u8 reserved_at_0[0x20]; + struct mlx5_ifc_vlan_bits push_vlan; u8 group_id[0x20]; -- cgit v1.2.3 From 4420bf21fb6c0306e36ad58ade1e741fba57ce65 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 27 Mar 2018 18:02:23 +0300 Subject: net: Rename net_sem to pernet_ops_rwsem net_sem is some undefined area name, so it will be better to make the area more defined. Rename it to pernet_ops_rwsem for better readability and better intelligibility. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 562a175c35a9..c7d1e4689325 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -36,7 +36,7 @@ extern int rtnl_is_locked(void); extern int rtnl_lock_killable(void); extern wait_queue_head_t netdev_unregistering_wq; -extern struct rw_semaphore net_sem; +extern struct rw_semaphore pernet_ops_rwsem; #ifdef CONFIG_PROVE_LOCKING extern bool lockdep_rtnl_is_held(void); -- cgit v1.2.3 From 2816077127230ef52cc7497903e71def45747611 Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Tue, 26 Dec 2017 15:17:05 +0200 Subject: mlx5_{ib,core}: Add query SQ state helper function Move query SQ state function from mlx5_ib to mlx5_core in order to have it in shared code. It will be used in a downstream patch from mlx5e. Signed-off-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed --- include/linux/mlx5/transobj.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/transobj.h b/include/linux/mlx5/transobj.h index 7e8f281f8c00..80d7aa8b2831 100644 --- a/include/linux/mlx5/transobj.h +++ b/include/linux/mlx5/transobj.h @@ -47,6 +47,7 @@ int mlx5_core_create_sq(struct mlx5_core_dev *dev, u32 *in, int inlen, int mlx5_core_modify_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *in, int inlen); void mlx5_core_destroy_sq(struct mlx5_core_dev *dev, u32 sqn); int mlx5_core_query_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *out); +int mlx5_core_query_sq_state(struct mlx5_core_dev *dev, u32 sqn, u8 *state); int mlx5_core_create_tir(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *tirn); int mlx5_core_modify_tir(struct mlx5_core_dev *dev, u32 tirn, u32 *in, -- cgit v1.2.3 From 1acae6b030164217b9c6a52245eade730057152b Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Sun, 31 Dec 2017 12:55:26 +0200 Subject: mlx5: Move dump error CQE function out of mlx5_ib for code sharing Move mlx5_ib dump error CQE implementation to mlx5 CQ header file in order to use it in a downstream patch from mlx5e. In addition, use print_hex_dump instead of manual dumping of the buffer. Signed-off-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed --- include/linux/mlx5/cq.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h index 445ad194e0fe..0ef6138eca49 100644 --- a/include/linux/mlx5/cq.h +++ b/include/linux/mlx5/cq.h @@ -193,6 +193,12 @@ int mlx5_core_modify_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, int mlx5_core_modify_cq_moderation(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, u16 cq_period, u16 cq_max_count); +static inline void mlx5_dump_err_cqe(struct mlx5_core_dev *dev, + struct mlx5_err_cqe *err_cqe) +{ + print_hex_dump(KERN_WARNING, "", DUMP_PREFIX_OFFSET, 16, 1, err_cqe, + sizeof(*err_cqe), false); +} int mlx5_debug_cq_add(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq); void mlx5_debug_cq_remove(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq); -- cgit v1.2.3 From cf14f27f82af78e713f8a57c477cf9233faf8b30 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 28 Mar 2018 12:05:36 -0700 Subject: macro: introduce COUNT_ARGS() macro move COUNT_ARGS() macro from apparmor to generic header and extend it to count till twelve. COUNT() was an alternative name for this logic, but it's used for different purpose in many other places. Similarly for CONCATENATE() macro. Suggested-by: Linus Torvalds Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/kernel.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 3fd291503576..293fa0677fba 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -919,6 +919,13 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } #define swap(a, b) \ do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) +/* This counts to 12. Any more, it will return 13th argument. */ +#define __COUNT_ARGS(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _n, X...) _n +#define COUNT_ARGS(X...) __COUNT_ARGS(, ##X, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) + +#define __CONCAT(a, b) a ## b +#define CONCATENATE(a, b) __CONCAT(a, b) + /** * container_of - cast a member of a structure out to the containing structure * @ptr: the pointer to the member. -- cgit v1.2.3 From c4f6699dfcb8558d138fe838f741b2c10f416cf9 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 28 Mar 2018 12:05:37 -0700 Subject: bpf: introduce BPF_RAW_TRACEPOINT Introduce BPF_PROG_TYPE_RAW_TRACEPOINT bpf program type to access kernel internal arguments of the tracepoints in their raw form. >From bpf program point of view the access to the arguments look like: struct bpf_raw_tracepoint_args { __u64 args[0]; }; int bpf_prog(struct bpf_raw_tracepoint_args *ctx) { // program can read args[N] where N depends on tracepoint // and statically verified at program load+attach time } kprobe+bpf infrastructure allows programs access function arguments. This feature allows programs access raw tracepoint arguments. Similar to proposed 'dynamic ftrace events' there are no abi guarantees to what the tracepoints arguments are and what their meaning is. The program needs to type cast args properly and use bpf_probe_read() helper to access struct fields when argument is a pointer. For every tracepoint __bpf_trace_##call function is prepared. In assembler it looks like: (gdb) disassemble __bpf_trace_xdp_exception Dump of assembler code for function __bpf_trace_xdp_exception: 0xffffffff81132080 <+0>: mov %ecx,%ecx 0xffffffff81132082 <+2>: jmpq 0xffffffff811231f0 where TRACE_EVENT(xdp_exception, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, u32 act), The above assembler snippet is casting 32-bit 'act' field into 'u64' to pass into bpf_trace_run3(), while 'dev' and 'xdp' args are passed as-is. All of ~500 of __bpf_trace_*() functions are only 5-10 byte long and in total this approach adds 7k bytes to .text. This approach gives the lowest possible overhead while calling trace_xdp_exception() from kernel C code and transitioning into bpf land. Since tracepoint+bpf are used at speeds of 1M+ events per second this is valuable optimization. The new BPF_RAW_TRACEPOINT_OPEN sys_bpf command is introduced that returns anon_inode FD of 'bpf-raw-tracepoint' object. The user space looks like: // load bpf prog with BPF_PROG_TYPE_RAW_TRACEPOINT type prog_fd = bpf_prog_load(...); // receive anon_inode fd for given bpf_raw_tracepoint with prog attached raw_tp_fd = bpf_raw_tracepoint_open("xdp_exception", prog_fd); Ctrl-C of tracing daemon or cmdline tool that uses this feature will automatically detach bpf program, unload it and unregister tracepoint probe. On the kernel side the __bpf_raw_tp_map section of pointers to tracepoint definition and to __bpf_trace_*() probe function is used to find a tracepoint with "xdp_exception" name and corresponding __bpf_trace_xdp_exception() probe function which are passed to tracepoint_probe_register() to connect probe with tracepoint. Addition of bpf_raw_tracepoint doesn't interfere with ftrace and perf tracepoint mechanisms. perf_event_open() can be used in parallel on the same tracepoint. Multiple bpf_raw_tracepoint_open("xdp_exception", prog_fd) are permitted. Each with its own bpf program. The kernel will execute all tracepoint probes and all attached bpf programs. In the future bpf_raw_tracepoints can be extended with query/introspection logic. __bpf_raw_tp_map section logic was contributed by Steven Rostedt Signed-off-by: Alexei Starovoitov Signed-off-by: Steven Rostedt (VMware) Acked-by: Steven Rostedt (VMware) Signed-off-by: Daniel Borkmann --- include/linux/bpf_types.h | 1 + include/linux/trace_events.h | 42 +++++++++++++++++++++++++++++++++++++++++ include/linux/tracepoint-defs.h | 6 ++++++ 3 files changed, 49 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 5e2e8a49fb21..6d7243bfb0ff 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -19,6 +19,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_SK_MSG, sk_msg) BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe) BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint) BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event) +BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT, raw_tracepoint) #endif #ifdef CONFIG_CGROUP_BPF BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 8a1442c4e513..b0357cd198b0 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -468,6 +468,9 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx); int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog); void perf_event_detach_bpf_prog(struct perf_event *event); int perf_event_query_prog_array(struct perf_event *event, void __user *info); +int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog); +int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog); +struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name); #else static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) { @@ -487,6 +490,18 @@ perf_event_query_prog_array(struct perf_event *event, void __user *info) { return -EOPNOTSUPP; } +static inline int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *p) +{ + return -EOPNOTSUPP; +} +static inline int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *p) +{ + return -EOPNOTSUPP; +} +static inline struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name) +{ + return NULL; +} #endif enum { @@ -546,6 +561,33 @@ extern void ftrace_profile_free_filter(struct perf_event *event); void perf_trace_buf_update(void *record, u16 type); void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp); +void bpf_trace_run1(struct bpf_prog *prog, u64 arg1); +void bpf_trace_run2(struct bpf_prog *prog, u64 arg1, u64 arg2); +void bpf_trace_run3(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3); +void bpf_trace_run4(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3, u64 arg4); +void bpf_trace_run5(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3, u64 arg4, u64 arg5); +void bpf_trace_run6(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3, u64 arg4, u64 arg5, u64 arg6); +void bpf_trace_run7(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7); +void bpf_trace_run8(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, + u64 arg8); +void bpf_trace_run9(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, + u64 arg8, u64 arg9); +void bpf_trace_run10(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, + u64 arg8, u64 arg9, u64 arg10); +void bpf_trace_run11(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, + u64 arg8, u64 arg9, u64 arg10, u64 arg11); +void bpf_trace_run12(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, + u64 arg8, u64 arg9, u64 arg10, u64 arg11, u64 arg12); void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, struct trace_event_call *call, u64 count, struct pt_regs *regs, struct hlist_head *head, diff --git a/include/linux/tracepoint-defs.h b/include/linux/tracepoint-defs.h index 64ed7064f1fa..22c5a46e9693 100644 --- a/include/linux/tracepoint-defs.h +++ b/include/linux/tracepoint-defs.h @@ -35,4 +35,10 @@ struct tracepoint { struct tracepoint_func __rcu *funcs; }; +struct bpf_raw_event_map { + struct tracepoint *tp; + void *bpf_func; + u32 num_args; +} __aligned(32); + #endif -- cgit v1.2.3 From e59ac634908f4ea90066e6db7dd7ae8ca02815ff Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 28 Mar 2018 17:48:33 -0700 Subject: bpf: add parenthesis around argument of BPF_LDST_BYTES() BPF_LDST_BYTES() does not put it's argument in parenthesis when referencing it. This makes it impossible to pass pointers obtained by address-of operator (e.g. BPF_LDST_BYTES(&insn)). Add the parenthesis. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 109d05ccea9a..c2f167db8bd5 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -372,7 +372,7 @@ struct xdp_rxq_info; #define BPF_LDST_BYTES(insn) \ ({ \ - const int __size = bpf_size_to_bytes(BPF_SIZE(insn->code)); \ + const int __size = bpf_size_to_bytes(BPF_SIZE((insn)->code)); \ WARN_ON(__size < 0); \ __size; \ }) -- cgit v1.2.3 From f0b07bb151b098d291fd1fd71ef7a2df56fb124a Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Thu, 29 Mar 2018 19:20:32 +0300 Subject: net: Introduce net_rwsem to protect net_namespace_list rtnl_lock() is used everywhere, and contention is very high. When someone wants to iterate over alive net namespaces, he/she has no a possibility to do that without exclusive lock. But the exclusive rtnl_lock() in such places is overkill, and it just increases the contention. Yes, there is already for_each_net_rcu() in kernel, but it requires rcu_read_lock(), and this can't be sleepable. Also, sometimes it may be need really prevent net_namespace_list growth, so for_each_net_rcu() is not fit there. This patch introduces new rw_semaphore, which will be used instead of rtnl_mutex to protect net_namespace_list. It is sleepable and allows not-exclusive iterations over net namespaces list. It allows to stop using rtnl_lock() in several places (what is made in next patches) and makes less the time, we keep rtnl_mutex. Here we just add new lock, while the explanation of we can remove rtnl_lock() there are in next patches. Fine grained locks generally are better, then one big lock, so let's do that with net_namespace_list, while the situation allows that. Signed-off-by: Kirill Tkhai Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index c7d1e4689325..5225832bd6ff 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -37,6 +37,7 @@ extern int rtnl_lock_killable(void); extern wait_queue_head_t netdev_unregistering_wq; extern struct rw_semaphore pernet_ops_rwsem; +extern struct rw_semaphore net_rwsem; #ifdef CONFIG_PROVE_LOCKING extern bool lockdep_rtnl_is_held(void); -- cgit v1.2.3 From 50bc60cb155c813157fdca5b3b05194cd325d3e9 Mon Sep 17 00:00:00 2001 From: Michal Kalderon Date: Wed, 28 Mar 2018 11:42:16 +0300 Subject: qed*: Utilize FW 8.33.11.0 This FW contains several fixes and features RDMA Features - SRQ support - XRC support - Memory window support - RDMA low latency queue support - RDMA bonding support RDMA bug fixes - RDMA remote invalidate during retransmit fix - iWARP MPA connect interop issue with RTR fix - iWARP Legacy DPM support - Fix MPA reject flow - iWARP error handling - RQ WQE validation checks MISC - Fix some HSI types endianity - New Restriction: vlan insertion in core_tx_bd_data can't be set for LB packets ETH - HW QoS offload support - Fix vlan, dcb and sriov flow of VF sending a packet with inband VLAN tag instead of default VLAN - Allow GRE version 1 offloads in RX flow - Allow VXLAN steering iSCSI / FcoE - Fix bd availability checking flow - Support 256th sge proerly in iscsi/fcoe retransmit - Performance improvement - Fix handle iSCSI command arrival with AHS and with immediate - Fix ipv6 traffic class configuration DEBUG - Update debug utilities Signed-off-by: Michal Kalderon Signed-off-by: Tomer Tayar Signed-off-by: Manish Rangankar Signed-off-by: Ariel Elior Acked-by: Jason Gunthorpe Signed-off-by: David S. Miller --- include/linux/qed/common_hsi.h | 2 +- include/linux/qed/eth_common.h | 2 +- include/linux/qed/iscsi_common.h | 4 ++-- include/linux/qed/rdma_common.h | 2 ++ include/linux/qed/roce_common.h | 3 +++ 5 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/qed/common_hsi.h b/include/linux/qed/common_hsi.h index 2b3b350e07b7..13c8ab171437 100644 --- a/include/linux/qed/common_hsi.h +++ b/include/linux/qed/common_hsi.h @@ -110,7 +110,7 @@ #define FW_MAJOR_VERSION 8 #define FW_MINOR_VERSION 33 -#define FW_REVISION_VERSION 1 +#define FW_REVISION_VERSION 11 #define FW_ENGINEERING_VERSION 0 /***********************/ diff --git a/include/linux/qed/eth_common.h b/include/linux/qed/eth_common.h index 9db02856623b..d9416ad5ef59 100644 --- a/include/linux/qed/eth_common.h +++ b/include/linux/qed/eth_common.h @@ -105,7 +105,7 @@ #define ETH_CTL_FRAME_ETH_TYPE_NUM 4 /* GFS constants */ -#define ETH_GFT_TRASH_CAN_VPORT 0x1FF +#define ETH_GFT_TRASHCAN_VPORT 0x1FF /* GFT drop flow vport number */ /* Destination port mode */ enum dest_port_mode { diff --git a/include/linux/qed/iscsi_common.h b/include/linux/qed/iscsi_common.h index 4cc9b37b8d95..938df614cb6a 100644 --- a/include/linux/qed/iscsi_common.h +++ b/include/linux/qed/iscsi_common.h @@ -753,8 +753,8 @@ struct e4_ystorm_iscsi_task_ag_ctx { #define E4_YSTORM_ISCSI_TASK_AG_CTX_BIT1_SHIFT 5 #define E4_YSTORM_ISCSI_TASK_AG_CTX_VALID_MASK 0x1 #define E4_YSTORM_ISCSI_TASK_AG_CTX_VALID_SHIFT 6 -#define E4_YSTORM_ISCSI_TASK_AG_CTX_BIT3_MASK 0x1 -#define E4_YSTORM_ISCSI_TASK_AG_CTX_BIT3_SHIFT 7 +#define E4_YSTORM_ISCSI_TASK_AG_CTX_TTT_VALID_MASK 0x1 /* bit3 */ +#define E4_YSTORM_ISCSI_TASK_AG_CTX_TTT_VALID_SHIFT 7 u8 flags1; #define E4_YSTORM_ISCSI_TASK_AG_CTX_CF0_MASK 0x3 #define E4_YSTORM_ISCSI_TASK_AG_CTX_CF0_SHIFT 0 diff --git a/include/linux/qed/rdma_common.h b/include/linux/qed/rdma_common.h index c1a446ebe362..480a57eb36cc 100644 --- a/include/linux/qed/rdma_common.h +++ b/include/linux/qed/rdma_common.h @@ -51,6 +51,8 @@ #define RDMA_MAX_CQS (64 * 1024) #define RDMA_MAX_TIDS (128 * 1024 - 1) #define RDMA_MAX_PDS (64 * 1024) +#define RDMA_MAX_XRC_SRQS (1024) +#define RDMA_MAX_SRQS (32 * 1024) #define RDMA_NUM_STATISTIC_COUNTERS MAX_NUM_VPORTS #define RDMA_NUM_STATISTIC_COUNTERS_K2 MAX_NUM_VPORTS_K2 diff --git a/include/linux/qed/roce_common.h b/include/linux/qed/roce_common.h index e15e0da71240..193bcef302e1 100644 --- a/include/linux/qed/roce_common.h +++ b/include/linux/qed/roce_common.h @@ -59,6 +59,9 @@ enum roce_async_events_type { ROCE_ASYNC_EVENT_CQ_OVERFLOW_ERR, ROCE_ASYNC_EVENT_SRQ_EMPTY, ROCE_ASYNC_EVENT_DESTROY_QP_DONE, + ROCE_ASYNC_EVENT_XRC_DOMAIN_ERR, + ROCE_ASYNC_EVENT_INVALID_XRCETH_ERR, + ROCE_ASYNC_EVENT_XRC_SRQ_CATASTROPHIC_ERR, MAX_ROCE_ASYNC_EVENTS_TYPE }; -- cgit v1.2.3 From 3a69cae80cdd1b5c8b23137cba2a80ecfec4cef5 Mon Sep 17 00:00:00 2001 From: Sudarsana Reddy Kalluru Date: Wed, 28 Mar 2018 05:14:22 -0700 Subject: qed: Adapter flash update support. This patch adds the required driver support for updating the flash or non volatile memory of the adapter. At highlevel, flash upgrade comprises of reading the flash images from the input file, validating the images and writing them to the respective paritions. Signed-off-by: Sudarsana Reddy Kalluru Signed-off-by: Ariel Elior Signed-off-by: David S. Miller --- include/linux/qed/qed_if.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index 15e398c7230e..b5b2bc9eacca 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -483,6 +483,15 @@ struct qed_int_info { u8 used_cnt; }; +#define QED_NVM_SIGNATURE 0x12435687 + +enum qed_nvm_flash_cmd { + QED_NVM_FLASH_CMD_FILE_DATA = 0x2, + QED_NVM_FLASH_CMD_FILE_START = 0x3, + QED_NVM_FLASH_CMD_NVM_CHANGE = 0x4, + QED_NVM_FLASH_CMD_NVM_MAX, +}; + struct qed_common_cb_ops { void (*arfs_filter_op)(void *dev, void *fltr, u8 fw_rc); void (*link_update)(void *dev, @@ -657,6 +666,16 @@ struct qed_common_ops { void (*chain_free)(struct qed_dev *cdev, struct qed_chain *p_chain); +/** + * @brief nvm_flash - Flash nvm data. + * + * @param cdev + * @param name - file containing the data + * + * @return 0 on success, error otherwise. + */ + int (*nvm_flash)(struct qed_dev *cdev, const char *name); + /** * @brief nvm_get_image - reads an entire image from nvram * -- cgit v1.2.3 From 8934ce2fd08171e8605f7fada91ee7619fe17ab8 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 28 Mar 2018 12:49:15 -0700 Subject: bpf: sockmap redirect ingress support Add support for the BPF_F_INGRESS flag in sk_msg redirect helper. To do this add a scatterlist ring for receiving socks to check before calling into regular recvmsg call path. Additionally, because the poll wakeup logic only checked the skb recv queue we need to add a hook in TCP stack (similar to write side) so that we have a way to wake up polling socks when a scatterlist is redirected to that sock. After this all that is needed is for the redirect helper to push the scatterlist into the psock receive queue. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/linux/filter.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index c2f167db8bd5..961cc5d53956 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -521,6 +521,7 @@ struct sk_msg_buff { __u32 key; __u32 flags; struct bpf_map *map; + struct list_head list; }; /* Compute the linear packet data range [data, data_end) which -- cgit v1.2.3 From fa246693a111fab32bd51d20f07a347e42773ee9 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 28 Mar 2018 12:49:25 -0700 Subject: bpf: sockmap, BPF_F_INGRESS flag for BPF_SK_SKB_STREAM_VERDICT: Add support for the BPF_F_INGRESS flag in skb redirect helper. To do this convert skb into a scatterlist and push into ingress queue. This is the same logic that is used in the sk_msg redirect helper so it should feel familiar. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/linux/filter.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 961cc5d53956..897ff3d95968 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -521,6 +521,7 @@ struct sk_msg_buff { __u32 key; __u32 flags; struct bpf_map *map; + struct sk_buff *skb; struct list_head list; }; -- cgit v1.2.3 From 9daae9bd47cff82a2a06aca23c458d6c79d09d52 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Wed, 28 Mar 2018 17:46:54 +0300 Subject: net: Call add/kill vid ndo on vlan filter feature toggling NETIF_F_HW_VLAN_[CS]TAG_FILTER features require more than just a bit flip in dev->features in order to keep the driver in a consistent state. These features notify the driver of each added/removed vlan, but toggling of vlan-filter does not notify the driver accordingly for each of the existing vlans. This patch implements a similar solution to NETIF_F_RX_UDP_TUNNEL_PORT behavior (which notifies the driver about UDP ports in the same manner that vids are reported). Each toggling of the features propagates to the 8021q module, which iterates over the vlans and call add/kill ndo accordingly. Signed-off-by: Gal Pressman Reviewed-by: Tariq Toukan Signed-off-by: David S. Miller --- include/linux/if_vlan.h | 24 ++++++++++++++++++++++++ include/linux/netdevice.h | 4 ++++ 2 files changed, 28 insertions(+) (limited to 'include/linux') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index c4a1cff9c768..24d1976c1e61 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -83,6 +83,30 @@ static inline bool is_vlan_dev(const struct net_device *dev) #define skb_vlan_tag_get_id(__skb) ((__skb)->vlan_tci & VLAN_VID_MASK) #define skb_vlan_tag_get_prio(__skb) ((__skb)->vlan_tci & VLAN_PRIO_MASK) +static inline int vlan_get_rx_ctag_filter_info(struct net_device *dev) +{ + ASSERT_RTNL(); + return notifier_to_errno(call_netdevice_notifiers(NETDEV_CVLAN_FILTER_PUSH_INFO, dev)); +} + +static inline void vlan_drop_rx_ctag_filter_info(struct net_device *dev) +{ + ASSERT_RTNL(); + call_netdevice_notifiers(NETDEV_CVLAN_FILTER_DROP_INFO, dev); +} + +static inline int vlan_get_rx_stag_filter_info(struct net_device *dev) +{ + ASSERT_RTNL(); + return notifier_to_errno(call_netdevice_notifiers(NETDEV_SVLAN_FILTER_PUSH_INFO, dev)); +} + +static inline void vlan_drop_rx_stag_filter_info(struct net_device *dev) +{ + ASSERT_RTNL(); + call_netdevice_notifiers(NETDEV_SVLAN_FILTER_DROP_INFO, dev); +} + /** * struct vlan_pcpu_stats - VLAN percpu rx/tx stats * @rx_packets: number of received packets diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2a2d9cf50aa2..da44dab492e3 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2349,6 +2349,10 @@ enum netdev_cmd { NETDEV_UDP_TUNNEL_PUSH_INFO, NETDEV_UDP_TUNNEL_DROP_INFO, NETDEV_CHANGE_TX_QUEUE_LEN, + NETDEV_CVLAN_FILTER_PUSH_INFO, + NETDEV_CVLAN_FILTER_DROP_INFO, + NETDEV_SVLAN_FILTER_PUSH_INFO, + NETDEV_SVLAN_FILTER_DROP_INFO, }; const char *netdev_cmd_to_name(enum netdev_cmd cmd); -- cgit v1.2.3 From c6ab3008b6a6ecda22e92f96a1b9cc6b0d0b0a4e Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 28 Mar 2018 15:44:15 -0700 Subject: net: phy: phylink: Provide PHY interface to mac_link_{up, down} In preparation for having DSA transition entirely to PHYLINK, we need to pass a PHY interface type to the mac_link_{up,down} callbacks because we may have to make decisions on that (e.g: turn on/off RGMII interfaces etc.). We do not pass an entire phylink_link_state because not all parameters (pause, duplex etc.) are defined when the link is down, only link and interface are. Update mvneta accordingly since it currently implements phylink_mac_ops. Acked-by: Russell King Signed-off-by: Florian Fainelli Acked-by: Russell King Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/phylink.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index bd137c273d38..e95cc12030fa 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -73,8 +73,10 @@ struct phylink_mac_ops { void (*mac_config)(struct net_device *ndev, unsigned int mode, const struct phylink_link_state *state); void (*mac_an_restart)(struct net_device *ndev); - void (*mac_link_down)(struct net_device *ndev, unsigned int mode); + void (*mac_link_down)(struct net_device *ndev, unsigned int mode, + phy_interface_t interface); void (*mac_link_up)(struct net_device *ndev, unsigned int mode, + phy_interface_t interface, struct phy_device *phy); }; @@ -161,25 +163,31 @@ void mac_an_restart(struct net_device *ndev); * mac_link_down() - take the link down * @ndev: a pointer to a &struct net_device for the MAC. * @mode: link autonegotiation mode + * @interface: link &typedef phy_interface_t mode * * If @mode is not an in-band negotiation mode (as defined by * phylink_autoneg_inband()), force the link down and disable any - * Energy Efficient Ethernet MAC configuration. + * Energy Efficient Ethernet MAC configuration. Interface type + * selection must be done in mac_config(). */ -void mac_link_down(struct net_device *ndev, unsigned int mode); +void mac_link_down(struct net_device *ndev, unsigned int mode, + phy_interface_t interface); /** * mac_link_up() - allow the link to come up * @ndev: a pointer to a &struct net_device for the MAC. * @mode: link autonegotiation mode + * @interface: link &typedef phy_interface_t mode * @phy: any attached phy * * If @mode is not an in-band negotiation mode (as defined by * phylink_autoneg_inband()), allow the link to come up. If @phy * is non-%NULL, configure Energy Efficient Ethernet by calling * phy_init_eee() and perform appropriate MAC configuration for EEE. + * Interface type selection must be done in mac_config(). */ void mac_link_up(struct net_device *ndev, unsigned int mode, + phy_interface_t interface, struct phy_device *phy); #endif -- cgit v1.2.3 From e679c9c1dbfdba07b2a979a076cca74b773be8ce Mon Sep 17 00:00:00 2001 From: Russell King Date: Wed, 28 Mar 2018 15:44:16 -0700 Subject: sfp/phylink: move module EEPROM ethtool access into netdev core ethtool Provide a pointer to the SFP bus in struct net_device, so that the ethtool module EEPROM methods can access the SFP directly, rather than needing every user to provide a hook for it. Reviewed-by: Andrew Lunn Signed-off-by: Russell King Signed-off-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: Russell King Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ include/linux/phylink.h | 3 --- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index da44dab492e3..cf44503ea81a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -58,6 +58,7 @@ struct device; struct phy_device; struct dsa_port; +struct sfp_bus; /* 802.11 specific */ struct wireless_dev; /* 802.15.4 specific */ @@ -1662,6 +1663,7 @@ enum netdev_priv_flags { * @priomap: XXX: need comments on this one * @phydev: Physical device may attach itself * for hardware timestamping + * @sfp_bus: attached &struct sfp_bus structure. * * @qdisc_tx_busylock: lockdep class annotating Qdisc->busylock spinlock * @qdisc_running_key: lockdep class annotating Qdisc->running seqcount @@ -1945,6 +1947,7 @@ struct net_device { struct netprio_map __rcu *priomap; #endif struct phy_device *phydev; + struct sfp_bus *sfp_bus; struct lock_class_key *qdisc_tx_busylock; struct lock_class_key *qdisc_running_key; bool proto_down; diff --git a/include/linux/phylink.h b/include/linux/phylink.h index e95cc12030fa..50eeae025f1e 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -219,9 +219,6 @@ void phylink_ethtool_get_pauseparam(struct phylink *, struct ethtool_pauseparam *); int phylink_ethtool_set_pauseparam(struct phylink *, struct ethtool_pauseparam *); -int phylink_ethtool_get_module_info(struct phylink *, struct ethtool_modinfo *); -int phylink_ethtool_get_module_eeprom(struct phylink *, - struct ethtool_eeprom *, u8 *); int phylink_get_eee_err(struct phylink *); int phylink_ethtool_get_eee(struct phylink *, struct ethtool_eee *); int phylink_ethtool_set_eee(struct phylink *, struct ethtool_eee *); -- cgit v1.2.3 From 9217e566bdee4583d0a9ea4879c8f5e004886eac Mon Sep 17 00:00:00 2001 From: Mike Looijmans Date: Thu, 29 Mar 2018 07:29:48 +0200 Subject: of_net: Implement of_get_nvmem_mac_address helper It's common practice to store MAC addresses for network interfaces into nvmem devices. However the code to actually do this in the kernel lacks, so this patch adds of_get_nvmem_mac_address() for drivers to obtain the address from an nvmem cell provider. This is particulary useful on devices where the ethernet interface cannot be configured by the bootloader, for example because it's in an FPGA. Signed-off-by: Mike Looijmans Reviewed-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/of_net.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of_net.h b/include/linux/of_net.h index 9cd72aab76fe..90d81ee9e6a0 100644 --- a/include/linux/of_net.h +++ b/include/linux/of_net.h @@ -13,6 +13,7 @@ struct net_device; extern int of_get_phy_mode(struct device_node *np); extern const void *of_get_mac_address(struct device_node *np); +extern int of_get_nvmem_mac_address(struct device_node *np, void *addr); extern struct net_device *of_find_net_device_by_node(struct device_node *np); #else static inline int of_get_phy_mode(struct device_node *np) @@ -25,6 +26,11 @@ static inline const void *of_get_mac_address(struct device_node *np) return NULL; } +static inline int of_get_nvmem_mac_address(struct device_node *np, void *addr) +{ + return -ENODEV; +} + static inline struct net_device *of_find_net_device_by_node(struct device_node *np) { return NULL; -- cgit v1.2.3 From f385178679b6561d2e717567d12e07c7f927ee59 Mon Sep 17 00:00:00 2001 From: Prashant Bhole Date: Fri, 30 Mar 2018 09:20:59 +0900 Subject: lib/scatterlist: add sg_init_marker() helper sg_init_marker initializes sg_magic in the sg table and calls sg_mark_end() on the last entry of the table. This can be useful to avoid memset in sg_init_table() when scatterlist is already zeroed out For example: when scatterlist is embedded inside other struct and that container struct is zeroed out Suggested-by: Daniel Borkmann Signed-off-by: Prashant Bhole Acked-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/linux/scatterlist.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 22b2131bcdcd..aa5d4eb725f5 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -248,6 +248,24 @@ static inline void *sg_virt(struct scatterlist *sg) return page_address(sg_page(sg)) + sg->offset; } +/** + * sg_init_marker - Initialize markers in sg table + * @sgl: The SG table + * @nents: Number of entries in table + * + **/ +static inline void sg_init_marker(struct scatterlist *sgl, + unsigned int nents) +{ +#ifdef CONFIG_DEBUG_SG + unsigned int i; + + for (i = 0; i < nents; i++) + sgl[i].sg_magic = SG_MAGIC; +#endif + sg_mark_end(&sgl[nents - 1]); +} + int sg_nents(struct scatterlist *sg); int sg_nents_for_len(struct scatterlist *sg, u64 len); struct scatterlist *sg_next(struct scatterlist *); -- cgit v1.2.3 From b2d3907c234618c20239127f2c234b4e92adf5ef Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 13 Feb 2018 17:07:02 -0800 Subject: net/mlx5: Eliminate query xsrq dead code 1. This function is not used anywhere in mlx5 driver 2. It has a memcpy statement that makes no sense and produces build warning with gcc8 drivers/net/ethernet/mellanox/mlx5/core/transobj.c: In function 'mlx5_core_query_xsrq': drivers/net/ethernet/mellanox/mlx5/core/transobj.c:347:3: error: 'memcpy' source argument is the same as destination [-Werror=restrict] Fixes: 01949d0109ee ("net/mlx5_core: Enable XRCs and SRQs when using ISSI > 0") Reported-by: Arnd Bergmann Signed-off-by: Saeed Mahameed --- include/linux/mlx5/transobj.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/transobj.h b/include/linux/mlx5/transobj.h index 80d7aa8b2831..83a33a1873a6 100644 --- a/include/linux/mlx5/transobj.h +++ b/include/linux/mlx5/transobj.h @@ -67,7 +67,6 @@ int mlx5_core_arm_rmp(struct mlx5_core_dev *dev, u32 rmpn, u16 lwm); int mlx5_core_create_xsrq(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *rmpn); int mlx5_core_destroy_xsrq(struct mlx5_core_dev *dev, u32 rmpn); -int mlx5_core_query_xsrq(struct mlx5_core_dev *dev, u32 rmpn, u32 *out); int mlx5_core_arm_xsrq(struct mlx5_core_dev *dev, u32 rmpn, u16 lwm); int mlx5_core_create_rqt(struct mlx5_core_dev *dev, u32 *in, int inlen, -- cgit v1.2.3 From 619a8f2a42f1031cdbd74435b6a9191eb4913139 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Wed, 7 Feb 2018 14:41:25 +0200 Subject: net/mlx5e: Use linear SKB in Striding RQ Current Striding RQ HW feature utilizes the RX buffers so that there is no wasted room between the strides. This maximises the memory utilization. This prevents the use of build_skb() (which requires headroom and tailroom), and demands to memcpy the packets headers into the skb linear part. In this patch, whenever a set of conditions holds, we apply an RQ configuration that allows combining the use of linear SKB on top of a Striding RQ. To use build_skb() with Striding RQ, the following must hold: 1. packet does not cross a page boundary. 2. there is enough headroom and tailroom surrounding the packet. We can satisfy 1 and 2 by configuring: stride size = MTU + headroom + tailoom. This is possible only when: a. (MTU - headroom - tailoom) does not exceed PAGE_SIZE. b. HW LRO is turned off. Using linear SKB has many advantages: - Saves a memcpy of the headers. - No page-boundary checks in datapath. - No filler CQEs. - Significantly smaller CQ. - SKB data continuously resides in linear part, and not split to small amount (linear part) and large amount (fragment). This saves datapath cycles in driver and improves utilization of SKB fragments in GRO. - The fragments of a resulting GRO SKB follow the IP forwarding assumption of equal-size fragments. Some implementation details: HW writes the packets to the beginning of a stride, i.e. does not keep headroom. To overcome this we make sure we can extend backwards and use the last bytes of stride i-1. Extra care is needed for stride 0 as it has no preceding stride. We make sure headroom bytes are available by shifting the buffer pointer passed to HW by headroom bytes. This configuration now becomes default, whenever capable. Of course, this implies turning LRO off. Performance testing: ConnectX-5, single core, single RX ring, default MTU. UDP packet rate, early drop in TC layer: -------------------------------------------- | pkt size | before | after | ratio | -------------------------------------------- | 1500byte | 4.65 Mpps | 5.96 Mpps | 1.28x | | 500byte | 5.23 Mpps | 5.97 Mpps | 1.14x | | 64byte | 5.94 Mpps | 5.96 Mpps | 1.00x | -------------------------------------------- TCP streams: ~20% gain Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 3 +++ include/linux/mlx5/mlx5_ifc.h | 7 ++++--- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 4b5939c78cdd..12758595459b 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -782,6 +782,9 @@ static inline u64 get_cqe_ts(struct mlx5_cqe64 *cqe) return (u64)lo | ((u64)hi << 32); } +#define MLX5_MPWQE_LOG_NUM_STRIDES_BASE (9) +#define MLX5_MPWQE_LOG_STRIDE_SZ_BASE (6) + struct mpwrq_cqe_bc { __be16 filler_consumed_strides; __be16 byte_cnt; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index c19e611d2782..d25011f84815 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1038,7 +1038,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_398[0x3]; u8 log_max_tis_per_sq[0x5]; - u8 reserved_at_3a0[0x3]; + u8 ext_stride_num_range[0x1]; + u8 reserved_at_3a1[0x2]; u8 log_max_stride_sz_rq[0x5]; u8 reserved_at_3a8[0x3]; u8 log_min_stride_sz_rq[0x5]; @@ -1205,9 +1206,9 @@ struct mlx5_ifc_wq_bits { u8 log_hairpin_num_packets[0x5]; u8 reserved_at_128[0x3]; u8 log_hairpin_data_sz[0x5]; - u8 reserved_at_130[0x5]; - u8 log_wqe_num_of_strides[0x3]; + u8 reserved_at_130[0x4]; + u8 log_wqe_num_of_strides[0x4]; u8 two_byte_shift_en[0x1]; u8 reserved_at_139[0x4]; u8 log_wqe_stride_size[0x3]; -- cgit v1.2.3 From 5e43f899b03a3492ce5fc44e8900becb04dae9c0 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 30 Mar 2018 15:08:00 -0700 Subject: bpf: Check attach type at prog load time == The problem == There are use-cases when a program of some type can be attached to multiple attach points and those attach points must have different permissions to access context or to call helpers. E.g. context structure may have fields for both IPv4 and IPv6 but it doesn't make sense to read from / write to IPv6 field when attach point is somewhere in IPv4 stack. Same applies to BPF-helpers: it may make sense to call some helper from some attach point, but not from other for same prog type. == The solution == Introduce `expected_attach_type` field in in `struct bpf_attr` for `BPF_PROG_LOAD` command. If scenario described in "The problem" section is the case for some prog type, the field will be checked twice: 1) At load time prog type is checked to see if attach type for it must be known to validate program permissions correctly. Prog will be rejected with EINVAL if it's the case and `expected_attach_type` is not specified or has invalid value. 2) At attach time `attach_type` is compared with `expected_attach_type`, if prog type requires to have one, and, if they differ, attach will be rejected with EINVAL. The `expected_attach_type` is now available as part of `struct bpf_prog` in both `bpf_verifier_ops->is_valid_access()` and `bpf_verifier_ops->get_func_proto()` () and can be used to check context accesses and calls to helpers correspondingly. Initially the idea was discussed by Alexei Starovoitov and Daniel Borkmann here: https://marc.info/?l=linux-netdev&m=152107378717201&w=2 Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 5 ++++- include/linux/filter.h | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 819229c80eca..95a7abd0ee92 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -208,12 +208,15 @@ struct bpf_prog_ops { struct bpf_verifier_ops { /* return eBPF function prototype for verification */ - const struct bpf_func_proto *(*get_func_proto)(enum bpf_func_id func_id); + const struct bpf_func_proto * + (*get_func_proto)(enum bpf_func_id func_id, + const struct bpf_prog *prog); /* return true if 'size' wide access at offset 'off' within bpf_context * with 'type' (read or write) is allowed */ bool (*is_valid_access)(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info); int (*gen_prologue)(struct bpf_insn *insn, bool direct_write, const struct bpf_prog *prog); diff --git a/include/linux/filter.h b/include/linux/filter.h index 897ff3d95968..13c044e4832d 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -469,6 +469,7 @@ struct bpf_prog { is_func:1, /* program is a bpf function */ kprobe_override:1; /* Do we override a kprobe? */ enum bpf_prog_type type; /* Type of BPF program */ + enum bpf_attach_type expected_attach_type; /* For some prog types */ u32 len; /* Number of filter blocks */ u32 jited_len; /* Size of jited insns in bytes */ u8 tag[BPF_TAG_SIZE]; -- cgit v1.2.3 From 4fbac77d2d092b475dda9eea66da674369665427 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 30 Mar 2018 15:08:02 -0700 Subject: bpf: Hooks for sys_bind == The problem == There is a use-case when all processes inside a cgroup should use one single IP address on a host that has multiple IP configured. Those processes should use the IP for both ingress and egress, for TCP and UDP traffic. So TCP/UDP servers should be bound to that IP to accept incoming connections on it, and TCP/UDP clients should make outgoing connections from that IP. It should not require changing application code since it's often not possible. Currently it's solved by intercepting glibc wrappers around syscalls such as `bind(2)` and `connect(2)`. It's done by a shared library that is preloaded for every process in a cgroup so that whenever TCP/UDP server calls `bind(2)`, the library replaces IP in sockaddr before passing arguments to syscall. When application calls `connect(2)` the library transparently binds the local end of connection to that IP (`bind(2)` with `IP_BIND_ADDRESS_NO_PORT` to avoid performance penalty). Shared library approach is fragile though, e.g.: * some applications clear env vars (incl. `LD_PRELOAD`); * `/etc/ld.so.preload` doesn't help since some applications are linked with option `-z nodefaultlib`; * other applications don't use glibc and there is nothing to intercept. == The solution == The patch provides much more reliable in-kernel solution for the 1st part of the problem: binding TCP/UDP servers on desired IP. It does not depend on application environment and implementation details (whether glibc is used or not). It adds new eBPF program type `BPF_PROG_TYPE_CGROUP_SOCK_ADDR` and attach types `BPF_CGROUP_INET4_BIND` and `BPF_CGROUP_INET6_BIND` (similar to already existing `BPF_CGROUP_INET_SOCK_CREATE`). The new program type is intended to be used with sockets (`struct sock`) in a cgroup and provided by user `struct sockaddr`. Pointers to both of them are parts of the context passed to programs of newly added types. The new attach types provides hooks in `bind(2)` system call for both IPv4 and IPv6 so that one can write a program to override IP addresses and ports user program tries to bind to and apply such a program for whole cgroup. == Implementation notes == [1] Separate attach types for `AF_INET` and `AF_INET6` are added intentionally to prevent reading/writing to offsets that don't make sense for corresponding socket family. E.g. if user passes `sockaddr_in` it doesn't make sense to read from / write to `user_ip6[]` context fields. [2] The write access to `struct bpf_sock_addr_kern` is implemented using special field as an additional "register". There are just two registers in `sock_addr_convert_ctx_access`: `src` with value to write and `dst` with pointer to context that can't be changed not to break later instructions. But the fields, allowed to write to, are not available directly and to access them address of corresponding pointer has to be loaded first. To get additional register the 1st not used by `src` and `dst` one is taken, its content is saved to `bpf_sock_addr_kern.tmp_reg`, then the register is used to load address of pointer field, and finally the register's content is restored from the temporary field after writing `src` value. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 21 +++++++++++++++++++++ include/linux/bpf_types.h | 1 + include/linux/filter.h | 10 ++++++++++ 3 files changed, 32 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 8a4566691c8f..67dc4a6471ad 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -6,6 +6,7 @@ #include struct sock; +struct sockaddr; struct cgroup; struct sk_buff; struct bpf_sock_ops_kern; @@ -63,6 +64,10 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, int __cgroup_bpf_run_filter_sk(struct sock *sk, enum bpf_attach_type type); +int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, + struct sockaddr *uaddr, + enum bpf_attach_type type); + int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, struct bpf_sock_ops_kern *sock_ops, enum bpf_attach_type type); @@ -103,6 +108,20 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, __ret; \ }) +#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, type) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled) \ + __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type); \ + __ret; \ +}) + +#define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) \ + BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_BIND) + +#define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) \ + BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_BIND) + #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \ ({ \ int __ret = 0; \ @@ -135,6 +154,8 @@ static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 6d7243bfb0ff..2b28fcf6f6ae 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -8,6 +8,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_ACT, tc_cls_act) BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp) BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SKB, cg_skb) BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock) +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, cg_sock_addr) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_inout) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit) diff --git a/include/linux/filter.h b/include/linux/filter.h index 13c044e4832d..fc4e8f91b03d 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1021,6 +1021,16 @@ static inline int bpf_tell_extensions(void) return SKF_AD_MAX; } +struct bpf_sock_addr_kern { + struct sock *sk; + struct sockaddr *uaddr; + /* Temporary "register" to make indirect stores to nested structures + * defined above. We need three registers to make such a store, but + * only two (src and dst) are available at convert_ctx_access time + */ + u64 tmp_reg; +}; + struct bpf_sock_ops_kern { struct sock *sk; u32 op; -- cgit v1.2.3 From d74bad4e74ee373787a9ae24197c17b7cdc428d5 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 30 Mar 2018 15:08:05 -0700 Subject: bpf: Hooks for sys_connect == The problem == See description of the problem in the initial patch of this patch set. == The solution == The patch provides much more reliable in-kernel solution for the 2nd part of the problem: making outgoing connecttion from desired IP. It adds new attach types `BPF_CGROUP_INET4_CONNECT` and `BPF_CGROUP_INET6_CONNECT` for program type `BPF_PROG_TYPE_CGROUP_SOCK_ADDR` that can be used to override both source and destination of a connection at connect(2) time. Local end of connection can be bound to desired IP using newly introduced BPF-helper `bpf_bind()`. It allows to bind to only IP though, and doesn't support binding to port, i.e. leverages `IP_BIND_ADDRESS_NO_PORT` socket option. There are two reasons for this: * looking for a free port is expensive and can affect performance significantly; * there is no use-case for port. As for remote end (`struct sockaddr *` passed by user), both parts of it can be overridden, remote IP and remote port. It's useful if an application inside cgroup wants to connect to another application inside same cgroup or to itself, but knows nothing about IP assigned to the cgroup. Support is added for IPv4 and IPv6, for TCP and UDP. IPv4 and IPv6 have separate attach types for same reason as sys_bind hooks, i.e. to prevent reading from / writing to e.g. user_ip6 fields when user passes sockaddr_in since it'd be out-of-bound. == Implementation notes == The patch introduces new field in `struct proto`: `pre_connect` that is a pointer to a function with same signature as `connect` but is called before it. The reason is in some cases BPF hooks should be called way before control is passed to `sk->sk_prot->connect`. Specifically `inet_dgram_connect` autobinds socket before calling `sk->sk_prot->connect` and there is no way to call `bpf_bind()` from hooks from e.g. `ip4_datagram_connect` or `ip6_datagram_connect` since it'd cause double-bind. On the other hand `proto.pre_connect` provides a flexible way to add BPF hooks for connect only for necessary `proto` and call them at desired time before `connect`. Since `bpf_bind()` is allowed to bind only to IP and autobind in `inet_dgram_connect` binds only port there is no chance of double-bind. bpf_bind() sets `force_bind_address_no_port` to bind to only IP despite of value of `bind_address_no_port` socket field. bpf_bind() sets `with_lock` to `false` when calling to __inet_bind() and __inet6_bind() since all call-sites, where bpf_bind() is called, already hold socket lock. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 67dc4a6471ad..c6ab295e6dcb 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -116,12 +116,38 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, __ret; \ }) +#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled) { \ + lock_sock(sk); \ + __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type); \ + release_sock(sk); \ + } \ + __ret; \ +}) + #define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) \ BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_BIND) #define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) \ BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_BIND) +#define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (cgroup_bpf_enabled && \ + sk->sk_prot->pre_connect) + +#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) \ + BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_CONNECT) + +#define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) \ + BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_CONNECT) + +#define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) \ + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_CONNECT) + +#define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) \ + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_CONNECT) + #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \ ({ \ int __ret = 0; \ @@ -151,11 +177,16 @@ struct cgroup_bpf {}; static inline void cgroup_bpf_put(struct cgroup *cgrp) {} static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } +#define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0) #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) -- cgit v1.2.3 From aac3fc320d9404f2665a8b1249dc3170d5fa3caf Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 30 Mar 2018 15:08:07 -0700 Subject: bpf: Post-hooks for sys_bind "Post-hooks" are hooks that are called right before returning from sys_bind. At this time IP and port are already allocated and no further changes to `struct sock` can happen before returning from sys_bind but BPF program has a chance to inspect the socket and change sys_bind result. Specifically it can e.g. inspect what port was allocated and if it doesn't satisfy some policy, BPF program can force sys_bind to fail and return EPERM to user. Another example of usage is recording the IP:port pair to some map to use it in later calls to sys_connect. E.g. if some TCP server inside cgroup was bound to some IP:port_n, it can be recorded to a map. And later when some TCP client inside same cgroup is trying to connect to 127.0.0.1:port_n, BPF hook for sys_connect can override the destination and connect application to IP:port_n instead of 127.0.0.1:port_n. That helps forcing all applications inside a cgroup to use desired IP and not break those applications if they e.g. use localhost to communicate between each other. == Implementation details == Post-hooks are implemented as two new attach types `BPF_CGROUP_INET4_POST_BIND` and `BPF_CGROUP_INET6_POST_BIND` for existing prog type `BPF_PROG_TYPE_CGROUP_SOCK`. Separate attach types for IPv4 and IPv6 are introduced to avoid access to IPv6 field in `struct sock` from `inet_bind()` and to IPv4 field from `inet6_bind()` since those fields might not make sense in such cases. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index c6ab295e6dcb..30d15e64b993 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -98,16 +98,24 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, __ret; \ }) -#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) \ +#define BPF_CGROUP_RUN_SK_PROG(sk, type) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled) { \ - __ret = __cgroup_bpf_run_filter_sk(sk, \ - BPF_CGROUP_INET_SOCK_CREATE); \ + __ret = __cgroup_bpf_run_filter_sk(sk, type); \ } \ __ret; \ }) +#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) \ + BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET_SOCK_CREATE) + +#define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) \ + BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET4_POST_BIND) + +#define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) \ + BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET6_POST_BIND) + #define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, type) \ ({ \ int __ret = 0; \ @@ -183,6 +191,8 @@ static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) ({ 0; }) -- cgit v1.2.3 From e5d672a0780d9e7118caad4c171ec88b8299398d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 31 Mar 2018 12:58:56 -0700 Subject: rhashtable: reorganize struct rhashtable layout While under frags DDOS I noticed unfortunate false sharing between @nelems and @params.automatic_shrinking Move @nelems at the end of struct rhashtable so that first cache line is shared between all cpus, because almost never dirtied. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 668a21f04b09..1f8ad121eb43 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -152,25 +152,25 @@ struct rhashtable_params { /** * struct rhashtable - Hash table handle * @tbl: Bucket table - * @nelems: Number of elements in table * @key_len: Key length for hashfn - * @p: Configuration parameters * @max_elems: Maximum number of elements in table + * @p: Configuration parameters * @rhlist: True if this is an rhltable * @run_work: Deferred worker to expand/shrink asynchronously * @mutex: Mutex to protect current/future table swapping * @lock: Spin lock to protect walker list + * @nelems: Number of elements in table */ struct rhashtable { struct bucket_table __rcu *tbl; - atomic_t nelems; unsigned int key_len; - struct rhashtable_params p; unsigned int max_elems; + struct rhashtable_params p; bool rhlist; struct work_struct run_work; struct mutex mutex; spinlock_t lock; + atomic_t nelems; }; /** -- cgit v1.2.3 From bf66337140c64c27fa37222b7abca7e49d63fb57 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 31 Mar 2018 12:58:58 -0700 Subject: inet: frags: get rid of ipfrag_skb_cb/FRAG_CB ip_defrag uses skb->cb[] to store the fragment offset, and unfortunately this integer is currently in a different cache line than skb->next, meaning that we use two cache lines per skb when finding the insertion point. By aliasing skb->ip_defrag_offset and skb->dev, we pack all the fields in a single cache line and save precious memory bandwidth. Note that after the fast path added by Changli Gao in commit d6bebca92c66 ("fragment: add fast path for in-order fragments") this change wont help the fast path, since we still need to access prev->len (2nd cache line), but will show great benefits when slow path is entered, since we perform a linear scan of a potentially long list. Also, note that this potential long list is an attack vector, we might consider also using an rb-tree there eventually. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 47082f54ec1f..9065477ed255 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -672,6 +672,7 @@ struct sk_buff { * UDP receive path is one user. */ unsigned long dev_scratch; + int ip_defrag_offset; }; }; struct rb_node rbnode; /* used in netem & tcp stack */ -- cgit v1.2.3 From e0be6bea2583486ec4ed98e36437d82ea8190811 Mon Sep 17 00:00:00 2001 From: Atul Gupta Date: Sat, 31 Mar 2018 21:41:53 +0530 Subject: ethtool: enable Inline TLS in HW Ethtool option enables TLS record offload on HW, user configures the feature for netdev capable of Inline TLS. This allows user to define custom sk_prot for Inline TLS sock Signed-off-by: Atul Gupta Signed-off-by: David S. Miller --- include/linux/netdev_features.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index db84c516bcfb..35b79f47a13d 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -79,6 +79,7 @@ enum { NETIF_F_RX_UDP_TUNNEL_PORT_BIT, /* Offload of RX port for UDP tunnels */ NETIF_F_GRO_HW_BIT, /* Hardware Generic receive offload */ + NETIF_F_HW_TLS_RECORD_BIT, /* Offload TLS record */ /* * Add your fresh new feature above and remember to update @@ -145,6 +146,7 @@ enum { #define NETIF_F_HW_ESP __NETIF_F(HW_ESP) #define NETIF_F_HW_ESP_TX_CSUM __NETIF_F(HW_ESP_TX_CSUM) #define NETIF_F_RX_UDP_TUNNEL_PORT __NETIF_F(RX_UDP_TUNNEL_PORT) +#define NETIF_F_HW_TLS_RECORD __NETIF_F(HW_TLS_RECORD) #define for_each_netdev_feature(mask_addr, bit) \ for_each_set_bit(bit, (unsigned long *)mask_addr, NETDEV_FEATURE_COUNT) -- cgit v1.2.3