From 59cc1f61f09c26ce82c308e24b76141e1efe99f8 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Wed, 10 Aug 2016 11:05:15 +0200 Subject: net: sched: convert qdisc linked list to hashtable Convert the per-device linked list into a hashtable. The primary motivation for this change is that currently, we're not tracking all the qdiscs in hierarchy (e.g. excluding default qdiscs), as the lookup performed over the linked list by qdisc_match_from_root() is rather expensive. The ultimate goal is to get rid of hidden qdiscs completely, which will bring much more determinism in user experience. Reviewed-by: Cong Wang Signed-off-by: Jiri Kosina Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 076df5360ba5..96e0b6cd964e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -52,6 +52,7 @@ #include #include #include +#include struct netpoll_info; struct device; @@ -1800,6 +1801,9 @@ struct net_device { unsigned int num_tx_queues; unsigned int real_num_tx_queues; struct Qdisc *qdisc; +#ifdef CONFIG_NET_SCHED + DECLARE_HASHTABLE (qdisc_hash, 4); +#endif unsigned long tx_queue_len; spinlock_t tx_global_lock; int watchdog_timeo; -- cgit v1.2.3 From 054c67d1c82afde13e475cdd8b7117a5e40bebb1 Mon Sep 17 00:00:00 2001 From: Sudarsana Reddy Kalluru Date: Tue, 9 Aug 2016 03:51:23 -0400 Subject: qed*: Add support for ethtool link_ksettings callbacks. This patch adds the driver implementation for ethtool link_ksettings callbacks. qed driver now defines/uses the qed specific masks for representing link capability values. qede driver maps these values to new link modes defined by the kernel implementation of link_ksettings. Please consider applying this to 'net-next' branch. Signed-off-by: Sudarsana Reddy Kalluru Signed-off-by: Yuval Mintz Signed-off-by: David S. 
Miller --- include/linux/qed/qed_if.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index b1e3c57c7117..737fc4c8db49 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -268,6 +268,21 @@ enum qed_protocol { QED_PROTOCOL_ISCSI, }; +enum qed_link_mode_bits { + QED_LM_FIBRE_BIT = BIT(0), + QED_LM_Autoneg_BIT = BIT(1), + QED_LM_Asym_Pause_BIT = BIT(2), + QED_LM_Pause_BIT = BIT(3), + QED_LM_1000baseT_Half_BIT = BIT(4), + QED_LM_1000baseT_Full_BIT = BIT(5), + QED_LM_10000baseKR_Full_BIT = BIT(6), + QED_LM_25000baseKR_Full_BIT = BIT(7), + QED_LM_40000baseLR4_Full_BIT = BIT(8), + QED_LM_50000baseKR2_Full_BIT = BIT(9), + QED_LM_100000baseKR4_Full_BIT = BIT(10), + QED_LM_COUNT = 11 +}; + struct qed_link_params { bool link_up; -- cgit v1.2.3 From aed704b7a634954dc28fe5c4b49db478cf2d96b7 Mon Sep 17 00:00:00 2001 From: Sargun Dhillon Date: Fri, 12 Aug 2016 08:56:40 -0700 Subject: cgroup: Add task_under_cgroup_hierarchy cgroup inline function to headers This commit adds an inline function to cgroup.h to check whether a given task is under a given cgroup hierarchy. This is to avoid having to put ifdefs in .c files to gate access to cgroups. When cgroups are disabled this always returns true. Signed-off-by: Sargun Dhillon Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Tejun Heo Acked-by: Tejun Heo Signed-off-by: David S. 
Miller --- include/linux/cgroup.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 984f73b719a9..a4414a11eea7 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -497,6 +497,23 @@ static inline bool cgroup_is_descendant(struct cgroup *cgrp, return cgrp->ancestor_ids[ancestor->level] == ancestor->id; } +/** + * task_under_cgroup_hierarchy - test task's membership of cgroup ancestry + * @task: the task to be tested + * @ancestor: possible ancestor of @task's cgroup + * + * Tests whether @task's default cgroup hierarchy is a descendant of @ancestor. + * It follows all the same rules as cgroup_is_descendant, and only applies + * to the default hierarchy. + */ +static inline bool task_under_cgroup_hierarchy(struct task_struct *task, + struct cgroup *ancestor) +{ + struct css_set *cset = task_css_set(task); + + return cgroup_is_descendant(cset->dfl_cgrp, ancestor); +} + /* no synchronization, the result can only be used as a hint */ static inline bool cgroup_is_populated(struct cgroup *cgrp) { @@ -557,6 +574,7 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp) #else /* !CONFIG_CGROUPS */ struct cgroup_subsys_state; +struct cgroup; static inline void css_put(struct cgroup_subsys_state *css) {} static inline int cgroup_attach_task_all(struct task_struct *from, @@ -574,6 +592,11 @@ static inline void cgroup_free(struct task_struct *p) {} static inline int cgroup_init_early(void) { return 0; } static inline int cgroup_init(void) { return 0; } +static inline bool task_under_cgroup_hierarchy(struct task_struct *task, + struct cgroup *ancestor) +{ + return true; +} #endif /* !CONFIG_CGROUPS */ /* -- cgit v1.2.3 From 04ed5ad5db6880d53dd1bb8c93e82228a462a4dd Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Sun, 17 Jul 2016 01:28:47 +0300 Subject: net/mlx5: Init/Teardown hca commands via mlx5 ifc Remove old representation of manually created 
Init/Teardown hca commands layout and use mlx5_ifc canonical structures and defines. Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- include/linux/mlx5/device.h | 24 ------------------------ 1 file changed, 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 0b6d15cddb2f..6c343c0b77d2 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -455,30 +455,6 @@ struct mlx5_odp_caps { char reserved2[0xe4]; }; -struct mlx5_cmd_init_hca_mbox_in { - struct mlx5_inbox_hdr hdr; - u8 rsvd0[2]; - __be16 profile; - u8 rsvd1[4]; -}; - -struct mlx5_cmd_init_hca_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd[8]; -}; - -struct mlx5_cmd_teardown_hca_mbox_in { - struct mlx5_inbox_hdr hdr; - u8 rsvd0[2]; - __be16 profile; - u8 rsvd1[4]; -}; - -struct mlx5_cmd_teardown_hca_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd[8]; -}; - struct mlx5_cmd_layout { u8 type; u8 rsvd0[3]; -- cgit v1.2.3 From 20ed51c643b6296789a48adc3bc2cc875a1612cf Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Sun, 17 Jul 2016 00:46:41 +0300 Subject: net/mlx5: Access register and MAD IFC commands via mlx5 ifc Remove old representation of manually created ACCESS_REG/MAD_IFC commands layout and use mlx5_ifc canonical structures and defines. 
Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- include/linux/mlx5/device.h | 29 ----------------------------- 1 file changed, 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 6c343c0b77d2..9570c493b50f 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1165,35 +1165,6 @@ struct mlx5_dump_mkey_mbox_out { __be32 mkey; }; -struct mlx5_mad_ifc_mbox_in { - struct mlx5_inbox_hdr hdr; - __be16 remote_lid; - u8 rsvd0; - u8 port; - u8 rsvd1[4]; - u8 data[256]; -}; - -struct mlx5_mad_ifc_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd[8]; - u8 data[256]; -}; - -struct mlx5_access_reg_mbox_in { - struct mlx5_inbox_hdr hdr; - u8 rsvd0[2]; - __be16 register_id; - __be32 arg; - __be32 data[0]; -}; - -struct mlx5_access_reg_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd[8]; - __be32 data[0]; -}; - #define MLX5_ATTR_EXTENDED_PORT_INFO cpu_to_be16(0xff90) enum { -- cgit v1.2.3 From 20bb566bda7b3e62b67dbb1bd363be40b5ae81c3 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Sun, 17 Jul 2016 02:01:45 +0300 Subject: net/mlx5: MCG commands via mlx5 ifc Remove old representation of manually created MCG commands layout and use mlx5_ifc canonical structures and defines. 
Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 21bc4557b67a..3f70fc9c2fc9 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -152,7 +152,7 @@ enum { MLX5_CMD_OP_CONFIG_INT_MODERATION = 0x804, MLX5_CMD_OP_ACCESS_REG = 0x805, MLX5_CMD_OP_ATTACH_TO_MCG = 0x806, - MLX5_CMD_OP_DETTACH_FROM_MCG = 0x807, + MLX5_CMD_OP_DETACH_FROM_MCG = 0x807, MLX5_CMD_OP_GET_DROPPED_PACKET_LOG = 0x80a, MLX5_CMD_OP_MAD_IFC = 0x50d, MLX5_CMD_OP_QUERY_MAD_DEMUX = 0x80b, -- cgit v1.2.3 From 73b626c182dff06867ceba996a819e8372c9b2ce Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Sat, 16 Jul 2016 03:26:15 +0300 Subject: net/mlx5: EQ commands via mlx5 ifc Remove old representation of manually created EQ commands layout, and use mlx5_ifc canonical structures and defines. Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- include/linux/mlx5/device.h | 74 --------------------------------------------- include/linux/mlx5/driver.h | 2 +- 2 files changed, 1 insertion(+), 75 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 9570c493b50f..c84e0ba5b261 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -995,80 +995,6 @@ struct mlx5_disable_hca_mbox_out { u8 rsvd[8]; }; -struct mlx5_eq_context { - u8 status; - u8 ec_oi; - u8 st; - u8 rsvd2[7]; - __be16 page_pffset; - __be32 log_sz_usr_page; - u8 rsvd3[7]; - u8 intr; - u8 log_page_size; - u8 rsvd4[15]; - __be32 consumer_counter; - __be32 produser_counter; - u8 rsvd5[16]; -}; - -struct mlx5_create_eq_mbox_in { - struct mlx5_inbox_hdr hdr; - u8 rsvd0[3]; - u8 input_eqn; - u8 rsvd1[4]; - struct mlx5_eq_context ctx; - u8 rsvd2[8]; - __be64 events_mask; - u8 rsvd3[176]; - __be64 pas[0]; -}; - -struct 
mlx5_create_eq_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd0[3]; - u8 eq_number; - u8 rsvd1[4]; -}; - -struct mlx5_destroy_eq_mbox_in { - struct mlx5_inbox_hdr hdr; - u8 rsvd0[3]; - u8 eqn; - u8 rsvd1[4]; -}; - -struct mlx5_destroy_eq_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd[8]; -}; - -struct mlx5_map_eq_mbox_in { - struct mlx5_inbox_hdr hdr; - __be64 mask; - u8 mu; - u8 rsvd0[2]; - u8 eqn; - u8 rsvd1[24]; -}; - -struct mlx5_map_eq_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd[8]; -}; - -struct mlx5_query_eq_mbox_in { - struct mlx5_inbox_hdr hdr; - u8 rsvd0[3]; - u8 eqn; - u8 rsvd1[4]; -}; - -struct mlx5_query_eq_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd[8]; - struct mlx5_eq_context ctx; -}; - enum { MLX5_MKEY_STATUS_FREE = 1 << 6, }; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index ccea6fb16482..eed4b612572d 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -865,7 +865,7 @@ int mlx5_core_access_reg(struct mlx5_core_dev *dev, void *data_in, int mlx5_debug_eq_add(struct mlx5_core_dev *dev, struct mlx5_eq *eq); void mlx5_debug_eq_remove(struct mlx5_core_dev *dev, struct mlx5_eq *eq); int mlx5_core_eq_query(struct mlx5_core_dev *dev, struct mlx5_eq *eq, - struct mlx5_query_eq_mbox_out *out, int outlen); + u32 *out, int outlen); int mlx5_eq_debugfs_init(struct mlx5_core_dev *dev); void mlx5_eq_debugfs_cleanup(struct mlx5_core_dev *dev); int mlx5_cq_debugfs_init(struct mlx5_core_dev *dev); -- cgit v1.2.3 From 278277866334e515141dde7c8ac143e15c0a767f Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Sat, 16 Jul 2016 02:33:22 +0300 Subject: {net,IB}/mlx5: CQ commands via mlx5 ifc Remove old representation of manually created CQ commands layout, and use mlx5_ifc canonical structures and defines. 
Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- include/linux/mlx5/cq.h | 6 ++-- include/linux/mlx5/device.h | 76 --------------------------------------------- 2 files changed, 3 insertions(+), 79 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h index 2566f6d6444f..7c3c0d3aca37 100644 --- a/include/linux/mlx5/cq.h +++ b/include/linux/mlx5/cq.h @@ -170,12 +170,12 @@ static inline void mlx5_cq_arm(struct mlx5_core_cq *cq, u32 cmd, int mlx5_init_cq_table(struct mlx5_core_dev *dev); void mlx5_cleanup_cq_table(struct mlx5_core_dev *dev); int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, - struct mlx5_create_cq_mbox_in *in, int inlen); + u32 *in, int inlen); int mlx5_core_destroy_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq); int mlx5_core_query_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, - struct mlx5_query_cq_mbox_out *out); + u32 *out, int outlen); int mlx5_core_modify_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, - struct mlx5_modify_cq_mbox_in *in, int in_sz); + u32 *in, int inlen); int mlx5_core_modify_cq_moderation(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, u16 cq_period, u16 cq_max_count); diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index c84e0ba5b261..5a1c1606bdbd 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -899,82 +899,6 @@ struct mlx5_arm_srq_mbox_out { u8 rsvd[8]; }; -struct mlx5_cq_context { - u8 status; - u8 cqe_sz_flags; - u8 st; - u8 rsvd3; - u8 rsvd4[6]; - __be16 page_offset; - __be32 log_sz_usr_page; - __be16 cq_period; - __be16 cq_max_count; - __be16 rsvd20; - __be16 c_eqn; - u8 log_pg_sz; - u8 rsvd25[7]; - __be32 last_notified_index; - __be32 solicit_producer_index; - __be32 consumer_counter; - __be32 producer_counter; - u8 rsvd48[8]; - __be64 db_record_addr; -}; - -struct mlx5_create_cq_mbox_in { - struct mlx5_inbox_hdr hdr; - __be32 input_cqn; - u8 
rsvdx[4]; - struct mlx5_cq_context ctx; - u8 rsvd6[192]; - __be64 pas[0]; -}; - -struct mlx5_create_cq_mbox_out { - struct mlx5_outbox_hdr hdr; - __be32 cqn; - u8 rsvd0[4]; -}; - -struct mlx5_destroy_cq_mbox_in { - struct mlx5_inbox_hdr hdr; - __be32 cqn; - u8 rsvd0[4]; -}; - -struct mlx5_destroy_cq_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd0[8]; -}; - -struct mlx5_query_cq_mbox_in { - struct mlx5_inbox_hdr hdr; - __be32 cqn; - u8 rsvd0[4]; -}; - -struct mlx5_query_cq_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd0[8]; - struct mlx5_cq_context ctx; - u8 rsvd6[16]; - __be64 pas[0]; -}; - -struct mlx5_modify_cq_mbox_in { - struct mlx5_inbox_hdr hdr; - __be32 cqn; - __be32 field_select; - struct mlx5_cq_context ctx; - u8 rsvd[192]; - __be64 pas[0]; -}; - -struct mlx5_modify_cq_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd[8]; -}; - struct mlx5_enable_hca_mbox_in { struct mlx5_inbox_hdr hdr; u8 rsvd[8]; -- cgit v1.2.3 From ec22eb53106be1472ba6573dc900943f52f8fd1e Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Sat, 16 Jul 2016 06:28:36 +0300 Subject: {net,IB}/mlx5: MKey/PSV commands via mlx5 ifc Remove old representation of manually created MKey/PSV commands layout, and use mlx5_ifc canonical structures and defines. 
Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- include/linux/mlx5/device.h | 113 +++--------------------------------------- include/linux/mlx5/driver.h | 11 ++-- include/linux/mlx5/mlx5_ifc.h | 2 +- 3 files changed, 15 insertions(+), 111 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 5a1c1606bdbd..fb002db1e2f0 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -197,19 +197,6 @@ enum { MLX5_PCIE_CTRL_TPH_MASK = 3 << 4, }; -enum { - MLX5_ACCESS_MODE_PA = 0, - MLX5_ACCESS_MODE_MTT = 1, - MLX5_ACCESS_MODE_KLM = 2 -}; - -enum { - MLX5_MKEY_REMOTE_INVAL = 1 << 24, - MLX5_MKEY_FLAG_SYNC_UMR = 1 << 29, - MLX5_MKEY_BSF_EN = 1 << 30, - MLX5_MKEY_LEN64 = 1 << 31, -}; - enum { MLX5_EN_RD = (u64)1, MLX5_EN_WR = (u64)2 @@ -923,6 +910,13 @@ enum { MLX5_MKEY_STATUS_FREE = 1 << 6, }; +enum { + MLX5_MKEY_REMOTE_INVAL = 1 << 24, + MLX5_MKEY_FLAG_SYNC_UMR = 1 << 29, + MLX5_MKEY_BSF_EN = 1 << 30, + MLX5_MKEY_LEN64 = 1 << 31, +}; + struct mlx5_mkey_seg { /* This is a two bit field occupying bits 31-30. 
* bit 31 is always 0, @@ -945,105 +939,12 @@ struct mlx5_mkey_seg { u8 rsvd4[4]; }; -struct mlx5_query_special_ctxs_mbox_in { - struct mlx5_inbox_hdr hdr; - u8 rsvd[8]; -}; - -struct mlx5_query_special_ctxs_mbox_out { - struct mlx5_outbox_hdr hdr; - __be32 dump_fill_mkey; - __be32 reserved_lkey; -}; - -struct mlx5_create_mkey_mbox_in { - struct mlx5_inbox_hdr hdr; - __be32 input_mkey_index; - __be32 flags; - struct mlx5_mkey_seg seg; - u8 rsvd1[16]; - __be32 xlat_oct_act_size; - __be32 rsvd2; - u8 rsvd3[168]; - __be64 pas[0]; -}; - -struct mlx5_create_mkey_mbox_out { - struct mlx5_outbox_hdr hdr; - __be32 mkey; - u8 rsvd[4]; -}; - -struct mlx5_destroy_mkey_mbox_in { - struct mlx5_inbox_hdr hdr; - __be32 mkey; - u8 rsvd[4]; -}; - -struct mlx5_destroy_mkey_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd[8]; -}; - -struct mlx5_query_mkey_mbox_in { - struct mlx5_inbox_hdr hdr; - __be32 mkey; -}; - -struct mlx5_query_mkey_mbox_out { - struct mlx5_outbox_hdr hdr; - __be64 pas[0]; -}; - -struct mlx5_modify_mkey_mbox_in { - struct mlx5_inbox_hdr hdr; - __be32 mkey; - __be64 pas[0]; -}; - -struct mlx5_modify_mkey_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd[8]; -}; - -struct mlx5_dump_mkey_mbox_in { - struct mlx5_inbox_hdr hdr; -}; - -struct mlx5_dump_mkey_mbox_out { - struct mlx5_outbox_hdr hdr; - __be32 mkey; -}; - #define MLX5_ATTR_EXTENDED_PORT_INFO cpu_to_be16(0xff90) enum { MLX_EXT_PORT_CAP_FLAG_EXTENDED_PORT_INFO = 1 << 0 }; -struct mlx5_allocate_psv_in { - struct mlx5_inbox_hdr hdr; - __be32 npsv_pd; - __be32 rsvd_psv0; -}; - -struct mlx5_allocate_psv_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd[8]; - __be32 psv_idx[4]; -}; - -struct mlx5_destroy_psv_in { - struct mlx5_inbox_hdr hdr; - __be32 psv_number; - u8 rsvd[4]; -}; - -struct mlx5_destroy_psv_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd[8]; -}; - enum { VPORT_STATE_DOWN = 0x0, VPORT_STATE_UP = 0x1, diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 
eed4b612572d..173817187abb 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -807,15 +807,18 @@ int mlx5_core_arm_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, u16 lwm, int is_srq); void mlx5_init_mkey_table(struct mlx5_core_dev *dev); void mlx5_cleanup_mkey_table(struct mlx5_core_dev *dev); +int mlx5_core_create_mkey_cb(struct mlx5_core_dev *dev, + struct mlx5_core_mkey *mkey, + u32 *in, int inlen, + u32 *out, int outlen, + mlx5_cmd_cbk_t callback, void *context); int mlx5_core_create_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mkey *mkey, - struct mlx5_create_mkey_mbox_in *in, int inlen, - mlx5_cmd_cbk_t callback, void *context, - struct mlx5_create_mkey_mbox_out *out); + u32 *in, int inlen); int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mkey *mkey); int mlx5_core_query_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mkey *mkey, - struct mlx5_query_mkey_mbox_out *out, int outlen); + u32 *out, int outlen); int mlx5_core_dump_fill_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mkey *_mkey, u32 *mkey); int mlx5_core_alloc_pd(struct mlx5_core_dev *dev, u32 *pdn); diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 3f70fc9c2fc9..2a39a06dbad4 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -3489,7 +3489,7 @@ struct mlx5_ifc_query_special_contexts_out_bits { u8 syndrome[0x20]; - u8 reserved_at_40[0x20]; + u8 dump_fill_mkey[0x20]; u8 resd_lkey[0x20]; }; -- cgit v1.2.3 From e79c6a4fc923eed2bdd3b716e0f01414847db90a Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 10 Aug 2016 14:36:02 -0700 Subject: net: make net namespace sysctls belong to container's owner If net namespace is attached to a user namespace let's make container's root owner of sysctls affecting said network namespace instead of global root. 
This also allows us to clean up net_ctl_permissions() because we do not need to fudge permissions anymore for the container's owner since it now owns the objects in question. Acked-by: "Eric W. Biederman" Signed-off-by: Dmitry Torokhov Signed-off-by: David S. Miller --- include/linux/sysctl.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 697e160c78d0..d82cb6011e77 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -25,6 +25,7 @@ #include #include #include +#include #include /* For the /proc/sys support */ @@ -157,6 +158,9 @@ struct ctl_table_root { struct ctl_table_set default_set; struct ctl_table_set *(*lookup)(struct ctl_table_root *root, struct nsproxy *namespaces); + void (*set_ownership)(struct ctl_table_header *head, + struct ctl_table *table, + kuid_t *uid, kgid_t *gid); int (*permissions)(struct ctl_table_header *head, struct ctl_table *table); }; -- cgit v1.2.3 From 09a7d9eca1a6cf5eb4f9abfdf8914db9dbd96f08 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 19 Jul 2016 01:17:59 +0300 Subject: {net,IB}/mlx5: QP/XRCD commands via mlx5 ifc Remove old representation of manually created QP/XRCD commands layout and use mlx5_ifc canonical structures and defines. 
Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 5 +- include/linux/mlx5/qp.h | 108 +++--------------------------------------- 2 files changed, 11 insertions(+), 102 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 2a39a06dbad4..cb94ac5b8420 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1966,7 +1966,10 @@ struct mlx5_ifc_qpc_bits { u8 reserved_at_3e0[0x8]; u8 cqn_snd[0x18]; - u8 reserved_at_400[0x40]; + u8 reserved_at_400[0x8]; + u8 deth_sqpn[0x18]; + + u8 reserved_at_420[0x20]; u8 reserved_at_440[0x8]; u8 last_acked_psn[0x18]; diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 7879bf411891..16e1efecaf66 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -123,12 +123,13 @@ enum { }; enum { - MLX5_NON_ZERO_RQ = 0 << 24, - MLX5_SRQ_RQ = 1 << 24, - MLX5_CRQ_RQ = 2 << 24, - MLX5_ZERO_LEN_RQ = 3 << 24 + MLX5_NON_ZERO_RQ = 0x0, + MLX5_SRQ_RQ = 0x1, + MLX5_CRQ_RQ = 0x2, + MLX5_ZERO_LEN_RQ = 0x3 }; +/* TODO REM */ enum { /* params1 */ MLX5_QP_BIT_SRE = 1 << 15, @@ -177,12 +178,6 @@ enum { MLX5_FENCE_MODE_SMALL_AND_FENCE = 4 << 5, }; -enum { - MLX5_QP_LAT_SENSITIVE = 1 << 28, - MLX5_QP_BLOCK_MCAST = 1 << 30, - MLX5_QP_ENABLE_SIG = 1 << 31, -}; - enum { MLX5_RCV_DBR = 0, MLX5_SND_DBR = 1, @@ -525,34 +520,6 @@ struct mlx5_qp_context { u8 rsvd1[24]; }; -struct mlx5_create_qp_mbox_in { - struct mlx5_inbox_hdr hdr; - __be32 input_qpn; - u8 rsvd0[4]; - __be32 opt_param_mask; - u8 rsvd1[4]; - struct mlx5_qp_context ctx; - u8 rsvd3[16]; - __be64 pas[0]; -}; - -struct mlx5_create_qp_mbox_out { - struct mlx5_outbox_hdr hdr; - __be32 qpn; - u8 rsvd0[4]; -}; - -struct mlx5_destroy_qp_mbox_in { - struct mlx5_inbox_hdr hdr; - __be32 qpn; - u8 rsvd0[4]; -}; - -struct mlx5_destroy_qp_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd0[8]; -}; - struct mlx5_modify_qp_mbox_in { struct mlx5_inbox_hdr 
hdr; __be32 qpn; @@ -568,56 +535,6 @@ struct mlx5_modify_qp_mbox_out { u8 rsvd0[8]; }; -struct mlx5_query_qp_mbox_in { - struct mlx5_inbox_hdr hdr; - __be32 qpn; - u8 rsvd[4]; -}; - -struct mlx5_query_qp_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd1[8]; - __be32 optparam; - u8 rsvd0[4]; - struct mlx5_qp_context ctx; - u8 rsvd2[16]; - __be64 pas[0]; -}; - -struct mlx5_conf_sqp_mbox_in { - struct mlx5_inbox_hdr hdr; - __be32 qpn; - u8 rsvd[3]; - u8 type; -}; - -struct mlx5_conf_sqp_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd[8]; -}; - -struct mlx5_alloc_xrcd_mbox_in { - struct mlx5_inbox_hdr hdr; - u8 rsvd[8]; -}; - -struct mlx5_alloc_xrcd_mbox_out { - struct mlx5_outbox_hdr hdr; - __be32 xrcdn; - u8 rsvd[4]; -}; - -struct mlx5_dealloc_xrcd_mbox_in { - struct mlx5_inbox_hdr hdr; - __be32 xrcdn; - u8 rsvd[4]; -}; - -struct mlx5_dealloc_xrcd_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd[8]; -}; - static inline struct mlx5_core_qp *__mlx5_qp_lookup(struct mlx5_core_dev *dev, u32 qpn) { return radix_tree_lookup(&dev->priv.qp_table.tree, qpn); @@ -628,20 +545,9 @@ static inline struct mlx5_core_mkey *__mlx5_mr_lookup(struct mlx5_core_dev *dev, return radix_tree_lookup(&dev->priv.mkey_table.tree, key); } -struct mlx5_page_fault_resume_mbox_in { - struct mlx5_inbox_hdr hdr; - __be32 flags_qpn; - u8 reserved[4]; -}; - -struct mlx5_page_fault_resume_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd[8]; -}; - int mlx5_core_create_qp(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp, - struct mlx5_create_qp_mbox_in *in, + u32 *in, int inlen); int mlx5_core_qp_modify(struct mlx5_core_dev *dev, u16 operation, struct mlx5_modify_qp_mbox_in *in, int sqd_event, @@ -649,7 +555,7 @@ int mlx5_core_qp_modify(struct mlx5_core_dev *dev, u16 operation, int mlx5_core_destroy_qp(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp); int mlx5_core_qp_query(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp, - struct mlx5_query_qp_mbox_out *out, int outlen); + u32 *out, int 
outlen); int mlx5_core_xrcd_alloc(struct mlx5_core_dev *dev, u32 *xrcdn); int mlx5_core_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn); -- cgit v1.2.3 From 1a412fb1caa2c1b77719ccb5ed8b0c3c2bc65da7 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 19 Jul 2016 18:03:21 +0300 Subject: {net,IB}/mlx5: Modify QP commands via mlx5 ifc Prior to this patch we assumed that modify QP commands have the same layout. In ConnectX-4 for each QP transition there is a specific command and their layout can vary. e.g: 2err/2rst commands don't have QP context in their layout and before this patch we posted the QP context in those commands. Fortunately the FW only checks the suffix of the commands and executes them, while ignoring all invalid data sent after the valid command layout. This patch removes mlx5_modify_qp_mbox_in and changes mlx5_core_qp_modify to receive the required transition and QP context with opt_param_mask if needed. This way the caller is not required to provide the command inbox layout and it will be generated automatically. mlx5_core_qp_modify will generate the command inbox/outbox layouts according to the requested transition and will fill the requested parameters. 
Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- include/linux/mlx5/qp.h | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 16e1efecaf66..0aacb2a7480d 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -479,6 +479,7 @@ struct mlx5_qp_path { u8 rmac[6]; }; +/* FIXME: use mlx5_ifc.h qpc */ struct mlx5_qp_context { __be32 flags; __be32 flags_pd; @@ -520,21 +521,6 @@ struct mlx5_qp_context { u8 rsvd1[24]; }; -struct mlx5_modify_qp_mbox_in { - struct mlx5_inbox_hdr hdr; - __be32 qpn; - u8 rsvd0[4]; - __be32 optparam; - u8 rsvd1[4]; - struct mlx5_qp_context ctx; - u8 rsvd2[16]; -}; - -struct mlx5_modify_qp_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd0[8]; -}; - static inline struct mlx5_core_qp *__mlx5_qp_lookup(struct mlx5_core_dev *dev, u32 qpn) { return radix_tree_lookup(&dev->priv.qp_table.tree, qpn); @@ -549,8 +535,8 @@ int mlx5_core_create_qp(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp, u32 *in, int inlen); -int mlx5_core_qp_modify(struct mlx5_core_dev *dev, u16 operation, - struct mlx5_modify_qp_mbox_in *in, int sqd_event, +int mlx5_core_qp_modify(struct mlx5_core_dev *dev, u16 opcode, + u32 opt_param_mask, void *qpc, struct mlx5_core_qp *qp); int mlx5_core_destroy_qp(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp); -- cgit v1.2.3 From c4f287c4a6ac489c18afc4acc4353141a8c53070 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 19 Jul 2016 20:17:12 +0300 Subject: net/mlx5: Unify and improve command interface Now as all commands use mlx5 ifc interface, instead of doing two calls for executing a command we embed command status checking into mlx5_cmd_exec to simplify the interface. Also we do here some cleanup for redundant software structures (inbox/outbox) and functions and improved command failure output. 
Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- include/linux/mlx5/device.h | 115 -------------------------------------------- include/linux/mlx5/driver.h | 7 +-- 2 files changed, 4 insertions(+), 118 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index fb002db1e2f0..2575070c836e 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -398,33 +398,6 @@ enum { MLX5_MAX_SGE_RD = (512 - 16 - 16) / 16 }; -struct mlx5_inbox_hdr { - __be16 opcode; - u8 rsvd[4]; - __be16 opmod; -}; - -struct mlx5_outbox_hdr { - u8 status; - u8 rsvd[3]; - __be32 syndrome; -}; - -struct mlx5_cmd_query_adapter_mbox_in { - struct mlx5_inbox_hdr hdr; - u8 rsvd[8]; -}; - -struct mlx5_cmd_query_adapter_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd0[24]; - u8 intapin; - u8 rsvd1[13]; - __be16 vsd_vendor_id; - u8 vsd[208]; - u8 vsd_psid[16]; -}; - enum mlx5_odp_transport_cap_bits { MLX5_ODP_SUPPORT_SEND = 1 << 31, MLX5_ODP_SUPPORT_RECV = 1 << 30, @@ -457,7 +430,6 @@ struct mlx5_cmd_layout { u8 status_own; }; - struct health_buffer { __be32 assert_var[5]; __be32 rsvd0[3]; @@ -819,93 +791,6 @@ struct mlx5_cqe128 { struct mlx5_cqe64 cqe64; }; -struct mlx5_srq_ctx { - u8 state_log_sz; - u8 rsvd0[3]; - __be32 flags_xrcd; - __be32 pgoff_cqn; - u8 rsvd1[4]; - u8 log_pg_sz; - u8 rsvd2[7]; - __be32 pd; - __be16 lwm; - __be16 wqe_cnt; - u8 rsvd3[8]; - __be64 db_record; -}; - -struct mlx5_create_srq_mbox_in { - struct mlx5_inbox_hdr hdr; - __be32 input_srqn; - u8 rsvd0[4]; - struct mlx5_srq_ctx ctx; - u8 rsvd1[208]; - __be64 pas[0]; -}; - -struct mlx5_create_srq_mbox_out { - struct mlx5_outbox_hdr hdr; - __be32 srqn; - u8 rsvd[4]; -}; - -struct mlx5_destroy_srq_mbox_in { - struct mlx5_inbox_hdr hdr; - __be32 srqn; - u8 rsvd[4]; -}; - -struct mlx5_destroy_srq_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd[8]; -}; - -struct mlx5_query_srq_mbox_in { - struct mlx5_inbox_hdr hdr; - __be32 srqn; - u8 
rsvd0[4]; -}; - -struct mlx5_query_srq_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd0[8]; - struct mlx5_srq_ctx ctx; - u8 rsvd1[32]; - __be64 pas[0]; -}; - -struct mlx5_arm_srq_mbox_in { - struct mlx5_inbox_hdr hdr; - __be32 srqn; - __be16 rsvd; - __be16 lwm; -}; - -struct mlx5_arm_srq_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd[8]; -}; - -struct mlx5_enable_hca_mbox_in { - struct mlx5_inbox_hdr hdr; - u8 rsvd[8]; -}; - -struct mlx5_enable_hca_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd[8]; -}; - -struct mlx5_disable_hca_mbox_in { - struct mlx5_inbox_hdr hdr; - u8 rsvd[8]; -}; - -struct mlx5_disable_hca_mbox_out { - struct mlx5_outbox_hdr hdr; - u8 rsvd[8]; -}; - enum { MLX5_MKEY_STATUS_FREE = 1 << 6, }; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 173817187abb..ebe57abf3324 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -771,14 +771,15 @@ int mlx5_cmd_init(struct mlx5_core_dev *dev); void mlx5_cmd_cleanup(struct mlx5_core_dev *dev); void mlx5_cmd_use_events(struct mlx5_core_dev *dev); void mlx5_cmd_use_polling(struct mlx5_core_dev *dev); -int mlx5_cmd_status_to_err(struct mlx5_outbox_hdr *hdr); -int mlx5_cmd_status_to_err_v2(void *ptr); -int mlx5_core_get_caps(struct mlx5_core_dev *dev, enum mlx5_cap_type cap_type); + int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out, int out_size); int mlx5_cmd_exec_cb(struct mlx5_core_dev *dev, void *in, int in_size, void *out, int out_size, mlx5_cmd_cbk_t callback, void *context); +void mlx5_cmd_mbox_status(void *out, u8 *status, u32 *syndrome); + +int mlx5_core_get_caps(struct mlx5_core_dev *dev, enum mlx5_cap_type cap_type); int mlx5_cmd_alloc_uar(struct mlx5_core_dev *dev, u32 *uarn); int mlx5_cmd_free_uar(struct mlx5_core_dev *dev, u32 uarn); int mlx5_alloc_uuars(struct mlx5_core_dev *dev, struct mlx5_uuar_info *uuari); -- cgit v1.2.3 From 9def7121bed3be8a9d126c900ca7067647bc4789 Mon Sep 17 00:00:00 2001 From: Hadar 
Hen Zion Date: Wed, 3 Aug 2016 17:27:30 +0300 Subject: net/mlx5: Enable setting minimum inline header mode for VFs Implement the low-level part of the PF side in setting minimum inline header mode for VFs. Signed-off-by: Hadar Hen Zion Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 2 +- include/linux/mlx5/vport.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index cb94ac5b8420..7a8ef0af94e7 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -4724,7 +4724,7 @@ struct mlx5_ifc_modify_nic_vport_field_select_bits { u8 reserved_at_0[0x16]; u8 node_guid[0x1]; u8 port_guid[0x1]; - u8 reserved_at_18[0x1]; + u8 min_inline[0x1]; u8 mtu[0x1]; u8 change_event[0x1]; u8 promisc[0x1]; diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index e087b7d047ac..451b0bde9083 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -45,6 +45,8 @@ int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, u16 vport, u8 *addr); void mlx5_query_nic_vport_min_inline(struct mlx5_core_dev *mdev, u8 *min_inline); +int mlx5_modify_nic_vport_min_inline(struct mlx5_core_dev *mdev, + u16 vport, u8 min_inline); int mlx5_modify_nic_vport_mac_address(struct mlx5_core_dev *dev, u16 vport, u8 *addr); int mlx5_query_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 *mtu); -- cgit v1.2.3 From 7adbde2035c2e5baf2f6a90eba11813db4813a67 Mon Sep 17 00:00:00 2001 From: Hadar Hen Zion Date: Wed, 3 Aug 2016 15:08:33 +0300 Subject: net/mlx5: Update mlx5_ifc.h for vxlan encap/decap Add the required definitions related to vxlan encap/decap. 
Signed-off-by: Hadar Hen Zion Signed-off-by: Ilya Lesokhin Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 105 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 101 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 7a8ef0af94e7..3766110e13ea 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -212,6 +212,8 @@ enum { MLX5_CMD_OP_DEALLOC_FLOW_COUNTER = 0x93a, MLX5_CMD_OP_QUERY_FLOW_COUNTER = 0x93b, MLX5_CMD_OP_MODIFY_FLOW_TABLE = 0x93c, + MLX5_CMD_OP_ALLOC_ENCAP_HEADER = 0x93d, + MLX5_CMD_OP_DEALLOC_ENCAP_HEADER = 0x93e, MLX5_CMD_OP_MAX }; @@ -281,7 +283,9 @@ struct mlx5_ifc_flow_table_prop_layout_bits { u8 modify_root[0x1]; u8 identified_miss_table_mode[0x1]; u8 flow_table_modify[0x1]; - u8 reserved_at_7[0x19]; + u8 encap[0x1]; + u8 decap[0x1]; + u8 reserved_at_9[0x17]; u8 reserved_at_20[0x2]; u8 log_max_ft_size[0x6]; @@ -512,7 +516,15 @@ struct mlx5_ifc_e_switch_cap_bits { u8 nic_vport_node_guid_modify[0x1]; u8 nic_vport_port_guid_modify[0x1]; - u8 reserved_at_20[0x7e0]; + u8 vxlan_encap_decap[0x1]; + u8 nvgre_encap_decap[0x1]; + u8 reserved_at_22[0x9]; + u8 log_max_encap_headers[0x5]; + u8 reserved_2b[0x6]; + u8 max_encap_header_size[0xa]; + + u8 reserved_40[0x7c0]; + }; struct mlx5_ifc_qos_cap_bits { @@ -2067,6 +2079,8 @@ enum { MLX5_FLOW_CONTEXT_ACTION_DROP = 0x2, MLX5_FLOW_CONTEXT_ACTION_FWD_DEST = 0x4, MLX5_FLOW_CONTEXT_ACTION_COUNT = 0x8, + MLX5_FLOW_CONTEXT_ACTION_ENCAP = 0x10, + MLX5_FLOW_CONTEXT_ACTION_DECAP = 0x20, }; struct mlx5_ifc_flow_context_bits { @@ -2086,7 +2100,9 @@ struct mlx5_ifc_flow_context_bits { u8 reserved_at_a0[0x8]; u8 flow_counter_list_size[0x18]; - u8 reserved_at_c0[0x140]; + u8 encap_id[0x20]; + + u8 reserved_at_e0[0x120]; struct mlx5_ifc_fte_match_param_bits match_value; @@ -4216,6 +4232,85 @@ struct mlx5_ifc_query_eq_in_bits { u8 reserved_at_60[0x20]; }; 
+struct mlx5_ifc_encap_header_in_bits { + u8 reserved_at_0[0x5]; + u8 header_type[0x3]; + u8 reserved_at_8[0xe]; + u8 encap_header_size[0xa]; + + u8 reserved_at_20[0x10]; + u8 encap_header[2][0x8]; + + u8 more_encap_header[0][0x8]; +}; + +struct mlx5_ifc_query_encap_header_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0xa0]; + + struct mlx5_ifc_encap_header_in_bits encap_header[0]; +}; + +struct mlx5_ifc_query_encap_header_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 encap_id[0x20]; + + u8 reserved_at_60[0xa0]; +}; + +struct mlx5_ifc_alloc_encap_header_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 encap_id[0x20]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_alloc_encap_header_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0xa0]; + + struct mlx5_ifc_encap_header_in_bits encap_header; +}; + +struct mlx5_ifc_dealloc_encap_header_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_dealloc_encap_header_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_20[0x10]; + u8 op_mod[0x10]; + + u8 encap_id[0x20]; + + u8 reserved_60[0x20]; +}; + struct mlx5_ifc_query_dct_out_bits { u8 status[0x8]; u8 reserved_at_8[0x18]; @@ -6102,7 +6197,9 @@ struct mlx5_ifc_create_flow_table_in_bits { u8 reserved_at_a0[0x20]; - u8 reserved_at_c0[0x4]; + u8 encap_en[0x1]; + u8 decap_en[0x1]; + u8 reserved_at_c2[0x2]; u8 table_miss_mode[0x4]; u8 level[0x8]; u8 reserved_at_d0[0x8]; -- cgit v1.2.3 From 83b502a12e82d0ae97907d415496fbafe044f0ce Mon Sep 17 00:00:00 2001 From: Alex Vesker Date: Thu, 4 Aug 2016 17:32:02 +0300 Subject: net/mlx5: Modify RQ bitmask from mlx5 ifc Use mlx5 ifc MODIFY_BITMASK_VSD in mlx5e_modify_rq_vsd and expose counter set capability bit 
in hca caps structure. Signed-off-by: Alex Vesker Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- include/linux/mlx5/driver.h | 4 ---- include/linux/mlx5/mlx5_ifc.h | 9 ++++++++- 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index ebe57abf3324..0ea78b5edbb2 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -48,10 +48,6 @@ #include #include -enum { - MLX5_RQ_BITMASK_VSD = 1 << 1, -}; - enum { MLX5_BOARD_ID_LEN = 64, MLX5_MAX_NAME_LEN = 16, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 3766110e13ea..e1f8e3491867 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -779,7 +779,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 out_of_seq_cnt[0x1]; u8 vport_counters[0x1]; u8 retransmission_q_counters[0x1]; - u8 reserved_at_183[0x3]; + u8 reserved_at_183[0x1]; + u8 modify_rq_counter_set_id[0x1]; + u8 reserved_at_185[0x1]; u8 max_qp_cnt[0xa]; u8 pkey_table_size[0x10]; @@ -4750,6 +4752,11 @@ struct mlx5_ifc_modify_rq_out_bits { u8 reserved_at_40[0x40]; }; +enum { + MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_VSD = 1ULL << 1, + MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_MODIFY_RQ_COUNTER_SET_ID = 1ULL << 3, +}; + struct mlx5_ifc_modify_rq_in_bits { u8 opcode[0x10]; u8 reserved_at_10[0x10]; -- cgit v1.2.3 From 2e353b3468ecb1d12a44aaf35888f7de47d5c047 Mon Sep 17 00:00:00 2001 From: Artemy Kovalyov Date: Sun, 3 Jul 2016 14:57:33 +0300 Subject: net/mlx5: Update struct mlx5_ifc_xrqc_bits Update struct mlx5_ifc_xrqc_bits according to last specification Signed-off-by: Artemy Kovalyov Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index e1f8e3491867..5f150c849a8f 100644 --- 
a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -2829,7 +2829,7 @@ struct mlx5_ifc_xrqc_bits { struct mlx5_ifc_tag_matching_topology_context_bits tag_matching_topology_context; - u8 reserved_at_180[0x180]; + u8 reserved_at_180[0x200]; struct mlx5_ifc_wq_bits wq; }; -- cgit v1.2.3 From 8cca30a7f914fe363fa9700715619ca5c8cb38cc Mon Sep 17 00:00:00 2001 From: Noa Osherovich Date: Sun, 26 Jun 2016 12:43:24 +0300 Subject: net/mlx5: Expose mlx5e_link_mode The mlx5e_link_mode enumeration will also be used in mlx5_ib for RoCE. This patch moves the enumeration to the mlx5 driver port header file. Signed-off-by: Noa Osherovich Signed-off-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- include/linux/mlx5/port.h | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h index e3012cc64b8a..6f876a4770f6 100644 --- a/include/linux/mlx5/port.h +++ b/include/linux/mlx5/port.h @@ -61,6 +61,39 @@ enum mlx5_an_status { #define MLX5_I2C_ADDR_HIGH 0x51 #define MLX5_EEPROM_PAGE_LENGTH 256 +enum mlx5e_link_mode { + MLX5E_1000BASE_CX_SGMII = 0, + MLX5E_1000BASE_KX = 1, + MLX5E_10GBASE_CX4 = 2, + MLX5E_10GBASE_KX4 = 3, + MLX5E_10GBASE_KR = 4, + MLX5E_20GBASE_KR2 = 5, + MLX5E_40GBASE_CR4 = 6, + MLX5E_40GBASE_KR4 = 7, + MLX5E_56GBASE_R4 = 8, + MLX5E_10GBASE_CR = 12, + MLX5E_10GBASE_SR = 13, + MLX5E_10GBASE_ER = 14, + MLX5E_40GBASE_SR4 = 15, + MLX5E_40GBASE_LR4 = 16, + MLX5E_50GBASE_SR2 = 18, + MLX5E_100GBASE_CR4 = 20, + MLX5E_100GBASE_SR4 = 21, + MLX5E_100GBASE_KR4 = 22, + MLX5E_100GBASE_LR4 = 23, + MLX5E_100BASE_TX = 24, + MLX5E_1000BASE_T = 25, + MLX5E_10GBASE_T = 26, + MLX5E_25GBASE_CR = 27, + MLX5E_25GBASE_KR = 28, + MLX5E_25GBASE_SR = 29, + MLX5E_50GBASE_CR2 = 30, + MLX5E_50GBASE_KR2 = 31, + MLX5E_LINK_MODES_NUMBER, +}; + +#define MLX5E_PROT_MASK(link_mode) (1 << link_mode) + int mlx5_set_port_caps(struct mlx5_core_dev 
*dev, u8 port_num, u32 caps); int mlx5_query_port_ptys(struct mlx5_core_dev *dev, u32 *ptys, int ptys_size, int proto_mask, u8 local_port); -- cgit v1.2.3 From d5beb7f2aff4a60237fd97a98d49a78c9045b8f2 Mon Sep 17 00:00:00 2001 From: Noa Osherovich Date: Thu, 2 Jun 2016 10:47:53 +0300 Subject: net/mlx5: Separate query_port_proto_oper for IB and EN Replaced mlx5_query_port_proto_oper with separate functions per link type. The functions should take different arguments so no point in trying to unite them. Signed-off-by: Noa Osherovich Signed-off-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- include/linux/mlx5/port.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h index 6f876a4770f6..b3065acd20b4 100644 --- a/include/linux/mlx5/port.h +++ b/include/linux/mlx5/port.h @@ -103,9 +103,10 @@ int mlx5_query_port_proto_admin(struct mlx5_core_dev *dev, u32 *proto_admin, int proto_mask); int mlx5_query_port_link_width_oper(struct mlx5_core_dev *dev, u8 *link_width_oper, u8 local_port); -int mlx5_query_port_proto_oper(struct mlx5_core_dev *dev, - u8 *proto_oper, int proto_mask, - u8 local_port); +int mlx5_query_port_ib_proto_oper(struct mlx5_core_dev *dev, + u8 *proto_oper, u8 local_port); +int mlx5_query_port_eth_proto_oper(struct mlx5_core_dev *dev, + u32 *proto_oper, u8 local_port); int mlx5_set_port_ptys(struct mlx5_core_dev *dev, bool an_disable, u32 proto_admin, int proto_mask); void mlx5_toggle_port_link(struct mlx5_core_dev *dev); -- cgit v1.2.3 From 84df61ebc69bdc466180e02d654e9b0284781288 Mon Sep 17 00:00:00 2001 From: Aviv Heller Date: Tue, 10 May 2016 13:47:50 +0300 Subject: net/mlx5: Add HW interfaces used by LAG Exposed LAG commands enum and layouts: - CREATE_LAG HW enters LAG mode: RoCE traffic from port two is received on PF0 core dev. Allows to set tx_affinity (tx port) for QPs and TISes. 
Allows to port remap QPs and TISes, overriding their tx_affinity behavior. - MODIFY_LAG Remap QPs and TISes to another port. - QUERY_LAG Query whether LAG mode is active. - DESTROY_LAG HW exits LAG mode, returning to non-LAG behavior. - CREATE_VPORT_LAG Merge Ethernet flow steering, such that traffic received on port two jumps to PF0 root flow table. Available only in LAG mode. - DESTROY_VPORT_LAG Ethernet flow steering returns to non-LAG behavior. Caps added: - lag_master Driver is in charge of managing the LAG. This is currently the only option. - num_lag_ports LAG is supported only if this field's value is 2. Other fields: - QP/TIS tx port affinity During LAG, this field controls on which port a QP or TIS resides. - TIS strict tx affinity When this field is set, the TIS will not be subject to port remap by CREATE_LAG/MODIFY_LAG. - LAG demux flow table Flow table used for redirecting non user-space traffic back to PF1 root flow table, if the packet was received on port two. Signed-off-by: Aviv Heller Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 166 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 159 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 5f150c849a8f..043d5256b754 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -174,6 +174,12 @@ enum { MLX5_CMD_OP_DELETE_L2_TABLE_ENTRY = 0x82b, MLX5_CMD_OP_SET_WOL_ROL = 0x830, MLX5_CMD_OP_QUERY_WOL_ROL = 0x831, + MLX5_CMD_OP_CREATE_LAG = 0x840, + MLX5_CMD_OP_MODIFY_LAG = 0x841, + MLX5_CMD_OP_QUERY_LAG = 0x842, + MLX5_CMD_OP_DESTROY_LAG = 0x843, + MLX5_CMD_OP_CREATE_VPORT_LAG = 0x844, + MLX5_CMD_OP_DESTROY_VPORT_LAG = 0x845, MLX5_CMD_OP_CREATE_TIR = 0x900, MLX5_CMD_OP_MODIFY_TIR = 0x901, MLX5_CMD_OP_DESTROY_TIR = 0x902, @@ -884,7 +890,10 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 pad_tx_eth_packet[0x1]; u8 reserved_at_263[0x8]; u8 
log_bf_reg_size[0x5]; - u8 reserved_at_270[0x10]; + + u8 reserved_at_270[0xb]; + u8 lag_master[0x1]; + u8 num_lag_ports[0x4]; u8 reserved_at_280[0x10]; u8 max_wqe_sz_sq[0x10]; @@ -1918,7 +1927,7 @@ enum { struct mlx5_ifc_qpc_bits { u8 state[0x4]; - u8 reserved_at_4[0x4]; + u8 lag_tx_port_affinity[0x4]; u8 st[0x8]; u8 reserved_at_10[0x3]; u8 pm_state[0x2]; @@ -2167,7 +2176,11 @@ struct mlx5_ifc_traffic_counter_bits { }; struct mlx5_ifc_tisc_bits { - u8 reserved_at_0[0xc]; + u8 strict_lag_tx_port_affinity[0x1]; + u8 reserved_at_1[0x3]; + u8 lag_tx_port_affinity[0x04]; + + u8 reserved_at_8[0x4]; u8 prio[0x4]; u8 reserved_at_10[0x10]; @@ -4617,7 +4630,9 @@ struct mlx5_ifc_modify_tis_out_bits { struct mlx5_ifc_modify_tis_bitmask_bits { u8 reserved_at_0[0x20]; - u8 reserved_at_20[0x1f]; + u8 reserved_at_20[0x1d]; + u8 lag_tx_port_affinity[0x1]; + u8 strict_lag_tx_port_affinity[0x1]; u8 prio[0x1]; }; @@ -6215,7 +6230,10 @@ struct mlx5_ifc_create_flow_table_in_bits { u8 reserved_at_e0[0x8]; u8 table_miss_id[0x18]; - u8 reserved_at_100[0x100]; + u8 reserved_at_100[0x8]; + u8 lag_master_next_table_id[0x18]; + + u8 reserved_at_120[0x80]; }; struct mlx5_ifc_create_flow_group_out_bits { @@ -7669,7 +7687,8 @@ struct mlx5_ifc_set_flow_table_root_in_bits { }; enum { - MLX5_MODIFY_FLOW_TABLE_MISS_TABLE_ID = 0x1, + MLX5_MODIFY_FLOW_TABLE_MISS_TABLE_ID = (1UL << 0), + MLX5_MODIFY_FLOW_TABLE_LAG_NEXT_TABLE_ID = (1UL << 15), }; struct mlx5_ifc_modify_flow_table_out_bits { @@ -7708,7 +7727,10 @@ struct mlx5_ifc_modify_flow_table_in_bits { u8 reserved_at_e0[0x8]; u8 table_miss_id[0x18]; - u8 reserved_at_100[0x100]; + u8 reserved_at_100[0x8]; + u8 lag_master_next_table_id[0x18]; + + u8 reserved_at_120[0x80]; }; struct mlx5_ifc_ets_tcn_config_reg_bits { @@ -7816,4 +7838,134 @@ struct mlx5_ifc_dcbx_param_bits { u8 error[0x8]; u8 reserved_at_a0[0x160]; }; + +struct mlx5_ifc_lagc_bits { + u8 reserved_at_0[0x1d]; + u8 lag_state[0x3]; + + u8 reserved_at_20[0x14]; + u8 tx_remap_affinity_2[0x4]; 
+ u8 reserved_at_38[0x4]; + u8 tx_remap_affinity_1[0x4]; +}; + +struct mlx5_ifc_create_lag_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_create_lag_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + struct mlx5_ifc_lagc_bits ctx; +}; + +struct mlx5_ifc_modify_lag_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_modify_lag_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x20]; + u8 field_select[0x20]; + + struct mlx5_ifc_lagc_bits ctx; +}; + +struct mlx5_ifc_query_lag_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + struct mlx5_ifc_lagc_bits ctx; +}; + +struct mlx5_ifc_query_lag_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_destroy_lag_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_destroy_lag_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_create_vport_lag_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_create_vport_lag_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_destroy_vport_lag_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_destroy_vport_lag_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 
op_mod[0x10]; + + u8 reserved_at_40[0x40]; +}; + #endif /* MLX5_IFC_H */ -- cgit v1.2.3 From 7907f23adc186700efbe56c032527e47485c86ab Mon Sep 17 00:00:00 2001 From: Aviv Heller Date: Sun, 17 Apr 2016 16:57:32 +0300 Subject: net/mlx5: Implement RoCE LAG feature Available on dual port cards only, this feature keeps track, using netdev LAG events, of the bonding and link status of each port's PF netdev. When both of the card's PF netdevs are enslaved to the same bond/team master, and only them, LAG state is active. During LAG, only one IB device is present for both ports. In addition to the above, this commit includes FW commands used for managing the LAG, new facilities for adding and removing a single device by interface, and port remap functionality according to bond events. Please note that this feature is currently used only for mimicking Ethernet bonding for RoCE - netdevs functionality is not altered, and their bonding continues to be managed solely by bond/team driver. Signed-off-by: Aviv Heller Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- include/linux/mlx5/driver.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 0ea78b5edbb2..ed983b8c3213 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -477,6 +477,7 @@ struct mlx5_fc_stats { }; struct mlx5_eswitch; +struct mlx5_lag; struct mlx5_rl_entry { u32 rate; @@ -550,6 +551,7 @@ struct mlx5_priv { struct mlx5_flow_steering *steering; struct mlx5_eswitch *eswitch; struct mlx5_core_sriov sriov; + struct mlx5_lag *lag; unsigned long pci_dev_data; struct mlx5_fc_stats fc_stats; struct mlx5_rl_table rl_table; @@ -942,6 +944,8 @@ int mlx5_register_interface(struct mlx5_interface *intf); void mlx5_unregister_interface(struct mlx5_interface *intf); int mlx5_core_query_vendor_id(struct mlx5_core_dev *mdev, u32 *vendor_id); +bool mlx5_lag_is_active(struct mlx5_core_dev *dev); + 
struct mlx5_profile { u64 mask; u8 log_max_qp; -- cgit v1.2.3 From 6a32047a441b870dd2570fe0831dada5e9ce40f6 Mon Sep 17 00:00:00 2001 From: Aviv Heller Date: Mon, 9 May 2016 11:06:44 +0000 Subject: net/mlx5: Get RoCE netdev Used by IB driver for determining the IB bond device's netdev, when LAG is active. Returns PF0's netdev if mode is not active-backup, or the PF netdev of the active slave when mode is active-backup. Signed-off-by: Aviv Heller Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- include/linux/mlx5/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index ed983b8c3213..c568dd927330 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -945,6 +945,7 @@ void mlx5_unregister_interface(struct mlx5_interface *intf); int mlx5_core_query_vendor_id(struct mlx5_core_dev *mdev, u32 *vendor_id); bool mlx5_lag_is_active(struct mlx5_core_dev *dev); +struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev); struct mlx5_profile { u64 mask; -- cgit v1.2.3 From aaff1bea16bb7f259a263c3ae4633d092e2da799 Mon Sep 17 00:00:00 2001 From: Aviv Heller Date: Mon, 9 May 2016 09:57:05 +0000 Subject: net/mlx5: LAG demux flow table support Add interfaces to allow the creation and destruction of a LAG demux flow table. It is a special flow table used during LAG for redirecting non user-mode packets from PF0 to PF1 root ft, if a packet was received on phys port two. 
Signed-off-by: Aviv Heller Reviewed-by: Maor Gottlieb Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- include/linux/mlx5/fs.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index e036d6030867..7edfe0b8f1ec 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -106,6 +106,9 @@ mlx5_create_vport_flow_table(struct mlx5_flow_namespace *ns, int prio, int num_flow_table_entries, u32 level, u16 vport); +struct mlx5_flow_table *mlx5_create_lag_demux_flow_table( + struct mlx5_flow_namespace *ns, + int prio, u32 level); int mlx5_destroy_flow_table(struct mlx5_flow_table *ft); /* inbox should be set with the following values: -- cgit v1.2.3 From 3e75d4ebaae7aac5ba82fc7a6e0e6fb56dac1916 Mon Sep 17 00:00:00 2001 From: Aviv Heller Date: Mon, 9 May 2016 10:02:29 +0000 Subject: net/mlx5: Add LAG flow steering namespace This namespace is used for LAG demux flowtable. The idea is to position the LAG demux ft between bypass and kernel flowtables, allowing raw-eth traffic from both ports to be received by the PF0 IB device. 
Signed-off-by: Aviv Heller Reviewed-by: Maor Gottlieb Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- include/linux/mlx5/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 7edfe0b8f1ec..8803212fc3aa 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -54,6 +54,7 @@ static inline void build_leftovers_ft_param(int *priority, enum mlx5_flow_namespace_type { MLX5_FLOW_NAMESPACE_BYPASS, + MLX5_FLOW_NAMESPACE_LAG, MLX5_FLOW_NAMESPACE_OFFLOADS, MLX5_FLOW_NAMESPACE_ETHTOOL, MLX5_FLOW_NAMESPACE_KERNEL, -- cgit v1.2.3 From 3bc34f3bcb087764796d9a6eaa476e270114eb8f Mon Sep 17 00:00:00 2001 From: Aviv Heller Date: Mon, 9 May 2016 10:38:42 +0000 Subject: net/mlx5: Vport LAG creation support Add interfaces for issuing CREATE_VPORT_LAG and DESTROY_VPORT_LAG commands. Used for receiving PF1's eth traffic on PF0's root ft. Signed-off-by: Aviv Heller Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- include/linux/mlx5/driver.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index c568dd927330..5cb9fa7aec61 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -944,6 +944,8 @@ int mlx5_register_interface(struct mlx5_interface *intf); void mlx5_unregister_interface(struct mlx5_interface *intf); int mlx5_core_query_vendor_id(struct mlx5_core_dev *mdev, u32 *vendor_id); +int mlx5_cmd_create_vport_lag(struct mlx5_core_dev *dev); +int mlx5_cmd_destroy_vport_lag(struct mlx5_core_dev *dev); bool mlx5_lag_is_active(struct mlx5_core_dev *dev); struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev); -- cgit v1.2.3 From cea824d416522ce63d83b45fc0dc53c0f5b68cee Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Tue, 31 May 2016 14:09:09 +0300 Subject: net/mlx5: Introduce sniffer steering hardware capabilities Define needed hardware 
capabilities for sniffer RX and TX flow tables. Add the following capabilities: 1. Sniffer RX flow table capabilities. 2. Sniffer TX flow table capabilities. 3. If same TIR can be used by multiple flow tables of different types. Signed-off-by: Maor Gottlieb Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- include/linux/mlx5/device.h | 12 ++++++++++++ include/linux/mlx5/mlx5_ifc.h | 4 +++- 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 2575070c836e..77c141797152 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -964,6 +964,18 @@ enum mlx5_cap_type { #define MLX5_CAP_FLOWTABLE_NIC_RX_MAX(mdev, cap) \ MLX5_CAP_FLOWTABLE_MAX(mdev, flow_table_properties_nic_receive.cap) +#define MLX5_CAP_FLOWTABLE_SNIFFER_RX(mdev, cap) \ + MLX5_CAP_FLOWTABLE(mdev, flow_table_properties_nic_receive_sniffer.cap) + +#define MLX5_CAP_FLOWTABLE_SNIFFER_RX_MAX(mdev, cap) \ + MLX5_CAP_FLOWTABLE_MAX(mdev, flow_table_properties_nic_receive_sniffer.cap) + +#define MLX5_CAP_FLOWTABLE_SNIFFER_TX(mdev, cap) \ + MLX5_CAP_FLOWTABLE(mdev, flow_table_properties_nic_transmit_sniffer.cap) + +#define MLX5_CAP_FLOWTABLE_SNIFFER_TX_MAX(mdev, cap) \ + MLX5_CAP_FLOWTABLE_MAX(mdev, flow_table_properties_nic_transmit_sniffer.cap) + #define MLX5_CAP_ESW_FLOWTABLE(mdev, cap) \ MLX5_GET(flow_table_eswitch_cap, \ mdev->hca_caps_cur[MLX5_CAP_ESWITCH_FLOW_TABLE], cap) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 043d5256b754..73a720f74a69 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -483,7 +483,9 @@ struct mlx5_ifc_ads_bits { struct mlx5_ifc_flow_table_nic_cap_bits { u8 nic_rx_multi_path_tirs[0x1]; - u8 reserved_at_1[0x1ff]; + u8 nic_rx_multi_path_tirs_fts[0x1]; + u8 allow_sniffer_and_nic_rx_shared_tir[0x1]; + u8 reserved_at_3[0x1fd]; struct mlx5_ifc_flow_table_prop_layout_bits 
flow_table_properties_nic_receive; -- cgit v1.2.3 From 87d22483ce68e609818d61e3a65361f5634c6cd6 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Mon, 6 Jun 2016 18:09:35 +0300 Subject: net/mlx5: Add sniffer namespaces Add sniffer TX and RX namespaces to receive ingoing and outgoing traffic. Each outgoing/incoming packet is duplicated and steered to the sniffer TX/RX namespace in addition to the regular flow. Signed-off-by: Maor Gottlieb Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- include/linux/mlx5/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 8803212fc3aa..93ebc5e21334 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -63,6 +63,8 @@ enum mlx5_flow_namespace_type { MLX5_FLOW_NAMESPACE_FDB, MLX5_FLOW_NAMESPACE_ESW_EGRESS, MLX5_FLOW_NAMESPACE_ESW_INGRESS, + MLX5_FLOW_NAMESPACE_SNIFFER_RX, + MLX5_FLOW_NAMESPACE_SNIFFER_TX, }; struct mlx5_flow_table; -- cgit v1.2.3 From d194fd265e78ca1b2a4607918778446de44818b2 Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Fri, 19 Aug 2016 08:34:57 +0300 Subject: qed*: Fix pause setting When moving into using ethtool's link_ksetting, qed started supplying its own bitmask of speed/capabilities, but qede is still checking for the SUPPORTED value to determine whether it supports pause. Fixes: 054c67d1c82a ("qed*: Add support for ethtool link_ksettings callbacks") Signed-off-by: Yuval Mintz Signed-off-by: David S. 
Miller --- include/linux/qed/qed_if.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index 3ed7d20e3811..d8dc5c2243d5 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -318,9 +318,11 @@ struct qed_link_params { struct qed_link_output { bool link_up; - u32 supported_caps; /* In SUPPORTED defs */ - u32 advertised_caps; /* In ADVERTISED defs */ - u32 lp_caps; /* In ADVERTISED defs */ + /* In QED_LM_* defs */ + u32 supported_caps; + u32 advertised_caps; + u32 lp_caps; + u32 speed; /* In Mb/s */ u8 duplex; /* In DUPLEX defs */ u8 port; /* In PORT defs */ -- cgit v1.2.3 From f6a66927692e30bdc1792e7a1fc2107d4dfcf42d Mon Sep 17 00:00:00 2001 From: Hadar Hen Zion Date: Wed, 17 Aug 2016 13:36:11 +0300 Subject: flow_dissector: Get vlan priority in addition to vlan id Add vlan priority check to the flow dissector by adding new flow dissector struct, flow_dissector_key_vlan which includes vlan tag fields. vlan_id and flow_label fields were under the same struct (flow_dissector_key_tags). It was a convenient setting since struct flow_dissector_key_tags is used by struct flow_keys and by setting vlan_id and flow_label under the same struct, we get precisely 24 or 48 bytes in flow_keys from flow_dissector_key_basic. Now, when adding vlan priority support, the code will be cleaner if flow_label and vlan tag won't be under the same struct anymore. Signed-off-by: Hadar Hen Zion Acked-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/linux/if_vlan.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index a5f6ce6b578c..49d4aef1f789 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -81,6 +81,7 @@ static inline bool is_vlan_dev(const struct net_device *dev) #define skb_vlan_tag_present(__skb) ((__skb)->vlan_tci & VLAN_TAG_PRESENT) #define skb_vlan_tag_get(__skb) ((__skb)->vlan_tci & ~VLAN_TAG_PRESENT) #define skb_vlan_tag_get_id(__skb) ((__skb)->vlan_tci & VLAN_VID_MASK) +#define skb_vlan_tag_get_prio(__skb) ((__skb)->vlan_tci & VLAN_PRIO_MASK) /** * struct vlan_pcpu_stats - VLAN percpu rx/tx stats -- cgit v1.2.3 From 1cb94db3d1bfe0075bde78fb2989f17e0a8a3936 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Wed, 17 Aug 2016 23:00:30 +0200 Subject: net: bgmac: support Ethernet core on BCM53573 SoCs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BCM53573 is a new series of Broadcom's SoCs. It's based on ARM and can be found in two packages (versions): BCM53573 and BCM47189. It shares some code with the Northstar family, but also requires some new quirks. First of all there can be up to 2 Ethernet cores on this SoC. If that is the case, they are connected to two different switch ports allowing some more complex/optimized setups. It seems the second unit doesn't come fully configured and requires some IRQ quirk. Other than that only the first core is connected to the PHY. For the second one we have to register fixed PHY (similarly to the Northstar), otherwise generic PHY driver would get some invalid info. This has been successfully tested on Tenda AC9 (BCM47189B0). Signed-off-by: Rafał Miłecki Signed-off-by: David S. 
Miller --- include/linux/bcma/bcma.h | 3 +++ include/linux/bcma/bcma_regs.h | 1 + 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bcma/bcma.h b/include/linux/bcma/bcma.h index 3db25df396cb..8eeedb2db924 100644 --- a/include/linux/bcma/bcma.h +++ b/include/linux/bcma/bcma.h @@ -205,6 +205,9 @@ struct bcma_host_ops { #define BCMA_PKG_ID_BCM4709 0 #define BCMA_CHIP_ID_BCM47094 53030 #define BCMA_CHIP_ID_BCM53018 53018 +#define BCMA_CHIP_ID_BCM53573 53573 +#define BCMA_PKG_ID_BCM53573 0 +#define BCMA_PKG_ID_BCM47189 1 /* Board types (on PCI usually equals to the subsystem dev id) */ /* BCM4313 */ diff --git a/include/linux/bcma/bcma_regs.h b/include/linux/bcma/bcma_regs.h index ebd5c1fcdea4..c607fce6aadd 100644 --- a/include/linux/bcma/bcma_regs.h +++ b/include/linux/bcma/bcma_regs.h @@ -23,6 +23,7 @@ #define BCMA_CLKCTLST_4328A0_HAVEALP 0x00020000 /* 4328a0 has reversed bits */ /* Agent registers (common for every core) */ +#define BCMA_OOB_SEL_OUT_A30 0x0100 #define BCMA_IOCTL 0x0408 /* IO control */ #define BCMA_IOCTL_CLK 0x0001 #define BCMA_IOCTL_FGC 0x0002 -- cgit v1.2.3 From 5293efe62df81908f2e90c9820c7edcc8e61f5e9 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 18 Aug 2016 01:00:39 +0200 Subject: bpf: add bpf_skb_change_tail helper This work adds a bpf_skb_change_tail() helper for tc BPF programs. The basic idea is to expand or shrink the skb in a controlled manner. The eBPF program can then rewrite the rest via helpers like bpf_skb_store_bytes(), bpf_lX_csum_replace() and others rather than passing a raw buffer for writing here. bpf_skb_change_tail() is really a slow path helper and intended for replies with f.e. ICMP control messages. Concept is similar to other helpers like bpf_skb_change_proto() helper to keep the helper without protocol specifics and let the BPF program mangle the remaining parts. A flags field has been added and is reserved for now should we extend the helper in future. 
Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/skbuff.h | 43 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 0f665cb26b50..7047448e8129 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2295,7 +2295,7 @@ static inline int pskb_network_may_pull(struct sk_buff *skb, unsigned int len) int ___pskb_trim(struct sk_buff *skb, unsigned int len); -static inline void __skb_trim(struct sk_buff *skb, unsigned int len) +static inline void __skb_set_length(struct sk_buff *skb, unsigned int len) { if (unlikely(skb_is_nonlinear(skb))) { WARN_ON(1); @@ -2305,6 +2305,11 @@ static inline void __skb_trim(struct sk_buff *skb, unsigned int len) skb_set_tail_pointer(skb, len); } +static inline void __skb_trim(struct sk_buff *skb, unsigned int len) +{ + __skb_set_length(skb, len); +} + void skb_trim(struct sk_buff *skb, unsigned int len); static inline int __pskb_trim(struct sk_buff *skb, unsigned int len) @@ -2335,6 +2340,20 @@ static inline void pskb_trim_unique(struct sk_buff *skb, unsigned int len) BUG_ON(err); } +static inline int __skb_grow(struct sk_buff *skb, unsigned int len) +{ + unsigned int diff = len - skb->len; + + if (skb_tailroom(skb) < diff) { + int ret = pskb_expand_head(skb, 0, diff - skb_tailroom(skb), + GFP_ATOMIC); + if (ret) + return ret; + } + __skb_set_length(skb, len); + return 0; +} + /** * skb_orphan - orphan a buffer * @skb: buffer to orphan @@ -2938,6 +2957,21 @@ static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len) return __pskb_trim(skb, len); } +static inline int __skb_trim_rcsum(struct sk_buff *skb, unsigned int len) +{ + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = CHECKSUM_NONE; + __skb_trim(skb, len); + return 0; +} + +static inline int __skb_grow_rcsum(struct sk_buff *skb, unsigned int len) +{ + if 
(skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = CHECKSUM_NONE; + return __skb_grow(skb, len); +} + #define skb_queue_walk(queue, skb) \ for (skb = (queue)->next; \ skb != (struct sk_buff *)(queue); \ @@ -3726,6 +3760,13 @@ static inline bool skb_is_gso_v6(const struct sk_buff *skb) return skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6; } +static inline void skb_gso_reset(struct sk_buff *skb) +{ + skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_segs = 0; + skb_shinfo(skb)->gso_type = 0; +} + void __skb_warn_lro_forwarding(const struct sk_buff *skb); static inline bool skb_warn_if_lro(const struct sk_buff *skb) -- cgit v1.2.3 From 246779dd090bd1b74d2652b3a6ca7759f593b27a Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 18 Aug 2016 16:50:56 +0800 Subject: rhashtable: Remove GFP flag from rhashtable_walk_init The commit 8f6fd83c6c5ec66a4a70c728535ddcdfef4f3697 ("rhashtable: accept GFP flags in rhashtable_walk_init") added a GFP flag argument to rhashtable_walk_init because some users wish to use the walker in an unsleepable context. In fact we don't need to allocate memory in rhashtable_walk_init at all. The walker is always paired with an iterator so we could just stash ourselves there. This patch does that by introducing a new enter function to replace the existing init function. This way we don't have to churn all the existing users again. Signed-off-by: Herbert Xu Signed-off-by: David S. 
Miller --- include/linux/rhashtable.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 3eef0802a0cd..8b72ee710f95 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -173,7 +173,7 @@ struct rhashtable_walker { struct rhashtable_iter { struct rhashtable *ht; struct rhash_head *p; - struct rhashtable_walker *walker; + struct rhashtable_walker walker; unsigned int slot; unsigned int skip; }; @@ -346,8 +346,8 @@ struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht, struct bucket_table *old_tbl); int rhashtable_insert_rehash(struct rhashtable *ht, struct bucket_table *tbl); -int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter, - gfp_t gfp); +void rhashtable_walk_enter(struct rhashtable *ht, + struct rhashtable_iter *iter); void rhashtable_walk_exit(struct rhashtable_iter *iter); int rhashtable_walk_start(struct rhashtable_iter *iter) __acquires(RCU); void *rhashtable_walk_next(struct rhashtable_iter *iter); @@ -906,4 +906,12 @@ static inline int rhashtable_replace_fast( return err; } +/* Obsolete function, do not use in new code. */ +static inline int rhashtable_walk_init(struct rhashtable *ht, + struct rhashtable_iter *iter, gfp_t gfp) +{ + rhashtable_walk_enter(ht, iter); + return 0; +} + #endif /* _LINUX_RHASHTABLE_H */ -- cgit v1.2.3 From 05fafbfb3d77f43ae18341ddc61eb5c477896778 Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Fri, 19 Aug 2016 09:33:31 +0300 Subject: qed: utilize FW 8.10.10.0 This new firmware for the qed* adapters fixes several issues: - Better blocking of malicious VFs. - After FLR, Tx-switching [internal routing] of packets might be incorrect. - Deletion of unicast MAC filters would sometimes have the side-effect of corrupting the MAC filters configured for a device.
It also contains fixes for future qed* drivers that *hopefully* would be sent for review in the near future. In addition, it would allow the driver some new functionality, including: - Allowing PF/VF driver compatibility with old drivers [running pre-8.10.5.0 firmware]. - Better debug facilities. This would also bump the qed* driver versions to 8.10.9.20. Signed-off-by: Yuval Mintz Signed-off-by: David S. Miller --- include/linux/qed/common_hsi.h | 361 +++++++++++++++++++++++++++++++++++---- include/linux/qed/eth_common.h | 155 ++++++++++------- include/linux/qed/iscsi_common.h | 28 +-- include/linux/qed/qed_chain.h | 13 -- include/linux/qed/tcp_common.h | 16 +- 5 files changed, 442 insertions(+), 131 deletions(-) (limited to 'include/linux') diff --git a/include/linux/qed/common_hsi.h b/include/linux/qed/common_hsi.h index 40c0ada01806..d306e0b55581 100644 --- a/include/linux/qed/common_hsi.h +++ b/include/linux/qed/common_hsi.h @@ -5,28 +5,83 @@ * (GPL) Version 2, available from the file COPYING in the main directory of * this source tree. */ +#ifndef _COMMON_HSI_H +#define _COMMON_HSI_H +#include +#include +#include +#include + +/* dma_addr_t manip */ +#define DMA_LO(x) ((u32)(((dma_addr_t)(x)) & 0xffffffff)) +#define DMA_HI(x) ((u32)(((dma_addr_t)(x)) >> 32)) + +#define DMA_LO_LE(x) cpu_to_le32(DMA_LO(x)) +#define DMA_HI_LE(x) cpu_to_le32(DMA_HI(x)) + +/* It's assumed that whoever includes this has previously included an hsi + * file defining the regpair.
+ */ +#define DMA_REGPAIR_LE(x, val) (x).hi = DMA_HI_LE((val)); \ + (x).lo = DMA_LO_LE((val)) + +#define HILO_GEN(hi, lo, type) ((((type)(hi)) << 32) + (lo)) +#define HILO_DMA(hi, lo) HILO_GEN(hi, lo, dma_addr_t) +#define HILO_64(hi, lo) HILO_GEN(hi, lo, u64) +#define HILO_DMA_REGPAIR(regpair) (HILO_DMA(regpair.hi, regpair.lo)) +#define HILO_64_REGPAIR(regpair) (HILO_64(regpair.hi, regpair.lo)) #ifndef __COMMON_HSI__ #define __COMMON_HSI__ -#define CORE_SPQE_PAGE_SIZE_BYTES 4096 #define X_FINAL_CLEANUP_AGG_INT 1 + +#define EVENT_RING_PAGE_SIZE_BYTES 4096 + #define NUM_OF_GLOBAL_QUEUES 128 +#define COMMON_QUEUE_ENTRY_MAX_BYTE_SIZE 64 + +#define ISCSI_CDU_TASK_SEG_TYPE 0 +#define RDMA_CDU_TASK_SEG_TYPE 1 + +#define FW_ASSERT_GENERAL_ATTN_IDX 32 + +#define MAX_PINNED_CCFC 32 /* Queue Zone sizes in bytes */ #define TSTORM_QZONE_SIZE 8 -#define MSTORM_QZONE_SIZE 0 +#define MSTORM_QZONE_SIZE 16 #define USTORM_QZONE_SIZE 8 #define XSTORM_QZONE_SIZE 8 #define YSTORM_QZONE_SIZE 0 #define PSTORM_QZONE_SIZE 0 -#define ETH_MAX_NUM_RX_QUEUES_PER_VF 16 +#define MSTORM_VF_ZONE_DEFAULT_SIZE_LOG 7 +#define ETH_MAX_NUM_RX_QUEUES_PER_VF_DEFAULT 16 +#define ETH_MAX_NUM_RX_QUEUES_PER_VF_DOUBLE 48 +#define ETH_MAX_NUM_RX_QUEUES_PER_VF_QUAD 112 + +/********************************/ +/* CORE (LIGHT L2) FW CONSTANTS */ +/********************************/ + +#define CORE_LL2_MAX_RAMROD_PER_CON 8 +#define CORE_LL2_TX_BD_PAGE_SIZE_BYTES 4096 +#define CORE_LL2_RX_BD_PAGE_SIZE_BYTES 4096 +#define CORE_LL2_RX_CQE_PAGE_SIZE_BYTES 4096 +#define CORE_LL2_RX_NUM_NEXT_PAGE_BDS 1 + +#define CORE_LL2_TX_MAX_BDS_PER_PACKET 12 + +#define CORE_SPQE_PAGE_SIZE_BYTES 4096 + +#define MAX_NUM_LL2_RX_QUEUES 32 +#define MAX_NUM_LL2_TX_STATS_COUNTERS 32 #define FW_MAJOR_VERSION 8 #define FW_MINOR_VERSION 10 -#define FW_REVISION_VERSION 5 +#define FW_REVISION_VERSION 10 #define FW_ENGINEERING_VERSION 0 /***********************/ @@ -83,6 +138,17 @@ #define NUM_OF_LCIDS (320) #define NUM_OF_LTIDS (320) +/* Clock 
values */ +#define MASTER_CLK_FREQ_E4 (375e6) +#define STORM_CLK_FREQ_E4 (1000e6) +#define CLK25M_CLK_FREQ_E4 (25e6) + +/* Global PXP windows (GTT) */ +#define NUM_OF_GTT 19 +#define GTT_DWORD_SIZE_BITS 10 +#define GTT_BYTE_SIZE_BITS (GTT_DWORD_SIZE_BITS + 2) +#define GTT_DWORD_SIZE BIT(GTT_DWORD_SIZE_BITS) + /*****************/ /* CDU CONSTANTS */ /*****************/ @@ -90,6 +156,8 @@ #define CDU_SEG_TYPE_OFFSET_REG_TYPE_SHIFT (17) #define CDU_SEG_TYPE_OFFSET_REG_OFFSET_MASK (0x1ffff) +#define CDU_VF_FL_SEG_TYPE_OFFSET_REG_TYPE_SHIFT (12) +#define CDU_VF_FL_SEG_TYPE_OFFSET_REG_OFFSET_MASK (0xfff) /*****************/ /* DQ CONSTANTS */ /*****************/ @@ -115,6 +183,11 @@ #define DQ_XCM_ETH_TX_BD_CONS_CMD DQ_XCM_AGG_VAL_SEL_WORD3 #define DQ_XCM_ETH_TX_BD_PROD_CMD DQ_XCM_AGG_VAL_SEL_WORD4 #define DQ_XCM_ETH_GO_TO_BD_CONS_CMD DQ_XCM_AGG_VAL_SEL_WORD5 +#define DQ_XCM_ISCSI_SQ_CONS_CMD DQ_XCM_AGG_VAL_SEL_WORD3 +#define DQ_XCM_ISCSI_SQ_PROD_CMD DQ_XCM_AGG_VAL_SEL_WORD4 +#define DQ_XCM_ISCSI_MORE_TO_SEND_SEQ_CMD DQ_XCM_AGG_VAL_SEL_REG3 +#define DQ_XCM_ISCSI_EXP_STAT_SN_CMD DQ_XCM_AGG_VAL_SEL_REG6 +#define DQ_XCM_ROCE_SQ_PROD_CMD DQ_XCM_AGG_VAL_SEL_WORD4 /* UCM agg val selection (HW) */ #define DQ_UCM_AGG_VAL_SEL_WORD0 0 @@ -159,13 +232,16 @@ #define DQ_XCM_AGG_FLG_SHIFT_CF23 7 /* XCM agg counter flag selection */ -#define DQ_XCM_CORE_DQ_CF_CMD (1 << DQ_XCM_AGG_FLG_SHIFT_CF18) -#define DQ_XCM_CORE_TERMINATE_CMD (1 << DQ_XCM_AGG_FLG_SHIFT_CF19) -#define DQ_XCM_CORE_SLOW_PATH_CMD (1 << DQ_XCM_AGG_FLG_SHIFT_CF22) -#define DQ_XCM_ETH_DQ_CF_CMD (1 << DQ_XCM_AGG_FLG_SHIFT_CF18) -#define DQ_XCM_ETH_TERMINATE_CMD (1 << DQ_XCM_AGG_FLG_SHIFT_CF19) -#define DQ_XCM_ETH_SLOW_PATH_CMD (1 << DQ_XCM_AGG_FLG_SHIFT_CF22) -#define DQ_XCM_ETH_TPH_EN_CMD (1 << DQ_XCM_AGG_FLG_SHIFT_CF23) +#define DQ_XCM_CORE_DQ_CF_CMD BIT(DQ_XCM_AGG_FLG_SHIFT_CF18) +#define DQ_XCM_CORE_TERMINATE_CMD BIT(DQ_XCM_AGG_FLG_SHIFT_CF19) +#define DQ_XCM_CORE_SLOW_PATH_CMD BIT(DQ_XCM_AGG_FLG_SHIFT_CF22) +#define 
DQ_XCM_ETH_DQ_CF_CMD BIT(DQ_XCM_AGG_FLG_SHIFT_CF18) +#define DQ_XCM_ETH_TERMINATE_CMD BIT(DQ_XCM_AGG_FLG_SHIFT_CF19) +#define DQ_XCM_ETH_SLOW_PATH_CMD BIT(DQ_XCM_AGG_FLG_SHIFT_CF22) +#define DQ_XCM_ETH_TPH_EN_CMD BIT(DQ_XCM_AGG_FLG_SHIFT_CF23) +#define DQ_XCM_ISCSI_DQ_FLUSH_CMD BIT(DQ_XCM_AGG_FLG_SHIFT_CF19) +#define DQ_XCM_ISCSI_SLOW_PATH_CMD BIT(DQ_XCM_AGG_FLG_SHIFT_CF22) +#define DQ_XCM_ISCSI_PROC_ONLY_CLEANUP_CMD BIT(DQ_XCM_AGG_FLG_SHIFT_CF23) /* UCM agg counter flag selection (HW) */ #define DQ_UCM_AGG_FLG_SHIFT_CF0 0 @@ -178,9 +254,45 @@ #define DQ_UCM_AGG_FLG_SHIFT_RULE1EN 7 /* UCM agg counter flag selection (FW) */ -#define DQ_UCM_ETH_PMD_TX_ARM_CMD (1 << DQ_UCM_AGG_FLG_SHIFT_CF4) -#define DQ_UCM_ETH_PMD_RX_ARM_CMD (1 << DQ_UCM_AGG_FLG_SHIFT_CF5) - +#define DQ_UCM_ETH_PMD_TX_ARM_CMD BIT(DQ_UCM_AGG_FLG_SHIFT_CF4) +#define DQ_UCM_ETH_PMD_RX_ARM_CMD BIT(DQ_UCM_AGG_FLG_SHIFT_CF5) +#define DQ_UCM_ROCE_CQ_ARM_SE_CF_CMD BIT(DQ_UCM_AGG_FLG_SHIFT_CF4) +#define DQ_UCM_ROCE_CQ_ARM_CF_CMD BIT(DQ_UCM_AGG_FLG_SHIFT_CF5) + +/* TCM agg counter flag selection (HW) */ +#define DQ_TCM_AGG_FLG_SHIFT_CF0 0 +#define DQ_TCM_AGG_FLG_SHIFT_CF1 1 +#define DQ_TCM_AGG_FLG_SHIFT_CF2 2 +#define DQ_TCM_AGG_FLG_SHIFT_CF3 3 +#define DQ_TCM_AGG_FLG_SHIFT_CF4 4 +#define DQ_TCM_AGG_FLG_SHIFT_CF5 5 +#define DQ_TCM_AGG_FLG_SHIFT_CF6 6 +#define DQ_TCM_AGG_FLG_SHIFT_CF7 7 +/* TCM agg counter flag selection (FW) */ +#define DQ_TCM_ISCSI_FLUSH_Q0_CMD BIT(DQ_TCM_AGG_FLG_SHIFT_CF1) +#define DQ_TCM_ISCSI_TIMER_STOP_ALL_CMD BIT(DQ_TCM_AGG_FLG_SHIFT_CF3) + +/* PWM address mapping */ +#define DQ_PWM_OFFSET_DPM_BASE 0x0 +#define DQ_PWM_OFFSET_DPM_END 0x27 +#define DQ_PWM_OFFSET_XCM16_BASE 0x40 +#define DQ_PWM_OFFSET_XCM32_BASE 0x44 +#define DQ_PWM_OFFSET_UCM16_BASE 0x48 +#define DQ_PWM_OFFSET_UCM32_BASE 0x4C +#define DQ_PWM_OFFSET_UCM16_4 0x50 +#define DQ_PWM_OFFSET_TCM16_BASE 0x58 +#define DQ_PWM_OFFSET_TCM32_BASE 0x5C +#define DQ_PWM_OFFSET_XCM_FLAGS 0x68 +#define DQ_PWM_OFFSET_UCM_FLAGS 0x69 +#define 
DQ_PWM_OFFSET_TCM_FLAGS 0x6B + +#define DQ_PWM_OFFSET_XCM_RDMA_SQ_PROD (DQ_PWM_OFFSET_XCM16_BASE + 2) +#define DQ_PWM_OFFSET_UCM_RDMA_CQ_CONS_32BIT (DQ_PWM_OFFSET_UCM32_BASE) +#define DQ_PWM_OFFSET_UCM_RDMA_CQ_CONS_16BIT (DQ_PWM_OFFSET_UCM16_4) +#define DQ_PWM_OFFSET_UCM_RDMA_INT_TIMEOUT (DQ_PWM_OFFSET_UCM16_BASE + 2) +#define DQ_PWM_OFFSET_UCM_RDMA_ARM_FLAGS (DQ_PWM_OFFSET_UCM_FLAGS) +#define DQ_PWM_OFFSET_TCM_ROCE_RQ_PROD (DQ_PWM_OFFSET_TCM16_BASE + 1) +#define DQ_PWM_OFFSET_TCM_IWARP_RQ_PROD (DQ_PWM_OFFSET_TCM16_BASE + 3) #define DQ_REGION_SHIFT (12) /* DPM */ @@ -214,15 +326,17 @@ */ #define CM_TX_PQ_BASE 0x200 +/* number of global Vport/QCN rate limiters */ +#define MAX_QM_GLOBAL_RLS 256 /* QM registers data */ #define QM_LINE_CRD_REG_WIDTH 16 -#define QM_LINE_CRD_REG_SIGN_BIT (1 << (QM_LINE_CRD_REG_WIDTH - 1)) +#define QM_LINE_CRD_REG_SIGN_BIT BIT((QM_LINE_CRD_REG_WIDTH - 1)) #define QM_BYTE_CRD_REG_WIDTH 24 -#define QM_BYTE_CRD_REG_SIGN_BIT (1 << (QM_BYTE_CRD_REG_WIDTH - 1)) +#define QM_BYTE_CRD_REG_SIGN_BIT BIT((QM_BYTE_CRD_REG_WIDTH - 1)) #define QM_WFQ_CRD_REG_WIDTH 32 -#define QM_WFQ_CRD_REG_SIGN_BIT (1 << (QM_WFQ_CRD_REG_WIDTH - 1)) +#define QM_WFQ_CRD_REG_SIGN_BIT BIT((QM_WFQ_CRD_REG_WIDTH - 1)) #define QM_RL_CRD_REG_WIDTH 32 -#define QM_RL_CRD_REG_SIGN_BIT (1 << (QM_RL_CRD_REG_WIDTH - 1)) +#define QM_RL_CRD_REG_SIGN_BIT BIT((QM_RL_CRD_REG_WIDTH - 1)) /*****************/ /* CAU CONSTANTS */ @@ -287,6 +401,17 @@ /* PXP CONSTANTS */ /*****************/ +/* Bars for Blocks */ +#define PXP_BAR_GRC 0 +#define PXP_BAR_TSDM 0 +#define PXP_BAR_USDM 0 +#define PXP_BAR_XSDM 0 +#define PXP_BAR_MSDM 0 +#define PXP_BAR_YSDM 0 +#define PXP_BAR_PSDM 0 +#define PXP_BAR_IGU 0 +#define PXP_BAR_DQ 1 + /* PTT and GTT */ #define PXP_NUM_PF_WINDOWS 12 #define PXP_PER_PF_ENTRY_SIZE 8 @@ -334,6 +459,52 @@ (PXP_EXTERNAL_BAR_GLOBAL_WINDOW_START + \ PXP_EXTERNAL_BAR_GLOBAL_WINDOW_LENGTH - 1) +/* PF BAR */ +#define PXP_BAR0_START_GRC 0x0000 +#define PXP_BAR0_GRC_LENGTH 0x1C00000 
+#define PXP_BAR0_END_GRC (PXP_BAR0_START_GRC + \ + PXP_BAR0_GRC_LENGTH - 1) + +#define PXP_BAR0_START_IGU 0x1C00000 +#define PXP_BAR0_IGU_LENGTH 0x10000 +#define PXP_BAR0_END_IGU (PXP_BAR0_START_IGU + \ + PXP_BAR0_IGU_LENGTH - 1) + +#define PXP_BAR0_START_TSDM 0x1C80000 +#define PXP_BAR0_SDM_LENGTH 0x40000 +#define PXP_BAR0_SDM_RESERVED_LENGTH 0x40000 +#define PXP_BAR0_END_TSDM (PXP_BAR0_START_TSDM + \ + PXP_BAR0_SDM_LENGTH - 1) + +#define PXP_BAR0_START_MSDM 0x1D00000 +#define PXP_BAR0_END_MSDM (PXP_BAR0_START_MSDM + \ + PXP_BAR0_SDM_LENGTH - 1) + +#define PXP_BAR0_START_USDM 0x1D80000 +#define PXP_BAR0_END_USDM (PXP_BAR0_START_USDM + \ + PXP_BAR0_SDM_LENGTH - 1) + +#define PXP_BAR0_START_XSDM 0x1E00000 +#define PXP_BAR0_END_XSDM (PXP_BAR0_START_XSDM + \ + PXP_BAR0_SDM_LENGTH - 1) + +#define PXP_BAR0_START_YSDM 0x1E80000 +#define PXP_BAR0_END_YSDM (PXP_BAR0_START_YSDM + \ + PXP_BAR0_SDM_LENGTH - 1) + +#define PXP_BAR0_START_PSDM 0x1F00000 +#define PXP_BAR0_END_PSDM (PXP_BAR0_START_PSDM + \ + PXP_BAR0_SDM_LENGTH - 1) + +#define PXP_BAR0_FIRST_INVALID_ADDRESS (PXP_BAR0_END_PSDM + 1) + +/* VF BAR */ +#define PXP_VF_BAR0 0 + +#define PXP_VF_BAR0_START_GRC 0x3E00 +#define PXP_VF_BAR0_GRC_LENGTH 0x200 +#define PXP_VF_BAR0_END_GRC (PXP_VF_BAR0_START_GRC + \ + PXP_VF_BAR0_GRC_LENGTH - 1) #define PXP_VF_BAR0_START_IGU 0 #define PXP_VF_BAR0_IGU_LENGTH 0x3000 @@ -399,6 +570,20 @@ #define PXP_NUM_ILT_RECORDS_BB 7600 #define PXP_NUM_ILT_RECORDS_K2 11000 #define MAX_NUM_ILT_RECORDS MAX(PXP_NUM_ILT_RECORDS_BB, PXP_NUM_ILT_RECORDS_K2) +#define PXP_QUEUES_ZONE_MAX_NUM 320 +/*****************/ +/* PRM CONSTANTS */ +/*****************/ +#define PRM_DMA_PAD_BYTES_NUM 2 +/******************/ +/* SDMs CONSTANTS */ +/******************/ +#define SDM_OP_GEN_TRIG_NONE 0 +#define SDM_OP_GEN_TRIG_WAKE_THREAD 1 +#define SDM_OP_GEN_TRIG_AGG_INT 2 +#define SDM_OP_GEN_TRIG_LOADER 4 +#define SDM_OP_GEN_TRIG_INDICATE_ERROR 6 +#define SDM_OP_GEN_TRIG_RELEASE_THREAD 7 #define SDM_COMP_TYPE_NONE 0 
#define SDM_COMP_TYPE_WAKE_THREAD 1 @@ -424,6 +609,8 @@ /* PRS CONSTANTS */ /*****************/ +#define PRS_GFT_CAM_LINES_NO_MATCH 31 + /* Async data KCQ CQE */ struct async_data { __le32 cid; @@ -440,20 +627,6 @@ struct coalescing_timeset { #define COALESCING_TIMESET_VALID_SHIFT 7 }; -struct common_prs_pf_msg_info { - __le32 value; -#define COMMON_PRS_PF_MSG_INFO_NPAR_DEFAULT_PF_MASK 0x1 -#define COMMON_PRS_PF_MSG_INFO_NPAR_DEFAULT_PF_SHIFT 0 -#define COMMON_PRS_PF_MSG_INFO_FW_DEBUG_1_MASK 0x1 -#define COMMON_PRS_PF_MSG_INFO_FW_DEBUG_1_SHIFT 1 -#define COMMON_PRS_PF_MSG_INFO_FW_DEBUG_2_MASK 0x1 -#define COMMON_PRS_PF_MSG_INFO_FW_DEBUG_2_SHIFT 2 -#define COMMON_PRS_PF_MSG_INFO_FW_DEBUG_3_MASK 0x1 -#define COMMON_PRS_PF_MSG_INFO_FW_DEBUG_3_SHIFT 3 -#define COMMON_PRS_PF_MSG_INFO_RESERVED_MASK 0xFFFFFFF -#define COMMON_PRS_PF_MSG_INFO_RESERVED_SHIFT 4 -}; - struct common_queue_zone { __le16 ring_drv_data_consumer; __le16 reserved; @@ -473,6 +646,19 @@ struct vf_pf_channel_eqe_data { struct regpair msg_addr; }; +struct iscsi_eqe_data { + __le32 cid; + __le16 conn_id; + u8 error_code; + u8 error_pdu_opcode_reserved; +#define ISCSI_EQE_DATA_ERROR_PDU_OPCODE_MASK 0x3F +#define ISCSI_EQE_DATA_ERROR_PDU_OPCODE_SHIFT 0 +#define ISCSI_EQE_DATA_ERROR_PDU_OPCODE_VALID_MASK 0x1 +#define ISCSI_EQE_DATA_ERROR_PDU_OPCODE_VALID_SHIFT 6 +#define ISCSI_EQE_DATA_RESERVED0_MASK 0x1 +#define ISCSI_EQE_DATA_RESERVED0_SHIFT 7 +}; + struct malicious_vf_eqe_data { u8 vf_id; u8 err_id; @@ -488,6 +674,7 @@ struct initial_cleanup_eqe_data { union event_ring_data { u8 bytes[8]; struct vf_pf_channel_eqe_data vf_pf_channel; + struct iscsi_eqe_data iscsi_info; struct malicious_vf_eqe_data malicious_vf; struct initial_cleanup_eqe_data vf_init_cleanup; }; @@ -616,6 +803,52 @@ enum db_dest { MAX_DB_DEST }; +/* Enum of doorbell DPM types */ +enum db_dpm_type { + DPM_LEGACY, + DPM_ROCE, + DPM_L2_INLINE, + DPM_L2_BD, + MAX_DB_DPM_TYPE +}; + +/* Structure for doorbell data, in L2 DPM mode, for 1st db in 
a DPM burst */ +struct db_l2_dpm_data { + __le16 icid; + __le16 bd_prod; + __le32 params; +#define DB_L2_DPM_DATA_SIZE_MASK 0x3F +#define DB_L2_DPM_DATA_SIZE_SHIFT 0 +#define DB_L2_DPM_DATA_DPM_TYPE_MASK 0x3 +#define DB_L2_DPM_DATA_DPM_TYPE_SHIFT 6 +#define DB_L2_DPM_DATA_NUM_BDS_MASK 0xFF +#define DB_L2_DPM_DATA_NUM_BDS_SHIFT 8 +#define DB_L2_DPM_DATA_PKT_SIZE_MASK 0x7FF +#define DB_L2_DPM_DATA_PKT_SIZE_SHIFT 16 +#define DB_L2_DPM_DATA_RESERVED0_MASK 0x1 +#define DB_L2_DPM_DATA_RESERVED0_SHIFT 27 +#define DB_L2_DPM_DATA_SGE_NUM_MASK 0x7 +#define DB_L2_DPM_DATA_SGE_NUM_SHIFT 28 +#define DB_L2_DPM_DATA_RESERVED1_MASK 0x1 +#define DB_L2_DPM_DATA_RESERVED1_SHIFT 31 +}; + +/* Structure for SGE in a DPM doorbell of type DPM_L2_BD */ +struct db_l2_dpm_sge { + struct regpair addr; + __le16 nbytes; + __le16 bitfields; +#define DB_L2_DPM_SGE_TPH_ST_INDEX_MASK 0x1FF +#define DB_L2_DPM_SGE_TPH_ST_INDEX_SHIFT 0 +#define DB_L2_DPM_SGE_RESERVED0_MASK 0x3 +#define DB_L2_DPM_SGE_RESERVED0_SHIFT 9 +#define DB_L2_DPM_SGE_ST_VALID_MASK 0x1 +#define DB_L2_DPM_SGE_ST_VALID_SHIFT 11 +#define DB_L2_DPM_SGE_RESERVED1_MASK 0xF +#define DB_L2_DPM_SGE_RESERVED1_SHIFT 12 + __le32 reserved2; +}; + /* Structure for doorbell address, in legacy mode */ struct db_legacy_addr { __le32 addr; @@ -627,6 +860,49 @@ struct db_legacy_addr { #define DB_LEGACY_ADDR_ICID_SHIFT 5 }; +/* Structure for doorbell address, in PWM mode */ +struct db_pwm_addr { + __le32 addr; +#define DB_PWM_ADDR_RESERVED0_MASK 0x7 +#define DB_PWM_ADDR_RESERVED0_SHIFT 0 +#define DB_PWM_ADDR_OFFSET_MASK 0x7F +#define DB_PWM_ADDR_OFFSET_SHIFT 3 +#define DB_PWM_ADDR_WID_MASK 0x3 +#define DB_PWM_ADDR_WID_SHIFT 10 +#define DB_PWM_ADDR_DPI_MASK 0xFFFF +#define DB_PWM_ADDR_DPI_SHIFT 12 +#define DB_PWM_ADDR_RESERVED1_MASK 0xF +#define DB_PWM_ADDR_RESERVED1_SHIFT 28 +}; + +/* Parameters to RoCE firmware, passed in EDPM doorbell */ +struct db_roce_dpm_params { + __le32 params; +#define DB_ROCE_DPM_PARAMS_SIZE_MASK 0x3F +#define 
DB_ROCE_DPM_PARAMS_SIZE_SHIFT 0 +#define DB_ROCE_DPM_PARAMS_DPM_TYPE_MASK 0x3 +#define DB_ROCE_DPM_PARAMS_DPM_TYPE_SHIFT 6 +#define DB_ROCE_DPM_PARAMS_OPCODE_MASK 0xFF +#define DB_ROCE_DPM_PARAMS_OPCODE_SHIFT 8 +#define DB_ROCE_DPM_PARAMS_WQE_SIZE_MASK 0x7FF +#define DB_ROCE_DPM_PARAMS_WQE_SIZE_SHIFT 16 +#define DB_ROCE_DPM_PARAMS_RESERVED0_MASK 0x1 +#define DB_ROCE_DPM_PARAMS_RESERVED0_SHIFT 27 +#define DB_ROCE_DPM_PARAMS_COMPLETION_FLG_MASK 0x1 +#define DB_ROCE_DPM_PARAMS_COMPLETION_FLG_SHIFT 28 +#define DB_ROCE_DPM_PARAMS_S_FLG_MASK 0x1 +#define DB_ROCE_DPM_PARAMS_S_FLG_SHIFT 29 +#define DB_ROCE_DPM_PARAMS_RESERVED1_MASK 0x3 +#define DB_ROCE_DPM_PARAMS_RESERVED1_SHIFT 30 +}; + +/* Structure for doorbell data, in ROCE DPM mode, for 1st db in a DPM burst */ +struct db_roce_dpm_data { + __le16 icid; + __le16 prod_val; + struct db_roce_dpm_params params; +}; + /* Igu interrupt command */ enum igu_int_cmd { IGU_INT_ENABLE = 0, @@ -764,6 +1040,19 @@ struct pxp_ptt_entry { struct pxp_pretend_cmd pretend; }; +/* VF Zone A Permission Register. 
*/ +struct pxp_vf_zone_a_permission { + __le32 control; +#define PXP_VF_ZONE_A_PERMISSION_VFID_MASK 0xFF +#define PXP_VF_ZONE_A_PERMISSION_VFID_SHIFT 0 +#define PXP_VF_ZONE_A_PERMISSION_VALID_MASK 0x1 +#define PXP_VF_ZONE_A_PERMISSION_VALID_SHIFT 8 +#define PXP_VF_ZONE_A_PERMISSION_RESERVED0_MASK 0x7F +#define PXP_VF_ZONE_A_PERMISSION_RESERVED0_SHIFT 9 +#define PXP_VF_ZONE_A_PERMISSION_RESERVED1_MASK 0xFFFF +#define PXP_VF_ZONE_A_PERMISSION_RESERVED1_SHIFT 16 +}; + /* RSS hash type */ struct rdif_task_context { __le32 initial_ref_tag; @@ -831,6 +1120,7 @@ struct rdif_task_context { __le32 reserved2; }; +/* RSS hash type */ enum rss_hash_type { RSS_HASH_TYPE_DEFAULT = 0, RSS_HASH_TYPE_IPV4 = 1, @@ -942,7 +1232,7 @@ struct tdif_task_context { }; struct timers_context { - __le32 logical_client0; + __le32 logical_client_0; #define TIMERS_CONTEXT_EXPIRATIONTIMELC0_MASK 0xFFFFFFF #define TIMERS_CONTEXT_EXPIRATIONTIMELC0_SHIFT 0 #define TIMERS_CONTEXT_VALIDLC0_MASK 0x1 @@ -951,7 +1241,7 @@ struct timers_context { #define TIMERS_CONTEXT_ACTIVELC0_SHIFT 29 #define TIMERS_CONTEXT_RESERVED0_MASK 0x3 #define TIMERS_CONTEXT_RESERVED0_SHIFT 30 - __le32 logical_client1; + __le32 logical_client_1; #define TIMERS_CONTEXT_EXPIRATIONTIMELC1_MASK 0xFFFFFFF #define TIMERS_CONTEXT_EXPIRATIONTIMELC1_SHIFT 0 #define TIMERS_CONTEXT_VALIDLC1_MASK 0x1 @@ -960,7 +1250,7 @@ struct timers_context { #define TIMERS_CONTEXT_ACTIVELC1_SHIFT 29 #define TIMERS_CONTEXT_RESERVED1_MASK 0x3 #define TIMERS_CONTEXT_RESERVED1_SHIFT 30 - __le32 logical_client2; + __le32 logical_client_2; #define TIMERS_CONTEXT_EXPIRATIONTIMELC2_MASK 0xFFFFFFF #define TIMERS_CONTEXT_EXPIRATIONTIMELC2_SHIFT 0 #define TIMERS_CONTEXT_VALIDLC2_MASK 0x1 @@ -978,3 +1268,4 @@ struct timers_context { #define TIMERS_CONTEXT_RESERVED3_SHIFT 29 }; #endif /* __COMMON_HSI__ */ +#endif diff --git a/include/linux/qed/eth_common.h b/include/linux/qed/eth_common.h index b5ebc697d05f..1aa0727c4136 100644 --- a/include/linux/qed/eth_common.h 
+++ b/include/linux/qed/eth_common.h @@ -13,9 +13,12 @@ /* ETH FW CONSTANTS */ /********************/ #define ETH_HSI_VER_MAJOR 3 -#define ETH_HSI_VER_MINOR 0 -#define ETH_CACHE_LINE_SIZE 64 +#define ETH_HSI_VER_MINOR 10 + +#define ETH_HSI_VER_NO_PKT_LEN_TUNN 5 +#define ETH_CACHE_LINE_SIZE 64 +#define ETH_RX_CQE_GAP 32 #define ETH_MAX_RAMROD_PER_CON 8 #define ETH_TX_BD_PAGE_SIZE_BYTES 4096 #define ETH_RX_BD_PAGE_SIZE_BYTES 4096 @@ -24,15 +27,25 @@ #define ETH_TX_MIN_BDS_PER_NON_LSO_PKT 1 #define ETH_TX_MAX_BDS_PER_NON_LSO_PACKET 18 +#define ETH_TX_MAX_BDS_PER_LSO_PACKET 255 #define ETH_TX_MAX_LSO_HDR_NBD 4 #define ETH_TX_MIN_BDS_PER_LSO_PKT 3 #define ETH_TX_MIN_BDS_PER_TUNN_IPV6_WITH_EXT_PKT 3 #define ETH_TX_MIN_BDS_PER_IPV6_WITH_EXT_PKT 2 #define ETH_TX_MIN_BDS_PER_PKT_W_LOOPBACK_MODE 2 -#define ETH_TX_MAX_NON_LSO_PKT_LEN (9700 - (4 + 12 + 8)) +#define ETH_TX_MAX_NON_LSO_PKT_LEN (9700 - (4 + 4 + 12 + 8)) #define ETH_TX_MAX_LSO_HDR_BYTES 510 +#define ETH_TX_LSO_WINDOW_BDS_NUM (18 - 1) +#define ETH_TX_LSO_WINDOW_MIN_LEN 9700 +#define ETH_TX_MAX_LSO_PAYLOAD_LEN 0xFE000 +#define ETH_TX_NUM_SAME_AS_LAST_ENTRIES 320 +#define ETH_TX_INACTIVE_SAME_AS_LAST 0xFFFF #define ETH_NUM_STATISTIC_COUNTERS MAX_NUM_VPORTS +#define ETH_NUM_STATISTIC_COUNTERS_DOUBLE_VF_ZONE \ + (ETH_NUM_STATISTIC_COUNTERS - MAX_NUM_VFS / 2) +#define ETH_NUM_STATISTIC_COUNTERS_QUAD_VF_ZONE \ + (ETH_NUM_STATISTIC_COUNTERS - 3 * MAX_NUM_VFS / 4) /* Maximum number of buffers, used for RX packet placement */ #define ETH_RX_MAX_BUFF_PER_PKT 5 @@ -59,6 +72,8 @@ #define ETH_TPA_CQE_CONT_LEN_LIST_SIZE 6 #define ETH_TPA_CQE_END_LEN_LIST_SIZE 4 +/* Control frame check constants */ +#define ETH_CTL_FRAME_ETH_TYPE_NUM 4 struct eth_tx_1st_bd_flags { u8 bitfields; @@ -82,10 +97,10 @@ struct eth_tx_1st_bd_flags { /* The parsing information data fo rthe first tx bd of a given packet. 
*/ struct eth_tx_data_1st_bd { - __le16 vlan; - u8 nbds; - struct eth_tx_1st_bd_flags bd_flags; - __le16 bitfields; + __le16 vlan; + u8 nbds; + struct eth_tx_1st_bd_flags bd_flags; + __le16 bitfields; #define ETH_TX_DATA_1ST_BD_TUNN_FLAG_MASK 0x1 #define ETH_TX_DATA_1ST_BD_TUNN_FLAG_SHIFT 0 #define ETH_TX_DATA_1ST_BD_RESERVED0_MASK 0x1 @@ -96,7 +111,7 @@ struct eth_tx_data_1st_bd { /* The parsing information data for the second tx bd of a given packet. */ struct eth_tx_data_2nd_bd { - __le16 tunn_ip_size; + __le16 tunn_ip_size; __le16 bitfields1; #define ETH_TX_DATA_2ND_BD_TUNN_INNER_L2_HDR_SIZE_W_MASK 0xF #define ETH_TX_DATA_2ND_BD_TUNN_INNER_L2_HDR_SIZE_W_SHIFT 0 @@ -125,9 +140,14 @@ struct eth_tx_data_2nd_bd { #define ETH_TX_DATA_2ND_BD_RESERVED0_SHIFT 13 }; +/* Firmware data for L2-EDPM packet. */ +struct eth_edpm_fw_data { + struct eth_tx_data_1st_bd data_1st_bd; + struct eth_tx_data_2nd_bd data_2nd_bd; + __le32 reserved; +}; + struct eth_fast_path_cqe_fw_debug { - u8 reserved0; - u8 reserved1; __le16 reserved2; }; @@ -148,6 +168,17 @@ struct eth_tunnel_parsing_flags { #define ETH_TUNNEL_PARSING_FLAGS_IPV4_OPTIONS_SHIFT 7 }; +/* PMD flow control bits */ +struct eth_pmd_flow_flags { + u8 flags; +#define ETH_PMD_FLOW_FLAGS_VALID_MASK 0x1 +#define ETH_PMD_FLOW_FLAGS_VALID_SHIFT 0 +#define ETH_PMD_FLOW_FLAGS_TOGGLE_MASK 0x1 +#define ETH_PMD_FLOW_FLAGS_TOGGLE_SHIFT 1 +#define ETH_PMD_FLOW_FLAGS_RESERVED_MASK 0x3F +#define ETH_PMD_FLOW_FLAGS_RESERVED_SHIFT 2 +}; + /* Regular ETH Rx FP CQE. 
*/ struct eth_fast_path_rx_reg_cqe { u8 type; @@ -166,64 +197,63 @@ struct eth_fast_path_rx_reg_cqe { u8 placement_offset; struct eth_tunnel_parsing_flags tunnel_pars_flags; u8 bd_num; - u8 reserved[7]; + u8 reserved[9]; struct eth_fast_path_cqe_fw_debug fw_debug; u8 reserved1[3]; - u8 flags; -#define ETH_FAST_PATH_RX_REG_CQE_VALID_MASK 0x1 -#define ETH_FAST_PATH_RX_REG_CQE_VALID_SHIFT 0 -#define ETH_FAST_PATH_RX_REG_CQE_VALID_TOGGLE_MASK 0x1 -#define ETH_FAST_PATH_RX_REG_CQE_VALID_TOGGLE_SHIFT 1 -#define ETH_FAST_PATH_RX_REG_CQE_RESERVED2_MASK 0x3F -#define ETH_FAST_PATH_RX_REG_CQE_RESERVED2_SHIFT 2 + struct eth_pmd_flow_flags pmd_flags; }; /* TPA-continue ETH Rx FP CQE. */ struct eth_fast_path_rx_tpa_cont_cqe { - u8 type; - u8 tpa_agg_index; - __le16 len_list[ETH_TPA_CQE_CONT_LEN_LIST_SIZE]; - u8 reserved[5]; - u8 reserved1; - __le16 reserved2[ETH_TPA_CQE_CONT_LEN_LIST_SIZE]; + u8 type; + u8 tpa_agg_index; + __le16 len_list[ETH_TPA_CQE_CONT_LEN_LIST_SIZE]; + u8 reserved; + u8 reserved1; + __le16 reserved2[ETH_TPA_CQE_CONT_LEN_LIST_SIZE]; + u8 reserved3[3]; + struct eth_pmd_flow_flags pmd_flags; }; /* TPA-end ETH Rx FP CQE. */ struct eth_fast_path_rx_tpa_end_cqe { - u8 type; - u8 tpa_agg_index; - __le16 total_packet_len; - u8 num_of_bds; - u8 end_reason; - __le16 num_of_coalesced_segs; - __le32 ts_delta; - __le16 len_list[ETH_TPA_CQE_END_LEN_LIST_SIZE]; - u8 reserved1[3]; - u8 reserved2; - __le16 reserved3[ETH_TPA_CQE_END_LEN_LIST_SIZE]; + u8 type; + u8 tpa_agg_index; + __le16 total_packet_len; + u8 num_of_bds; + u8 end_reason; + __le16 num_of_coalesced_segs; + __le32 ts_delta; + __le16 len_list[ETH_TPA_CQE_END_LEN_LIST_SIZE]; + __le16 reserved3[ETH_TPA_CQE_END_LEN_LIST_SIZE]; + __le16 reserved1; + u8 reserved2; + struct eth_pmd_flow_flags pmd_flags; }; /* TPA-start ETH Rx FP CQE. 
*/ struct eth_fast_path_rx_tpa_start_cqe { - u8 type; - u8 bitfields; + u8 type; + u8 bitfields; #define ETH_FAST_PATH_RX_TPA_START_CQE_RSS_HASH_TYPE_MASK 0x7 #define ETH_FAST_PATH_RX_TPA_START_CQE_RSS_HASH_TYPE_SHIFT 0 #define ETH_FAST_PATH_RX_TPA_START_CQE_TC_MASK 0xF #define ETH_FAST_PATH_RX_TPA_START_CQE_TC_SHIFT 3 #define ETH_FAST_PATH_RX_TPA_START_CQE_RESERVED0_MASK 0x1 #define ETH_FAST_PATH_RX_TPA_START_CQE_RESERVED0_SHIFT 7 - __le16 seg_len; + __le16 seg_len; struct parsing_and_err_flags pars_flags; - __le16 vlan_tag; - __le32 rss_hash; - __le16 len_on_first_bd; - u8 placement_offset; + __le16 vlan_tag; + __le32 rss_hash; + __le16 len_on_first_bd; + u8 placement_offset; struct eth_tunnel_parsing_flags tunnel_pars_flags; - u8 tpa_agg_index; - u8 header_len; - __le16 ext_bd_len_list[ETH_TPA_CQE_START_LEN_LIST_SIZE]; + u8 tpa_agg_index; + u8 header_len; + __le16 ext_bd_len_list[ETH_TPA_CQE_START_LEN_LIST_SIZE]; struct eth_fast_path_cqe_fw_debug fw_debug; + u8 reserved; + struct eth_pmd_flow_flags pmd_flags; }; /* The L4 pseudo checksum mode for Ethernet */ @@ -245,15 +275,7 @@ struct eth_slow_path_rx_cqe { u8 reserved[25]; __le16 echo; u8 reserved1; - u8 flags; -/* for PMD mode - valid indication */ -#define ETH_SLOW_PATH_RX_CQE_VALID_MASK 0x1 -#define ETH_SLOW_PATH_RX_CQE_VALID_SHIFT 0 -/* for PMD mode - valid toggle indication */ -#define ETH_SLOW_PATH_RX_CQE_VALID_TOGGLE_MASK 0x1 -#define ETH_SLOW_PATH_RX_CQE_VALID_TOGGLE_SHIFT 1 -#define ETH_SLOW_PATH_RX_CQE_RESERVED2_MASK 0x3F -#define ETH_SLOW_PATH_RX_CQE_RESERVED2_SHIFT 2 + struct eth_pmd_flow_flags pmd_flags; }; /* union for all ETH Rx CQE types */ @@ -276,6 +298,11 @@ enum eth_rx_cqe_type { MAX_ETH_RX_CQE_TYPE }; +struct eth_rx_pmd_cqe { + union eth_rx_cqe cqe; + u8 reserved[ETH_RX_CQE_GAP]; +}; + enum eth_rx_tunn_type { ETH_RX_NO_TUNN, ETH_RX_TUNN_GENEVE, @@ -313,8 +340,8 @@ struct eth_tx_2nd_bd { /* The parsing information data for the third tx bd of a given packet. 
*/ struct eth_tx_data_3rd_bd { - __le16 lso_mss; - __le16 bitfields; + __le16 lso_mss; + __le16 bitfields; #define ETH_TX_DATA_3RD_BD_TCP_HDR_LEN_DW_MASK 0xF #define ETH_TX_DATA_3RD_BD_TCP_HDR_LEN_DW_SHIFT 0 #define ETH_TX_DATA_3RD_BD_HDR_NBD_MASK 0xF @@ -323,8 +350,8 @@ struct eth_tx_data_3rd_bd { #define ETH_TX_DATA_3RD_BD_START_BD_SHIFT 8 #define ETH_TX_DATA_3RD_BD_RESERVED0_MASK 0x7F #define ETH_TX_DATA_3RD_BD_RESERVED0_SHIFT 9 - u8 tunn_l4_hdr_start_offset_w; - u8 tunn_hdr_size_w; + u8 tunn_l4_hdr_start_offset_w; + u8 tunn_hdr_size_w; }; /* The third tx bd of a given packet */ @@ -355,10 +382,10 @@ struct eth_tx_bd { }; union eth_tx_bd_types { - struct eth_tx_1st_bd first_bd; - struct eth_tx_2nd_bd second_bd; - struct eth_tx_3rd_bd third_bd; - struct eth_tx_bd reg_bd; + struct eth_tx_1st_bd first_bd; + struct eth_tx_2nd_bd second_bd; + struct eth_tx_3rd_bd third_bd; + struct eth_tx_bd reg_bd; }; /* Mstorm Queue Zone */ @@ -389,8 +416,8 @@ struct eth_db_data { #define ETH_DB_DATA_RESERVED_SHIFT 5 #define ETH_DB_DATA_AGG_VAL_SEL_MASK 0x3 #define ETH_DB_DATA_AGG_VAL_SEL_SHIFT 6 - u8 agg_flags; - __le16 bd_prod; + u8 agg_flags; + __le16 bd_prod; }; #endif /* __ETH_COMMON__ */ diff --git a/include/linux/qed/iscsi_common.h b/include/linux/qed/iscsi_common.h index b3c0feb15ae9..8f64b1223c2f 100644 --- a/include/linux/qed/iscsi_common.h +++ b/include/linux/qed/iscsi_common.h @@ -311,7 +311,7 @@ struct iscsi_login_req_hdr { #define ISCSI_LOGIN_REQ_HDR_DATA_SEG_LEN_SHIFT 0 #define ISCSI_LOGIN_REQ_HDR_TOTAL_AHS_LEN_MASK 0xFF #define ISCSI_LOGIN_REQ_HDR_TOTAL_AHS_LEN_SHIFT 24 - __le32 isid_TABC; + __le32 isid_tabc; __le16 tsih; __le16 isid_d; __le32 itt; @@ -464,7 +464,7 @@ struct iscsi_login_response_hdr { #define ISCSI_LOGIN_RESPONSE_HDR_DATA_SEG_LEN_SHIFT 0 #define ISCSI_LOGIN_RESPONSE_HDR_TOTAL_AHS_LEN_MASK 0xFF #define ISCSI_LOGIN_RESPONSE_HDR_TOTAL_AHS_LEN_SHIFT 24 - __le32 isid_TABC; + __le32 isid_tabc; __le16 tsih; __le16 isid_d; __le32 itt; @@ -688,8 +688,7 @@ 
union iscsi_cqe { enum iscsi_cqes_type { ISCSI_CQE_TYPE_SOLICITED = 1, ISCSI_CQE_TYPE_UNSOLICITED, - ISCSI_CQE_TYPE_SOLICITED_WITH_SENSE - , + ISCSI_CQE_TYPE_SOLICITED_WITH_SENSE, ISCSI_CQE_TYPE_TASK_CLEANUP, ISCSI_CQE_TYPE_DUMMY, MAX_ISCSI_CQES_TYPE @@ -769,9 +768,9 @@ enum iscsi_eqe_opcode { ISCSI_EVENT_TYPE_UPDATE_CONN, ISCSI_EVENT_TYPE_CLEAR_SQ, ISCSI_EVENT_TYPE_TERMINATE_CONN, + ISCSI_EVENT_TYPE_MAC_UPDATE_CONN, ISCSI_EVENT_TYPE_ASYN_CONNECT_COMPLETE, ISCSI_EVENT_TYPE_ASYN_TERMINATE_DONE, - RESERVED8, RESERVED9, ISCSI_EVENT_TYPE_START_OF_ERROR_TYPES = 10, ISCSI_EVENT_TYPE_ASYN_ABORT_RCVD, @@ -867,6 +866,7 @@ enum iscsi_ramrod_cmd_id { ISCSI_RAMROD_CMD_ID_UPDATE_CONN = 4, ISCSI_RAMROD_CMD_ID_TERMINATION_CONN = 5, ISCSI_RAMROD_CMD_ID_CLEAR_SQ = 6, + ISCSI_RAMROD_CMD_ID_MAC_UPDATE = 7, MAX_ISCSI_RAMROD_CMD_ID }; @@ -883,6 +883,16 @@ union iscsi_seq_num { __le16 r2t_sn; }; +struct iscsi_spe_conn_mac_update { + struct iscsi_slow_path_hdr hdr; + __le16 conn_id; + __le32 fw_cid; + __le16 remote_mac_addr_lo; + __le16 remote_mac_addr_mid; + __le16 remote_mac_addr_hi; + u8 reserved0[2]; +}; + struct iscsi_spe_conn_offload { struct iscsi_slow_path_hdr hdr; __le16 conn_id; @@ -1302,14 +1312,6 @@ struct mstorm_iscsi_stats_drv { struct regpair iscsi_rx_dropped_pdus_task_not_valid; }; -struct ooo_opaque { - __le32 cid; - u8 drop_isle; - u8 drop_size; - u8 ooo_opcode; - u8 ooo_isle; -}; - struct pstorm_iscsi_stats_drv { struct regpair iscsi_tx_bytes_cnt; struct regpair iscsi_tx_packet_cnt; diff --git a/include/linux/qed/qed_chain.h b/include/linux/qed/qed_chain.h index 7e441bdeabdc..72d88cf3ca25 100644 --- a/include/linux/qed/qed_chain.h +++ b/include/linux/qed/qed_chain.h @@ -16,19 +16,6 @@ #include #include -/* dma_addr_t manip */ -#define DMA_LO_LE(x) cpu_to_le32(lower_32_bits(x)) -#define DMA_HI_LE(x) cpu_to_le32(upper_32_bits(x)) -#define DMA_REGPAIR_LE(x, val) do { \ - (x).hi = DMA_HI_LE((val)); \ - (x).lo = DMA_LO_LE((val)); \ - } while (0) - -#define HILO_GEN(hi, lo, 
type) ((((type)(hi)) << 32) + (lo)) -#define HILO_64(hi, lo) HILO_GEN((le32_to_cpu(hi)), (le32_to_cpu(lo)), u64) -#define HILO_64_REGPAIR(regpair) (HILO_64(regpair.hi, regpair.lo)) -#define HILO_DMA_REGPAIR(regpair) ((dma_addr_t)HILO_64_REGPAIR(regpair)) - enum qed_chain_mode { /* Each Page contains a next pointer at its end */ QED_CHAIN_MODE_NEXT_PTR, diff --git a/include/linux/qed/tcp_common.h b/include/linux/qed/tcp_common.h index accba0e6b704..dc3889d1bbe6 100644 --- a/include/linux/qed/tcp_common.h +++ b/include/linux/qed/tcp_common.h @@ -11,6 +11,14 @@ #define TCP_INVALID_TIMEOUT_VAL -1 +struct ooo_opaque { + __le32 cid; + u8 drop_isle; + u8 drop_size; + u8 ooo_opcode; + u8 ooo_isle; +}; + enum tcp_connect_mode { TCP_CONNECT_ACTIVE, TCP_CONNECT_PASSIVE, @@ -18,14 +26,10 @@ enum tcp_connect_mode { }; struct tcp_init_params { - __le32 max_cwnd; - __le16 dup_ack_threshold; + __le32 two_msl_timer; __le16 tx_sws_timer; - __le16 min_rto; - __le16 min_rto_rt; - __le16 max_rto; u8 maxfinrt; - u8 reserved[1]; + u8 reserved[9]; }; enum tcp_ip_version { -- cgit v1.2.3 From d8c2c7e3404e5bcaeae4af78d6935e5b8fcc97ee Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Mon, 22 Aug 2016 13:25:11 +0300 Subject: qed*: Add support for VFs over legacy PFs Modern VFs can't run on old non-compatible PFs as the fastpath HSI is slightly changed - but as the HSI is actually very close [basically, a single bit whose meaning flipped] this can be supported with small modifications. The major differences would be in: - Recognizing that VF is running on top of a legacy PF. - Returning some slowpath configurations that are no longer needed on top of modern PFs, but would be required when working over the legacy ones. Signed-off-by: Yuval Mintz Signed-off-by: David S.
Miller --- include/linux/qed/qed_eth_if.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h index 4475a9d8ae15..33c24ebc9b7f 100644 --- a/include/linux/qed/qed_eth_if.h +++ b/include/linux/qed/qed_eth_if.h @@ -23,6 +23,9 @@ struct qed_dev_eth_info { u8 port_mac[ETH_ALEN]; u8 num_vlan_filters; + + /* Legacy VF - this affects the datapath, so qede has to know */ + bool is_legacy; }; struct qed_update_vport_rss_params { -- cgit v1.2.3 From f1ff8666ed87b0013e45ce2d335085407bb38a60 Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Tue, 23 Aug 2016 07:19:50 +0300 Subject: qed: Fix address macros Last FW submission reverted various macros into an older form, where they generate compilation warnings on some architectures. Bring back the newer macros instead. Fixes: 05fafbfb3d77 ("qed: utilize FW 8.10.10.0") Reported-by: kbuild test robot Signed-off-by: Yuval Mintz Signed-off-by: David S. Miller --- include/linux/qed/common_hsi.h | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/qed/common_hsi.h b/include/linux/qed/common_hsi.h index d306e0b55581..70b30e4d3cc4 100644 --- a/include/linux/qed/common_hsi.h +++ b/include/linux/qed/common_hsi.h @@ -13,23 +13,17 @@ #include /* dma_addr_t manip */ -#define DMA_LO(x) ((u32)(((dma_addr_t)(x)) & 0xffffffff)) -#define DMA_HI(x) ((u32)(((dma_addr_t)(x)) >> 32)) - -#define DMA_LO_LE(x) cpu_to_le32(DMA_LO(x)) -#define DMA_HI_LE(x) cpu_to_le32(DMA_HI(x)) - -/* It's assumed that whoever includes this has previously included an hsi - * file defining the regpair. 
- */ -#define DMA_REGPAIR_LE(x, val) (x).hi = DMA_HI_LE((val)); \ - (x).lo = DMA_LO_LE((val)) +#define DMA_LO_LE(x) cpu_to_le32(lower_32_bits(x)) +#define DMA_HI_LE(x) cpu_to_le32(upper_32_bits(x)) +#define DMA_REGPAIR_LE(x, val) do { \ + (x).hi = DMA_HI_LE((val)); \ + (x).lo = DMA_LO_LE((val)); \ + } while (0) #define HILO_GEN(hi, lo, type) ((((type)(hi)) << 32) + (lo)) -#define HILO_DMA(hi, lo) HILO_GEN(hi, lo, dma_addr_t) -#define HILO_64(hi, lo) HILO_GEN(hi, lo, u64) -#define HILO_DMA_REGPAIR(regpair) (HILO_DMA(regpair.hi, regpair.lo)) +#define HILO_64(hi, lo) HILO_GEN((le32_to_cpu(hi)), (le32_to_cpu(lo)), u64) #define HILO_64_REGPAIR(regpair) (HILO_64(regpair.hi, regpair.lo)) +#define HILO_DMA_REGPAIR(regpair) ((dma_addr_t)HILO_64_REGPAIR(regpair)) #ifndef __COMMON_HSI__ #define __COMMON_HSI__ -- cgit v1.2.3 From 30d1de08c87ddde6f73936c3350e7e153988fe02 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 23 Aug 2016 12:17:48 -0700 Subject: hv_netvsc: make inline functions static Several new functions were introduced into hyperv.h but only used in one file. Move them and let compiler decide on inline. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/hyperv.h | 84 -------------------------------------------------- 1 file changed, 84 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index b10954a66939..a6bc974def8f 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1422,88 +1422,4 @@ static inline bool hv_need_to_signal_on_read(struct hv_ring_buffer_info *rbi) return false; } -/* - * An API to support in-place processing of incoming VMBUS packets. 
- */ -#define VMBUS_PKT_TRAILER 8 - -static inline struct vmpacket_descriptor * -get_next_pkt_raw(struct vmbus_channel *channel) -{ - struct hv_ring_buffer_info *ring_info = &channel->inbound; - u32 read_loc = ring_info->priv_read_index; - void *ring_buffer = hv_get_ring_buffer(ring_info); - struct vmpacket_descriptor *cur_desc; - u32 packetlen; - u32 dsize = ring_info->ring_datasize; - u32 delta = read_loc - ring_info->ring_buffer->read_index; - u32 bytes_avail_toread = (hv_get_bytes_to_read(ring_info) - delta); - - if (bytes_avail_toread < sizeof(struct vmpacket_descriptor)) - return NULL; - - if ((read_loc + sizeof(*cur_desc)) > dsize) - return NULL; - - cur_desc = ring_buffer + read_loc; - packetlen = cur_desc->len8 << 3; - - /* - * If the packet under consideration is wrapping around, - * return failure. - */ - if ((read_loc + packetlen + VMBUS_PKT_TRAILER) > (dsize - 1)) - return NULL; - - return cur_desc; -} - -/* - * A helper function to step through packets "in-place" - * This API is to be called after each successful call - * get_next_pkt_raw(). - */ -static inline void put_pkt_raw(struct vmbus_channel *channel, - struct vmpacket_descriptor *desc) -{ - struct hv_ring_buffer_info *ring_info = &channel->inbound; - u32 read_loc = ring_info->priv_read_index; - u32 packetlen = desc->len8 << 3; - u32 dsize = ring_info->ring_datasize; - - if ((read_loc + packetlen + VMBUS_PKT_TRAILER) > dsize) - BUG(); - /* - * Include the packet trailer. - */ - ring_info->priv_read_index += packetlen + VMBUS_PKT_TRAILER; -} - -/* - * This call commits the read index and potentially signals the host. 
- * Here is the pattern for using the "in-place" consumption APIs: - * - * while (get_next_pkt_raw() { - * process the packet "in-place"; - * put_pkt_raw(); - * } - * if (packets processed in place) - * commit_rd_index(); - */ -static inline void commit_rd_index(struct vmbus_channel *channel) -{ - struct hv_ring_buffer_info *ring_info = &channel->inbound; - /* - * Make sure all reads are done before we update the read index since - * the writer may start writing to the read area once the read index - * is updated. - */ - virt_rmb(); - ring_info->ring_buffer->read_index = ring_info->priv_read_index; - - if (hv_need_to_signal_on_read(ring_info)) - vmbus_set_event(channel); -} - - #endif /* _HYPERV_H */ -- cgit v1.2.3 From e3f74b841d482e962b9f5a907eeb25eeeb09aa60 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 23 Aug 2016 12:17:56 -0700 Subject: hv_netvsc: report vmbus name in ethtool Make netvsc on vmbus behave more like PCI. Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- include/linux/hyperv.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index a6bc974def8f..b01c8c3dd531 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1114,6 +1114,13 @@ int __must_check __vmbus_driver_register(struct hv_driver *hv_driver, const char *mod_name); void vmbus_driver_unregister(struct hv_driver *hv_driver); +static inline const char *vmbus_dev_name(const struct hv_device *device_obj) +{ + const struct kobject *kobj = &device_obj->device.kobj; + + return kobj->name; +} + void vmbus_hvsock_device_unregister(struct vmbus_channel *channel); int vmbus_allocate_mmio(struct resource **new, struct hv_device *device_obj, -- cgit v1.2.3 From 5ca8cc5bf11faed257c762018aea9106d529232f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 24 Aug 2016 12:31:31 +0200 Subject: rhashtable: add rhashtable_lookup_get_insert_key() This patch modifies __rhashtable_insert_fast() so it returns the existing object that clashes with the one that you want to insert. In case the object is successfully inserted, NULL is returned. Otherwise, you get an error via ERR_PTR(). This patch adapts the existing callers of __rhashtable_insert_fast() so they handle this new logic, and it adds a new rhashtable_lookup_get_insert_key() interface to fetch this existing object. nf_tables needs this change to improve handling of EEXIST cases via honoring the NLM_F_EXCL flag and by checking if the data part of the mapping matches what we have. 
Cc: Herbert Xu Cc: Thomas Graf Signed-off-by: Pablo Neira Ayuso Acked-by: Herbert Xu --- include/linux/rhashtable.h | 70 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 57 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 3eef0802a0cd..26b7a059c65e 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -343,7 +343,8 @@ int rhashtable_init(struct rhashtable *ht, struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht, const void *key, struct rhash_head *obj, - struct bucket_table *old_tbl); + struct bucket_table *old_tbl, + void **data); int rhashtable_insert_rehash(struct rhashtable *ht, struct bucket_table *tbl); int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter, @@ -563,8 +564,11 @@ restart: return NULL; } -/* Internal function, please use rhashtable_insert_fast() instead */ -static inline int __rhashtable_insert_fast( +/* Internal function, please use rhashtable_insert_fast() instead. This + * function returns the existing element already in hashes in there is a clash, + * otherwise it returns an error via ERR_PTR(). 
+ */ +static inline void *__rhashtable_insert_fast( struct rhashtable *ht, const void *key, struct rhash_head *obj, const struct rhashtable_params params) { @@ -577,6 +581,7 @@ static inline int __rhashtable_insert_fast( spinlock_t *lock; unsigned int elasticity; unsigned int hash; + void *data = NULL; int err; restart: @@ -601,11 +606,14 @@ restart: new_tbl = rht_dereference_rcu(tbl->future_tbl, ht); if (unlikely(new_tbl)) { - tbl = rhashtable_insert_slow(ht, key, obj, new_tbl); + tbl = rhashtable_insert_slow(ht, key, obj, new_tbl, &data); if (!IS_ERR_OR_NULL(tbl)) goto slow_path; err = PTR_ERR(tbl); + if (err == -EEXIST) + err = 0; + goto out; } @@ -619,25 +627,25 @@ slow_path: err = rhashtable_insert_rehash(ht, tbl); rcu_read_unlock(); if (err) - return err; + return ERR_PTR(err); goto restart; } - err = -EEXIST; + err = 0; elasticity = ht->elasticity; rht_for_each(head, tbl, hash) { if (key && unlikely(!(params.obj_cmpfn ? params.obj_cmpfn(&arg, rht_obj(ht, head)) : - rhashtable_compare(&arg, rht_obj(ht, head))))) + rhashtable_compare(&arg, rht_obj(ht, head))))) { + data = rht_obj(ht, head); goto out; + } if (!--elasticity) goto slow_path; } - err = 0; - head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash); RCU_INIT_POINTER(obj->next, head); @@ -652,7 +660,7 @@ out: spin_unlock_bh(lock); rcu_read_unlock(); - return err; + return err ? ERR_PTR(err) : data; } /** @@ -675,7 +683,13 @@ static inline int rhashtable_insert_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params) { - return __rhashtable_insert_fast(ht, NULL, obj, params); + void *ret; + + ret = __rhashtable_insert_fast(ht, NULL, obj, params); + if (IS_ERR(ret)) + return PTR_ERR(ret); + + return ret == NULL ? 
0 : -EEXIST; } /** @@ -704,11 +718,15 @@ static inline int rhashtable_lookup_insert_fast( const struct rhashtable_params params) { const char *key = rht_obj(ht, obj); + void *ret; BUG_ON(ht->p.obj_hashfn); - return __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, - params); + ret = __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params); + if (IS_ERR(ret)) + return PTR_ERR(ret); + + return ret == NULL ? 0 : -EEXIST; } /** @@ -736,6 +754,32 @@ static inline int rhashtable_lookup_insert_fast( static inline int rhashtable_lookup_insert_key( struct rhashtable *ht, const void *key, struct rhash_head *obj, const struct rhashtable_params params) +{ + void *ret; + + BUG_ON(!ht->p.obj_hashfn || !key); + + ret = __rhashtable_insert_fast(ht, key, obj, params); + if (IS_ERR(ret)) + return PTR_ERR(ret); + + return ret == NULL ? 0 : -EEXIST; +} + +/** + * rhashtable_lookup_get_insert_key - lookup and insert object into hash table + * @ht: hash table + * @obj: pointer to hash head inside object + * @params: hash table parameters + * @data: pointer to element data already in hashes + * + * Just like rhashtable_lookup_insert_key(), but this function returns the + * object if it exists, NULL if it does not and the insertion was successful, + * and an ERR_PTR otherwise. + */ +static inline void *rhashtable_lookup_get_insert_key( + struct rhashtable *ht, const void *key, struct rhash_head *obj, + const struct rhashtable_params params) { BUG_ON(!ht->p.obj_hashfn || !key); -- cgit v1.2.3 From 6bc506b4fb065eac3d89ca1ce37082e174493d9e Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 25 Aug 2016 18:42:37 +0200 Subject: bridge: switchdev: Add forward mark support for stacked devices switchdev_port_fwd_mark_set() is used to set the 'offload_fwd_mark' of port netdevs so that packets being flooded by the device won't be flooded twice. It works by assigning a unique identifier (the ifindex of the first bridge port) to bridge ports sharing the same parent ID. 
This prevents packets from being flooded twice by the same switch, but will flood packets through bridge ports belonging to a different switch. This method is problematic when stacked devices are taken into account, such as VLANs. In such cases, a physical port netdev can have upper devices being members in two different bridges, thus requiring two different 'offload_fwd_mark's to be configured on the port netdev, which is impossible. The main problem is that packet and netdev marking is performed at the physical netdev level, whereas flooding occurs between bridge ports, which are not necessarily port netdevs. Instead, packet and netdev marking should really be done in the bridge driver with the switch driver only telling it which packets it already forwarded. The bridge driver will mark such packets using the mark assigned to the ingress bridge port and will prevent the packet from being forwarded through any bridge port sharing the same mark (i.e. having the same parent ID). Remove the current switchdev 'offload_fwd_mark' implementation and instead implement the proposed method. In addition, make rocker - the sole user of the mark - use the proposed method. Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 5 ----- include/linux/skbuff.h | 13 +++++-------- 2 files changed, 5 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 794bb0733799..d122be9345c7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1562,8 +1562,6 @@ enum netdev_priv_flags { * * @xps_maps: XXX: need comments on this one * - * @offload_fwd_mark: Offload device fwding mark - * * @watchdog_timeo: Represents the timeout that is used by * the watchdog (see dev_watchdog()) * @watchdog_timer: List of timers @@ -1814,9 +1812,6 @@ struct net_device { #ifdef CONFIG_NET_CLS_ACT struct tcf_proto __rcu *egress_cl_list; #endif -#ifdef CONFIG_NET_SWITCHDEV - u32 offload_fwd_mark; -#endif /* These may be needed for future network-power-down code. */ struct timer_list watchdog_timer; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 7047448e8129..cfb7219be665 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -612,7 +612,6 @@ static inline bool skb_mstamp_after(const struct skb_mstamp *t1, * @no_fcs: Request NIC to treat last 4 bytes as Ethernet FCS * @napi_id: id of the NAPI struct this skb came from * @secmark: security marking - * @offload_fwd_mark: fwding offload mark * @mark: Generic packet mark * @vlan_proto: vlan encapsulation protocol * @vlan_tci: vlan tag control information @@ -730,7 +729,10 @@ struct sk_buff { __u8 ipvs_property:1; __u8 inner_protocol_type:1; __u8 remcsum_offload:1; - /* 3 or 5 bit hole */ +#ifdef CONFIG_NET_SWITCHDEV + __u8 offload_fwd_mark:1; +#endif + /* 2, 4 or 5 bit hole */ #ifdef CONFIG_NET_SCHED __u16 tc_index; /* traffic control index */ @@ -757,14 +759,9 @@ struct sk_buff { unsigned int sender_cpu; }; #endif - union { #ifdef CONFIG_NETWORK_SECMARK - __u32 secmark; + __u32 secmark; #endif -#ifdef CONFIG_NET_SWITCHDEV - __u32 offload_fwd_mark; -#endif - }; union { __u32 mark; -- cgit v1.2.3 From 
0294b625ad5a6d1fb50632d67cf384862d8a4a46 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sun, 28 Aug 2016 14:43:17 -0700 Subject: net: Add read_sock proto_op Add new function in proto_ops structure. This includes moving the typedef for sk_read_actor_t into net.h and removing the definition from tcp.h. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/net.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/net.h b/include/linux/net.h index b9f0ff4d489c..cd0c8bd0a1de 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -25,6 +25,7 @@ #include #include #include +#include #include @@ -128,6 +129,9 @@ struct page; struct sockaddr; struct msghdr; struct module; +struct sk_buff; +typedef int (*sk_read_actor_t)(read_descriptor_t *, struct sk_buff *, + unsigned int, size_t); struct proto_ops { int family; @@ -186,6 +190,8 @@ struct proto_ops { struct pipe_inode_info *pipe, size_t len, unsigned int flags); int (*set_peek_off)(struct sock *sk, int val); int (*peek_len)(struct socket *sock); + int (*read_sock)(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor); }; #define DECLARE_SOCKADDR(type, dst, src) \ -- cgit v1.2.3 From d297653dd6f07afbe7e6c702a4bcd7615680002e Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 30 Aug 2016 21:56:45 -0700 Subject: rtnetlink: fdb dump: optimize by saving last interface markers fdb dumps spanning multiple skb's currently restart from the first interface again for every skb. This results in unnecessary iterations on the already visited interfaces and their fdb entries. In large scale setups, we have seen this to slow down fdb dumps considerably. On a system with 30k macs we see fdb dumps spanning across more than 300 skbs. To fix the problem, this patch replaces the existing single fdb marker with three markers: netdev hash entries, netdevs and fdb index to continue where we left off instead of restarting from the first netdev.
This is consistent with link dumps. In the process of fixing the performance issue, this patch also re-implements fix done by commit 472681d57a5d ("net: ndo_fdb_dump should report -EMSGSIZE to rtnl_fdb_dump") (with an internal fix from Wilson Kok) in the following ways: - change ndo_fdb_dump handlers to return error code instead of the last fdb index - use cb->args strictly for dump frag markers and not error codes. This is consistent with other dump functions. Below results were taken on a system with 1000 netdevs and 35085 fdb entries: before patch: $time bridge fdb show | wc -l 15065 real 1m11.791s user 0m0.070s sys 1m8.395s (existing code does not return all macs) after patch: $time bridge fdb show | wc -l 35085 real 0m2.017s user 0m0.113s sys 0m1.942s Signed-off-by: Roopa Prabhu Signed-off-by: Wilson Kok Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ++-- include/linux/rtnetlink.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d122be9345c7..67bb978470dc 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1031,7 +1031,7 @@ struct netdev_xdp { * Deletes the FDB entry from dev coresponding to addr. * int (*ndo_fdb_dump)(struct sk_buff *skb, struct netlink_callback *cb, * struct net_device *dev, struct net_device *filter_dev, - * int idx) + * int *idx) * Used to add FDB entries to dump requests. Implementers should add * entries to skb and update idx with the number of entries. 
* @@ -1263,7 +1263,7 @@ struct net_device_ops { struct netlink_callback *cb, struct net_device *dev, struct net_device *filter_dev, - int idx); + int *idx); int (*ndo_bridge_setlink)(struct net_device *dev, struct nlmsghdr *nlh, diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 2daece8979f7..57e54847b0b9 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -105,7 +105,7 @@ extern int ndo_dflt_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *filter_dev, - int idx); + int *idx); extern int ndo_dflt_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, -- cgit v1.2.3 From b6cb5ac8331b6bcfe9ce38c7f7f58db6e1d6270a Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 31 Aug 2016 15:36:52 +0200 Subject: net: bridge: add per-port multicast flood flag Add a per-port flag to control the unknown multicast flood, similar to the unknown unicast flood flag and break a few long lines in the netlink flag exports. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- include/linux/if_bridge.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index dcb89e3515db..c6587c01d951 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -45,6 +45,7 @@ struct br_ip_list { #define BR_PROXYARP BIT(8) #define BR_LEARNING_SYNC BIT(9) #define BR_PROXYARP_WIFI BIT(10) +#define BR_MCAST_FLOOD BIT(11) #define BR_DEFAULT_AGEING_TIME (300 * HZ) -- cgit v1.2.3 From 0515e5999a466dfe6e1924f460da599bb6821487 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 1 Sep 2016 18:37:22 -0700 Subject: bpf: introduce BPF_PROG_TYPE_PERF_EVENT program type Introduce BPF_PROG_TYPE_PERF_EVENT programs that can be attached to HW and SW perf events (PERF_TYPE_HARDWARE and PERF_TYPE_SOFTWARE correspondingly in uapi/linux/perf_event.h) The program visible context meta structure is struct bpf_perf_event_data { struct pt_regs regs; __u64 sample_period; }; which is accessible directly from the program: int bpf_prog(struct bpf_perf_event_data *ctx) { ... ctx->sample_period ... ... ctx->regs.ip ... } The bpf verifier rewrites the accesses into kernel internal struct bpf_perf_event_data_kern which allows changing struct perf_sample_data without affecting bpf programs. New fields can be added to the end of struct bpf_perf_event_data in the future. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- include/linux/perf_event.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 2b6b43cc0dd5..97bfe62f30d7 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -788,6 +788,11 @@ struct perf_output_handle { int page; }; +struct bpf_perf_event_data_kern { + struct pt_regs *regs; + struct perf_sample_data *data; +}; + #ifdef CONFIG_CGROUP_PERF /* -- cgit v1.2.3 From aa6a5f3cb2b2edc5b9aab0b4fdfdfa9c3b5096a8 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 1 Sep 2016 18:37:24 -0700 Subject: perf, bpf: add perf events core support for BPF_PROG_TYPE_PERF_EVENT programs Allow attaching BPF_PROG_TYPE_PERF_EVENT programs to sw and hw perf events via overflow_handler mechanism. When program is attached the overflow_handlers become stacked. The program acts as a filter. Returning zero from the program means that the normal perf_event_output handler will not be called and sampling event won't be stored in the ring buffer. The overflow_handler_context==NULL is an additional safety check to make sure programs are not attached to hw breakpoints and watchdog in case other checks (that prevent that now anyway) get accidentally relaxed in the future. The program refcnt is incremented in case perf_events are inherited when target task is forked. Similar to kprobe and tracepoint programs there is no ioctl to detach the program or swap already attached program. User space is expected to close(perf_event_fd) like it does right now for kprobe+bpf. That restriction simplifies the code quite a bit. The invocation of overflow_handler in __perf_event_overflow() is now done via READ_ONCE, since that pointer can be replaced when the program is attached while perf_event itself could have been active already. There is no need to do similar treatment for event->prog, since it's assigned only once before it's accessed.
Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 4 ++++ include/linux/perf_event.h | 4 ++++ 2 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 11134238417d..9a904f63f8c1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -297,6 +297,10 @@ static inline struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i) static inline void bpf_prog_put(struct bpf_prog *prog) { } +static inline struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) +{ + return ERR_PTR(-EOPNOTSUPP); +} #endif /* CONFIG_BPF_SYSCALL */ /* verifier prototypes for helper functions called from eBPF programs */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 97bfe62f30d7..ccb73a58113d 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -679,6 +679,10 @@ struct perf_event { u64 (*clock)(void); perf_overflow_handler_t overflow_handler; void *overflow_handler_context; +#ifdef CONFIG_BPF_SYSCALL + perf_overflow_handler_t orig_overflow_handler; + struct bpf_prog *prog; +#endif #ifdef CONFIG_EVENT_TRACING struct trace_event_call *tp_event; -- cgit v1.2.3 From 3f37ec79dd21fbdbbab8143a48a87272b22fef22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Mon, 25 Jul 2016 20:33:56 +0200 Subject: bcma: support BCM53573 series of wireless SoCs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BCM53573 seems to be the first series of Northstar family with wireless on the chip. The base models are BCM53573-s (A0, A1) and there is also BCM47189B0 which seems to be some small modification. The only problem with these chipsets seems to be watchdog. It's totally unavailable on 53573A0 / 53573A1 and preferable PMU watchdog is broken on 53573B0 / 53573B1. 
Signed-off-by: Rafał Miłecki Signed-off-by: Kalle Valo --- include/linux/bcma/bcma.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bcma/bcma.h b/include/linux/bcma/bcma.h index 3db25df396cb..8eeedb2db924 100644 --- a/include/linux/bcma/bcma.h +++ b/include/linux/bcma/bcma.h @@ -205,6 +205,9 @@ struct bcma_host_ops { #define BCMA_PKG_ID_BCM4709 0 #define BCMA_CHIP_ID_BCM47094 53030 #define BCMA_CHIP_ID_BCM53018 53018 +#define BCMA_CHIP_ID_BCM53573 53573 +#define BCMA_PKG_ID_BCM53573 0 +#define BCMA_PKG_ID_BCM47189 1 /* Board types (on PCI usually equals to the subsystem dev id) */ /* BCM4313 */ -- cgit v1.2.3 From ecc6569f3503b39f45bc6b86197b5e0a8533fb72 Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Thu, 25 Aug 2016 23:08:11 +0800 Subject: netfilter: gre: Use consistent GRE_* macros instead of ones defined by netfilter. There are already some GRE_* macros in kernel, so it is unnecessary to define these macros. And remove some useless macros Signed-off-by: Gao Feng Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_conntrack_proto_gre.h | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/nf_conntrack_proto_gre.h b/include/linux/netfilter/nf_conntrack_proto_gre.h index df78dc2b5524..0189747f2691 100644 --- a/include/linux/netfilter/nf_conntrack_proto_gre.h +++ b/include/linux/netfilter/nf_conntrack_proto_gre.h @@ -1,29 +1,11 @@ #ifndef _CONNTRACK_PROTO_GRE_H #define _CONNTRACK_PROTO_GRE_H #include +#include +#include /* GRE PROTOCOL HEADER */ -/* GRE Version field */ -#define GRE_VERSION_1701 0x0 -#define GRE_VERSION_PPTP 0x1 - -/* GRE Protocol field */ -#define GRE_PROTOCOL_PPTP 0x880B - -/* GRE Flags */ -#define GRE_FLAG_C 0x80 -#define GRE_FLAG_R 0x40 -#define GRE_FLAG_K 0x20 -#define GRE_FLAG_S 0x10 -#define GRE_FLAG_A 0x80 - -#define GRE_IS_C(f) ((f)&GRE_FLAG_C) -#define GRE_IS_R(f) ((f)&GRE_FLAG_R) -#define 
GRE_IS_K(f) ((f)&GRE_FLAG_K) -#define GRE_IS_S(f) ((f)&GRE_FLAG_S) -#define GRE_IS_A(f) ((f)&GRE_FLAG_A) - /* GRE is a mess: Four different standards */ struct gre_hdr { #if defined(__LITTLE_ENDIAN_BITFIELD) -- cgit v1.2.3 From c579a9e7d58f66030a144c7a33cc9bdf827a4b6d Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Thu, 25 Aug 2016 23:08:47 +0800 Subject: netfilter: gre: Use consistent GRE and PPTP header structure instead of the ones defined by netfilter There are two existing structures which define the GRE and PPTP headers. So use these two structures instead of the ones defined by netfilter to stay consistent with other code. Signed-off-by: Gao Feng Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_conntrack_proto_gre.h | 42 ------------------------ 1 file changed, 42 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/nf_conntrack_proto_gre.h b/include/linux/netfilter/nf_conntrack_proto_gre.h index 0189747f2691..dee0acd0dd31 100644 --- a/include/linux/netfilter/nf_conntrack_proto_gre.h +++ b/include/linux/netfilter/nf_conntrack_proto_gre.h @@ -4,48 +4,6 @@ #include #include -/* GRE PROTOCOL HEADER */ - -/* GRE is a mess: Four different standards */ -struct gre_hdr { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u16 rec:3, - srr:1, - seq:1, - key:1, - routing:1, - csum:1, - version:3, - reserved:4, - ack:1; -#elif defined(__BIG_ENDIAN_BITFIELD) - __u16 csum:1, - routing:1, - key:1, - seq:1, - srr:1, - rec:3, - ack:1, - reserved:4, - version:3; -#else -#error "Adjust your defines" -#endif - __be16 protocol; -}; - -/* modified GRE header for PPTP */ -struct gre_hdr_pptp { - __u8 flags; /* bitfield */ - __u8 version; /* should be GRE_VERSION_PPTP */ - __be16 protocol; /* should be GRE_PROTOCOL_PPTP */ - __be16 payload_len; /* size of ppp payload, not inc. gre header */ - __be16 call_id; /* peer's call_id for this session */ - __be32 seq; /* sequence number.
Present if S==1 */ - __be32 ack; /* seq number of highest packet received by */ - /* sender in this session */ -}; - struct nf_ct_gre { unsigned int stream_timeout; unsigned int timeout; -- cgit v1.2.3 From c965db44462919f613973aa618271f6c3f5a1e64 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Wed, 7 Sep 2016 16:36:24 +0300 Subject: qed: Add support for debug data collection This patch adds the support for dumping and formatting the HW/FW debug data. Signed-off-by: Tomer Tayar Signed-off-by: Yuval Mintz Signed-off-by: David S. Miller --- include/linux/qed/common_hsi.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/qed/common_hsi.h b/include/linux/qed/common_hsi.h index 70b30e4d3cc4..19027635df0d 100644 --- a/include/linux/qed/common_hsi.h +++ b/include/linux/qed/common_hsi.h @@ -143,6 +143,9 @@ #define GTT_BYTE_SIZE_BITS (GTT_DWORD_SIZE_BITS + 2) #define GTT_DWORD_SIZE BIT(GTT_DWORD_SIZE_BITS) +/* Tools Version */ +#define TOOLS_VERSION 10 + /*****************/ /* CDU CONSTANTS */ /*****************/ -- cgit v1.2.3 From e0971c832af4cd906ab931c9f6e9e1791a62fc98 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Wed, 7 Sep 2016 16:36:25 +0300 Subject: qed*: Add support for the ethtool get_regs operation Signed-off-by: Tomer Tayar Signed-off-by: Yuval Mintz Signed-off-by: David S. 
Miller --- include/linux/qed/qed_if.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index d8dc5c2243d5..e4546abcea08 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -455,6 +455,10 @@ struct qed_common_ops { void (*simd_handler_clean)(struct qed_dev *cdev, int index); + int (*dbg_all_data) (struct qed_dev *cdev, void *buffer); + + int (*dbg_all_data_size) (struct qed_dev *cdev); + /** * @brief can_link_change - can the instance change the link or not * -- cgit v1.2.3 From d545caca827b65aab557a9e9dcdcf1e5a3823c2d Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Thu, 8 Sep 2016 00:42:25 +0900 Subject: net: inet: diag: expose the socket mark to privileged processes. This adds the capability for a process that has CAP_NET_ADMIN on a socket to see the socket mark in socket dumps. Commit a52e95abf772 ("net: diag: allow socket bytecode filters to match socket marks") recently gave privileged processes the ability to filter socket dumps based on mark. This patch is complementary: it ensures that the mark is also passed to userspace in the socket's netlink attributes. It is useful for tools like ss which display information about sockets. Tested: https://android-review.googlesource.com/270210 Signed-off-by: Lorenzo Colitti Signed-off-by: David S. 
Miller --- include/linux/inet_diag.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h index feb04ea20f11..65da430e260f 100644 --- a/include/linux/inet_diag.h +++ b/include/linux/inet_diag.h @@ -37,7 +37,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, struct sk_buff *skb, const struct inet_diag_req_v2 *req, struct user_namespace *user_ns, u32 pid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh); + const struct nlmsghdr *unlh, bool net_admin); void inet_diag_dump_icsk(struct inet_hashinfo *h, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, @@ -56,7 +56,7 @@ void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk); int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, struct inet_diag_msg *r, int ext, - struct user_namespace *user_ns); + struct user_namespace *user_ns, bool net_admin); extern int inet_diag_register(const struct inet_diag_handler *handler); extern void inet_diag_unregister(const struct inet_diag_handler *handler); -- cgit v1.2.3 From fe19c4f971a55cea3be442d8032a5f6021702791 Mon Sep 17 00:00:00 2001 From: Eric Garver Date: Wed, 7 Sep 2016 12:56:58 -0400 Subject: vlan: Check for vlan ethernet types for 8021.q or 802.1ad This is to simplify using double tagged vlans. This function allows all valid vlan ethertypes to be checked in a single function call. Also replace some instances that check for both ETH_P_8021Q and ETH_P_8021AD. Patch based on one originally by Thomas F Herbert. Signed-off-by: Thomas F Herbert Signed-off-by: Eric Garver Acked-by: Pravin B Shelar Signed-off-by: David S. 
Miller --- include/linux/if_vlan.h | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 49d4aef1f789..3319d97d789d 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -272,6 +272,23 @@ static inline int vlan_get_encap_level(struct net_device *dev) } #endif +/** + * eth_type_vlan - check for valid vlan ether type. + * @ethertype: ether type to check + * + * Returns true if the ether type is a vlan ether type. + */ +static inline bool eth_type_vlan(__be16 ethertype) +{ + switch (ethertype) { + case htons(ETH_P_8021Q): + case htons(ETH_P_8021AD): + return true; + default: + return false; + } +} + static inline bool vlan_hw_offload_capable(netdev_features_t features, __be16 proto) { @@ -425,8 +442,7 @@ static inline int __vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) { struct vlan_ethhdr *veth = (struct vlan_ethhdr *)skb->data; - if (veth->h_vlan_proto != htons(ETH_P_8021Q) && - veth->h_vlan_proto != htons(ETH_P_8021AD)) + if (!eth_type_vlan(veth->h_vlan_proto)) return -EINVAL; *vlan_tci = ntohs(veth->h_vlan_TCI); @@ -488,7 +504,7 @@ static inline __be16 __vlan_get_protocol(struct sk_buff *skb, __be16 type, * present at mac_len - VLAN_HLEN (if mac_len > 0), or at * ETH_HLEN otherwise */ - if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) { + if (eth_type_vlan(type)) { if (vlan_depth) { if (WARN_ON(vlan_depth < VLAN_HLEN)) return 0; @@ -506,8 +522,7 @@ static inline __be16 __vlan_get_protocol(struct sk_buff *skb, __be16 type, vh = (struct vlan_hdr *)(skb->data + vlan_depth); type = vh->h_vlan_encapsulated_proto; vlan_depth += VLAN_HLEN; - } while (type == htons(ETH_P_8021Q) || - type == htons(ETH_P_8021AD)); + } while (eth_type_vlan(type)); } if (depth) @@ -572,8 +587,7 @@ static inline void vlan_set_encap_proto(struct sk_buff *skb, static inline bool skb_vlan_tagged(const struct sk_buff *skb) { if 
(!skb_vlan_tag_present(skb) && - likely(skb->protocol != htons(ETH_P_8021Q) && - skb->protocol != htons(ETH_P_8021AD))) + likely(!eth_type_vlan(skb->protocol))) return false; return true; @@ -593,15 +607,14 @@ static inline bool skb_vlan_tagged_multi(const struct sk_buff *skb) if (!skb_vlan_tag_present(skb)) { struct vlan_ethhdr *veh; - if (likely(protocol != htons(ETH_P_8021Q) && - protocol != htons(ETH_P_8021AD))) + if (likely(!eth_type_vlan(protocol))) return false; veh = (struct vlan_ethhdr *)skb->data; protocol = veh->h_vlan_encapsulated_proto; } - if (protocol != htons(ETH_P_8021Q) && protocol != htons(ETH_P_8021AD)) + if (!eth_type_vlan(protocol)) return false; return true; -- cgit v1.2.3 From 9f5afeae51526b3ad7b7cb21ee8b145ce6ea7a7a Mon Sep 17 00:00:00 2001 From: Yaogong Wang Date: Wed, 7 Sep 2016 14:49:28 -0700 Subject: tcp: use an RB tree for ooo receive queue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Over the years, TCP BDP has increased by several orders of magnitude, and some people are considering to reach the 2 Gbytes limit. Even with current window scale limit of 14, ~1 Gbytes maps to ~740,000 MSS. In presence of packet losses (or reorders), TCP stores incoming packets into an out of order queue, and number of skbs sitting there waiting for the missing packets to be received can be in the 10^5 range. Most packets are appended to the tail of this queue, and when packets can finally be transferred to receive queue, we scan the queue from its head. However, in presence of heavy losses, we might have to find an arbitrary point in this queue, involving a linear scan for every incoming packet, throwing away cpu caches. This patch converts it to a RB tree, to get bounded latencies. Yaogong wrote a preliminary patch about 2 years ago. Eric did the rebase, added ofo_last_skb cache, polishing and tests. 
Tested with network dropping between 1 and 10 % packets, with good success (about 30 % increase of throughput in stress tests) Next step would be to also use an RB tree for the write queue at sender side ;) Signed-off-by: Yaogong Wang Signed-off-by: Eric Dumazet Cc: Yuchung Cheng Cc: Neal Cardwell Cc: Ilpo Järvinen Acked-By: Ilpo Järvinen Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 ++ include/linux/tcp.h | 7 +++---- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index cfb7219be665..4c5662f05bda 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2402,6 +2402,8 @@ static inline void __skb_queue_purge(struct sk_buff_head *list) kfree_skb(skb); } +void skb_rbtree_purge(struct rb_root *root); + void *netdev_alloc_frag(unsigned int fragsz); struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int length, diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 7be9b1242354..c723a465125d 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -281,10 +281,9 @@ struct tcp_sock { struct sk_buff* lost_skb_hint; struct sk_buff *retransmit_skb_hint; - /* OOO segments go in this list. Note that socket lock must be held, - * as we do not use sk_buff_head lock. - */ - struct sk_buff_head out_of_order_queue; + /* OOO segments go in this rbtree. Socket lock must be held. */ + struct rb_root out_of_order_queue; + struct sk_buff *ooo_last_skb; /* cache rb_last(out_of_order_queue) */ /* SACKs data, these 2 need to be together (see tcp_options_write) */ struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */ -- cgit v1.2.3 From 3e9b3112ec74f192eaab976c3889e34255cae940 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 31 Aug 2016 12:46:44 +0100 Subject: add basic register-field manipulation macros Common approach to accessing register fields is to define structures or sets of macros containing mask and shift pair. 
Operations on the register are then performed as follows: field = (reg >> shift) & mask; reg &= ~(mask << shift); reg |= (field & mask) << shift; Defining shift and mask separately is tedious. Ivo van Doorn came up with an idea of computing them at compilation time based on a single shifted mask (later refined by Felix) which can be used like this: #define REG_FIELD 0x000ff000 field = FIELD_GET(REG_FIELD, reg); reg &= ~REG_FIELD; reg |= FIELD_PREP(REG_FIELD, field); FIELD_{GET,PREP} macros take care of finding out what the appropriate shift is based on compilation time ffs operation. GENMASK can be used to define registers (which is usually less error-prone and easier to match with datasheets). This approach is the most convenient I've seen so to limit code multiplication let's move the macros to a global header file. Attempts to use static inlines instead of macros failed due to false positive triggering of BUILD_BUG_ON()s, especially with GCC < 6.0. Signed-off-by: Jakub Kicinski Reviewed-by: Dinan Gunawardena Signed-off-by: Kalle Valo --- include/linux/bitfield.h | 93 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/bug.h | 3 ++ 2 files changed, 96 insertions(+) create mode 100644 include/linux/bitfield.h (limited to 'include/linux') diff --git a/include/linux/bitfield.h b/include/linux/bitfield.h new file mode 100644 index 000000000000..f6505d83069d --- /dev/null +++ b/include/linux/bitfield.h @@ -0,0 +1,93 @@ +/* + * Copyright (C) 2014 Felix Fietkau + * Copyright (C) 2004 - 2009 Ivo van Doorn + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#ifndef _LINUX_BITFIELD_H +#define _LINUX_BITFIELD_H + +#include + +/* + * Bitfield access macros + * + * FIELD_{GET,PREP} macros take as first parameter shifted mask + * from which they extract the base mask and shift amount. + * Mask must be a compilation time constant. + * + * Example: + * + * #define REG_FIELD_A GENMASK(6, 0) + * #define REG_FIELD_B BIT(7) + * #define REG_FIELD_C GENMASK(15, 8) + * #define REG_FIELD_D GENMASK(31, 16) + * + * Get: + * a = FIELD_GET(REG_FIELD_A, reg); + * b = FIELD_GET(REG_FIELD_B, reg); + * + * Set: + * reg = FIELD_PREP(REG_FIELD_A, 1) | + * FIELD_PREP(REG_FIELD_B, 0) | + * FIELD_PREP(REG_FIELD_C, c) | + * FIELD_PREP(REG_FIELD_D, 0x40); + * + * Modify: + * reg &= ~REG_FIELD_C; + * reg |= FIELD_PREP(REG_FIELD_C, c); + */ + +#define __bf_shf(x) (__builtin_ffsll(x) - 1) + +#define __BF_FIELD_CHECK(_mask, _reg, _val, _pfx) \ + ({ \ + BUILD_BUG_ON_MSG(!__builtin_constant_p(_mask), \ + _pfx "mask is not constant"); \ + BUILD_BUG_ON_MSG(!(_mask), _pfx "mask is zero"); \ + BUILD_BUG_ON_MSG(__builtin_constant_p(_val) ? \ + ~((_mask) >> __bf_shf(_mask)) & (_val) : 0, \ + _pfx "value too large for the field"); \ + BUILD_BUG_ON_MSG((_mask) > (typeof(_reg))~0ull, \ + _pfx "type of reg too small for mask"); \ + __BUILD_BUG_ON_NOT_POWER_OF_2((_mask) + \ + (1ULL << __bf_shf(_mask))); \ + }) + +/** + * FIELD_PREP() - prepare a bitfield element + * @_mask: shifted mask defining the field's length and position + * @_val: value to put in the field + * + * FIELD_PREP() masks and shifts up the value. The result should + * be combined with other fields of the bitfield using logical OR. 
+ */ +#define FIELD_PREP(_mask, _val) \ + ({ \ + __BF_FIELD_CHECK(_mask, 0ULL, _val, "FIELD_PREP: "); \ + ((typeof(_mask))(_val) << __bf_shf(_mask)) & (_mask); \ + }) + +/** + * FIELD_GET() - extract a bitfield element + * @_mask: shifted mask defining the field's length and position + * @_reg: 32bit value of entire bitfield + * + * FIELD_GET() extracts the field specified by @_mask from the + * bitfield passed in as @_reg by masking and shifting it down. + */ +#define FIELD_GET(_mask, _reg) \ + ({ \ + __BF_FIELD_CHECK(_mask, _reg, 0U, "FIELD_GET: "); \ + (typeof(_mask))(((_reg) & (_mask)) >> __bf_shf(_mask)); \ + }) + +#endif diff --git a/include/linux/bug.h b/include/linux/bug.h index e51b0709e78d..292d6a10b0c2 100644 --- a/include/linux/bug.h +++ b/include/linux/bug.h @@ -13,6 +13,7 @@ enum bug_trap_type { struct pt_regs; #ifdef __CHECKER__ +#define __BUILD_BUG_ON_NOT_POWER_OF_2(n) (0) #define BUILD_BUG_ON_NOT_POWER_OF_2(n) (0) #define BUILD_BUG_ON_ZERO(e) (0) #define BUILD_BUG_ON_NULL(e) ((void*)0) @@ -24,6 +25,8 @@ struct pt_regs; #else /* __CHECKER__ */ /* Force a compilation error if a constant expression is not a power of 2 */ +#define __BUILD_BUG_ON_NOT_POWER_OF_2(n) \ + BUILD_BUG_ON(((n) & ((n) - 1)) != 0) #define BUILD_BUG_ON_NOT_POWER_OF_2(n) \ BUILD_BUG_ON((n) == 0 || (((n) & ((n) - 1)) != 0)) -- cgit v1.2.3 From 634faf3686900ccdee87b77e2c56df8b2159912b Mon Sep 17 00:00:00 2001 From: Arend Van Spriel Date: Mon, 5 Sep 2016 11:42:12 +0100 Subject: brcmfmac: add support for bcm4339 chip with modalias sdio:c00v02D0d4339 The driver already supports the bcm4339 chipset but only for the variant that shares the same modalias as the bcm4335, i.e. sdio:c00v02D0d4335. It turns out that there are also bcm4339 devices out there that have a more distinguishable modalias sdio:c00v02D0d4339.
Reported-by: Steve deRosier Reviewed-by: Hante Meuleman Reviewed-by: Pieter-Paul Giesberts Reviewed-by: Franky Lin Signed-off-by: Arend van Spriel Signed-off-by: Kalle Valo --- include/linux/mmc/sdio_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h index 0d126aeb3ec0..d43ef96bf075 100644 --- a/include/linux/mmc/sdio_ids.h +++ b/include/linux/mmc/sdio_ids.h @@ -32,6 +32,7 @@ #define SDIO_DEVICE_ID_BROADCOM_43340 0xa94c #define SDIO_DEVICE_ID_BROADCOM_43341 0xa94d #define SDIO_DEVICE_ID_BROADCOM_4335_4339 0x4335 +#define SDIO_DEVICE_ID_BROADCOM_4339 0x4339 #define SDIO_DEVICE_ID_BROADCOM_43362 0xa962 #define SDIO_DEVICE_ID_BROADCOM_43430 0xa9a6 #define SDIO_DEVICE_ID_BROADCOM_4345 0x4345 -- cgit v1.2.3 From f035a51536af9802f55d8c79bd87f184ebffb093 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 9 Sep 2016 02:45:29 +0200 Subject: bpf: add BPF_SIZEOF and BPF_FIELD_SIZEOF macros Add BPF_SIZEOF() and BPF_FIELD_SIZEOF() macros to improve the code a bit which otherwise often result in overly long bytes_to_bpf_size(sizeof()) and bytes_to_bpf_size(FIELD_SIZEOF()) lines. So place them into a macro helper instead. Moreover, we currently have a BUILD_BUG_ON(BPF_FIELD_SIZEOF()) check in convert_bpf_extensions(), but we should rather make that generic as well and add a BUILD_BUG_ON() test in all BPF_SIZEOF()/BPF_FIELD_SIZEOF() users to detect any rewriter size issues at compile time. Note, there are currently none, but we want to assert that it stays this way. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- include/linux/filter.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index a16439b99fd9..7fabad8dc3fc 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -314,6 +314,20 @@ struct bpf_prog_aux; bpf_size; \ }) +#define BPF_SIZEOF(type) \ + ({ \ + const int __size = bytes_to_bpf_size(sizeof(type)); \ + BUILD_BUG_ON(__size < 0); \ + __size; \ + }) + +#define BPF_FIELD_SIZEOF(type, field) \ + ({ \ + const int __size = bytes_to_bpf_size(FIELD_SIZEOF(type, field)); \ + BUILD_BUG_ON(__size < 0); \ + __size; \ + }) + #ifdef CONFIG_COMPAT /* A struct sock_filter is architecture independent. */ struct compat_sock_fprog { -- cgit v1.2.3 From f3694e00123802d688180e7ae90b240669910e3c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 9 Sep 2016 02:45:31 +0200 Subject: bpf: add BPF_CALL_x macros for declaring helpers This work adds BPF_CALL_() macros and converts all the eBPF helper functions to use them, in a similar fashion like we do with SYSCALL_DEFINE() macros that are used today. Motivation for this is to hide all the register handling and all necessary casts from the user, so that it is done automatically in the background when adding a BPF_CALL_() call. This makes current helpers easier to review, eases to write future helpers, avoids getting the casting mess wrong, and allows for extending all helpers at once (f.e. build time checks, etc). It also helps detecting more easily in code reviews that unused registers are not instrumented in the code by accident, breaking compatibility with existing programs. BPF_CALL_() internals are quite similar to SYSCALL_DEFINE() ones with some fundamental differences, for example, for generating the actual helper function that carries all u64 regs, we need to fill unused regs, so that we always end up with 5 u64 regs as an argument. 
I reviewed several 0-5 generated BPF_CALL_() variants of the .i results and they look all as expected. No sparse issue spotted. We let this also sit for a few days with Fengguang's kbuild test robot, and there were no issues seen. On s390, it barked on the "uses dynamic stack allocation" notice, which is an old one from bpf_perf_event_output{,_tp}() reappearing here due to the conversion to the call wrapper, just telling that the perf raw record/frag sits on stack (gcc with s390's -mwarn-dynamicstack), but that's all. Did various runtime tests and they were fine as well. All eBPF helpers are now converted to use these macros, getting rid of a good chunk of all the raw castings. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/filter.h | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 7fabad8dc3fc..1f09c521adfe 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -328,6 +328,56 @@ struct bpf_prog_aux; __size; \ }) +#define __BPF_MAP_0(m, v, ...) v +#define __BPF_MAP_1(m, v, t, a, ...) m(t, a) +#define __BPF_MAP_2(m, v, t, a, ...) m(t, a), __BPF_MAP_1(m, v, __VA_ARGS__) +#define __BPF_MAP_3(m, v, t, a, ...) m(t, a), __BPF_MAP_2(m, v, __VA_ARGS__) +#define __BPF_MAP_4(m, v, t, a, ...) m(t, a), __BPF_MAP_3(m, v, __VA_ARGS__) +#define __BPF_MAP_5(m, v, t, a, ...) m(t, a), __BPF_MAP_4(m, v, __VA_ARGS__) + +#define __BPF_REG_0(...) __BPF_PAD(5) +#define __BPF_REG_1(...) __BPF_MAP(1, __VA_ARGS__), __BPF_PAD(4) +#define __BPF_REG_2(...) __BPF_MAP(2, __VA_ARGS__), __BPF_PAD(3) +#define __BPF_REG_3(...) __BPF_MAP(3, __VA_ARGS__), __BPF_PAD(2) +#define __BPF_REG_4(...) __BPF_MAP(4, __VA_ARGS__), __BPF_PAD(1) +#define __BPF_REG_5(...) __BPF_MAP(5, __VA_ARGS__) + +#define __BPF_MAP(n, ...) __BPF_MAP_##n(__VA_ARGS__) +#define __BPF_REG(n, ...) 
__BPF_REG_##n(__VA_ARGS__) + +#define __BPF_CAST(t, a) \ + (__force t) \ + (__force \ + typeof(__builtin_choose_expr(sizeof(t) == sizeof(unsigned long), \ + (unsigned long)0, (t)0))) a +#define __BPF_V void +#define __BPF_N + +#define __BPF_DECL_ARGS(t, a) t a +#define __BPF_DECL_REGS(t, a) u64 a + +#define __BPF_PAD(n) \ + __BPF_MAP(n, __BPF_DECL_ARGS, __BPF_N, u64, __ur_1, u64, __ur_2, \ + u64, __ur_3, u64, __ur_4, u64, __ur_5) + +#define BPF_CALL_x(x, name, ...) \ + static __always_inline \ + u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \ + u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)); \ + u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)) \ + { \ + return ____##name(__BPF_MAP(x,__BPF_CAST,__BPF_N,__VA_ARGS__));\ + } \ + static __always_inline \ + u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)) + +#define BPF_CALL_0(name, ...) BPF_CALL_x(0, name, __VA_ARGS__) +#define BPF_CALL_1(name, ...) BPF_CALL_x(1, name, __VA_ARGS__) +#define BPF_CALL_2(name, ...) BPF_CALL_x(2, name, __VA_ARGS__) +#define BPF_CALL_3(name, ...) BPF_CALL_x(3, name, __VA_ARGS__) +#define BPF_CALL_4(name, ...) BPF_CALL_x(4, name, __VA_ARGS__) +#define BPF_CALL_5(name, ...) BPF_CALL_x(5, name, __VA_ARGS__) + #ifdef CONFIG_COMPAT /* A struct sock_filter is architecture independent. */ struct compat_sock_fprog { -- cgit v1.2.3 From 6b6adee3dad25bbe568ee24fc843372d02fb425f Mon Sep 17 00:00:00 2001 From: Mohamad Haj Yahia Date: Fri, 9 Sep 2016 17:35:18 +0300 Subject: net/mlx5: SRIOV core code refactoring Simplify the code and makes it look modular and symmetric. Split sriov enable/disable to two levels: device level and pci level. When user enable/disable sriov (via sriov_configure driver callback) we will enable/disable both device and pci sriov. When driver load/unload we will enable/disable (on demand) only device sriov while keeping the PCI sriov enabled for next driver load. 
On internal/pci error, VFs will be kept enabled on PCI and the reset is done only in device level. Signed-off-by: Mohamad Haj Yahia Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/driver.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 5cb9fa7aec61..0d7aedfce1d7 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -828,8 +828,6 @@ void mlx5_pagealloc_init(struct mlx5_core_dev *dev); void mlx5_pagealloc_cleanup(struct mlx5_core_dev *dev); int mlx5_pagealloc_start(struct mlx5_core_dev *dev); void mlx5_pagealloc_stop(struct mlx5_core_dev *dev); -int mlx5_sriov_init(struct mlx5_core_dev *dev); -int mlx5_sriov_cleanup(struct mlx5_core_dev *dev); void mlx5_core_req_pages_handler(struct mlx5_core_dev *dev, u16 func_id, s32 npages); int mlx5_satisfy_startup_pages(struct mlx5_core_dev *dev, int boot); -- cgit v1.2.3 From 737a234bb6384800a5b632be85c6b0ad6221d137 Mon Sep 17 00:00:00 2001 From: Mohamad Haj Yahia Date: Fri, 9 Sep 2016 17:35:19 +0300 Subject: net/mlx5: Introduce attach/detach to interface API Add attach/detach callbacks to interface API. This is crucial for implementing seamless reset flow which releases the hardware and its resources upon detach while keeping software structures and state (e.g. netdev) then reset and reallocate the hardware needed resources upon attach. Signed-off-by: Mohamad Haj Yahia Signed-off-by: Saeed Mahameed Signed-off-by: David S.
Miller --- include/linux/mlx5/driver.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 0d7aedfce1d7..85c4786427e4 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -930,6 +930,8 @@ enum { struct mlx5_interface { void * (*add)(struct mlx5_core_dev *dev); void (*remove)(struct mlx5_core_dev *dev, void *context); + int (*attach)(struct mlx5_core_dev *dev, void *context); + void (*detach)(struct mlx5_core_dev *dev, void *context); void (*event)(struct mlx5_core_dev *dev, void *context, enum mlx5_dev_event event, unsigned long param); void * (*get_dev)(void *context); -- cgit v1.2.3 From 3a8963acc70e69606729404713cfa9a03b58b18c Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Fri, 9 Sep 2016 12:45:24 -0700 Subject: Revert "hv_netvsc: make inline functions static" These functions are used by other code misc-next tree. This reverts commit 30d1de08c87ddde6f73936c3350e7e153988fe02. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/hyperv.h | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index b01c8c3dd531..5df444b1ac18 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1429,4 +1429,88 @@ static inline bool hv_need_to_signal_on_read(struct hv_ring_buffer_info *rbi) return false; } +/* + * An API to support in-place processing of incoming VMBUS packets. 
+ */ +#define VMBUS_PKT_TRAILER 8 + +static inline struct vmpacket_descriptor * +get_next_pkt_raw(struct vmbus_channel *channel) +{ + struct hv_ring_buffer_info *ring_info = &channel->inbound; + u32 read_loc = ring_info->priv_read_index; + void *ring_buffer = hv_get_ring_buffer(ring_info); + struct vmpacket_descriptor *cur_desc; + u32 packetlen; + u32 dsize = ring_info->ring_datasize; + u32 delta = read_loc - ring_info->ring_buffer->read_index; + u32 bytes_avail_toread = (hv_get_bytes_to_read(ring_info) - delta); + + if (bytes_avail_toread < sizeof(struct vmpacket_descriptor)) + return NULL; + + if ((read_loc + sizeof(*cur_desc)) > dsize) + return NULL; + + cur_desc = ring_buffer + read_loc; + packetlen = cur_desc->len8 << 3; + + /* + * If the packet under consideration is wrapping around, + * return failure. + */ + if ((read_loc + packetlen + VMBUS_PKT_TRAILER) > (dsize - 1)) + return NULL; + + return cur_desc; +} + +/* + * A helper function to step through packets "in-place" + * This API is to be called after each successful call + * get_next_pkt_raw(). + */ +static inline void put_pkt_raw(struct vmbus_channel *channel, + struct vmpacket_descriptor *desc) +{ + struct hv_ring_buffer_info *ring_info = &channel->inbound; + u32 read_loc = ring_info->priv_read_index; + u32 packetlen = desc->len8 << 3; + u32 dsize = ring_info->ring_datasize; + + if ((read_loc + packetlen + VMBUS_PKT_TRAILER) > dsize) + BUG(); + /* + * Include the packet trailer. + */ + ring_info->priv_read_index += packetlen + VMBUS_PKT_TRAILER; +} + +/* + * This call commits the read index and potentially signals the host. 
+ * Here is the pattern for using the "in-place" consumption APIs: + * + * while (get_next_pkt_raw() { + * process the packet "in-place"; + * put_pkt_raw(); + * } + * if (packets processed in place) + * commit_rd_index(); + */ +static inline void commit_rd_index(struct vmbus_channel *channel) +{ + struct hv_ring_buffer_info *ring_info = &channel->inbound; + /* + * Make sure all reads are done before we update the read index since + * the writer may start writing to the read area once the read index + * is updated. + */ + virt_rmb(); + ring_info->ring_buffer->read_index = ring_info->priv_read_index; + + if (hv_need_to_signal_on_read(ring_info)) + vmbus_set_event(channel); +} + + #endif /* _HYPERV_H */ -- cgit v1.2.3 From 8e8118f893138d4cc3d4dbf4163d7497fca54a9d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 11 Sep 2016 22:55:53 +0200 Subject: netfilter: conntrack: remove packet hotpath stats These counters sit in hot path and do show up in perf, this is especially true for 'found' and 'searched' which get incremented for every packet processed. Information like searched=212030105 new=623431 found=333613 delete=623327 does not seem too helpful nowadays: - on busy systems found and searched will overflow every few hours (these are 32bit integers), other more busy ones every few days. - for debugging there are better methods, such as iptables' trace target, the conntrack log sysctls. Nowadays we also have perf tool. This removes packet path stat counters except those that are expected to be 0 (or close to 0) on a normal system, e.g. 'insert_failed' (race happened) or 'invalid' (proto tracker rejects). The insert stat is retained for the ctnetlink case. The found stat is retained for the tuple-is-taken check when NAT has to determine if it needs to pick a different source address. 
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_conntrack_common.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h index 275505792664..1d1ef4e20512 100644 --- a/include/linux/netfilter/nf_conntrack_common.h +++ b/include/linux/netfilter/nf_conntrack_common.h @@ -4,13 +4,9 @@ #include struct ip_conntrack_stat { - unsigned int searched; unsigned int found; - unsigned int new; unsigned int invalid; unsigned int ignore; - unsigned int delete; - unsigned int delete_list; unsigned int insert; unsigned int insert_failed; unsigned int drop; -- cgit v1.2.3 From 2c9d85d4d82d9e0a62aad08bf50650804e68ed30 Mon Sep 17 00:00:00 2001 From: Nogah Frankel Date: Fri, 16 Sep 2016 15:05:36 +0200 Subject: netdevice: Add offload statistics ndo Add a new ndo to return statistics for offloaded operation. Since there can be many different offloaded operation with many stats types, the ndo gets an attribute id by which it knows which stats are wanted. The ndo also gets a void pointer to be cast according to the attribute id. Signed-off-by: Nogah Frankel Signed-off-by: Jiri Pirko Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/netdevice.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2095b6ab3661..a10d8d18ce19 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -924,6 +924,14 @@ struct netdev_xdp { * 3. Update dev->stats asynchronously and atomically, and define * neither operation. * + * bool (*ndo_has_offload_stats)(int attr_id) + * Return true if this device supports offload stats of this attr_id. + * + * int (*ndo_get_offload_stats)(int attr_id, const struct net_device *dev, + * void *attr_data) + * Get statistics for offload operations by attr_id. 
Write it into the + * attr_data pointer. + * * int (*ndo_vlan_rx_add_vid)(struct net_device *dev, __be16 proto, u16 vid); * If device supports VLAN filtering this function is called when a * VLAN id is registered. @@ -1155,6 +1163,10 @@ struct net_device_ops { struct rtnl_link_stats64* (*ndo_get_stats64)(struct net_device *dev, struct rtnl_link_stats64 *storage); + bool (*ndo_has_offload_stats)(int attr_id); + int (*ndo_get_offload_stats)(int attr_id, + const struct net_device *dev, + void *attr_data); struct net_device_stats* (*ndo_get_stats)(struct net_device *dev); int (*ndo_vlan_rx_add_vid)(struct net_device *dev, -- cgit v1.2.3 From e8bffe0cf964f0330595bb376b74921cccdaac88 Mon Sep 17 00:00:00 2001 From: Mahesh Bandewar Date: Fri, 16 Sep 2016 12:59:13 -0700 Subject: net: Add _nf_(un)register_hooks symbols Add _nf_register_hooks() and _nf_unregister_hooks() calls which allow caller to hold RTNL mutex. Signed-off-by: Mahesh Bandewar CC: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/linux/netfilter.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 9230f9aee896..e82b76781bf6 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -133,6 +133,8 @@ int nf_register_hook(struct nf_hook_ops *reg); void nf_unregister_hook(struct nf_hook_ops *reg); int nf_register_hooks(struct nf_hook_ops *reg, unsigned int n); void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n); +int _nf_register_hooks(struct nf_hook_ops *reg, unsigned int n); +void _nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n); /* Functions to register get/setsockopt ranges (non-inclusive). You need to check permissions yourself! 
*/ -- cgit v1.2.3 From ca26893f05e86497a86732768ec53cd38c0819ca Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 19 Sep 2016 19:00:09 +0800 Subject: rhashtable: Add rhlist interface The insecure_elasticity setting is an ugly wart brought out by users who need to insert duplicate objects (that is, distinct objects with identical keys) into the same table. In fact, those users have a much bigger problem. Once those duplicate objects are inserted, they don't have an interface to find them (unless you count the walker interface which walks over the entire table). Some users have resorted to doing a manual walk over the hash table which is of course broken because they don't handle the potential existence of multiple hash tables. The result is that they will break sporadically when they encounter a hash table resize/rehash. This patch provides a way out for those users, at the expense of an extra pointer per object. Essentially each object is now a list of objects carrying the same key. The hash table will only see the lists so nothing changes as far as rhashtable is concerned. To use this new interface, you need to insert a struct rhlist_head into your objects instead of struct rhash_head. While the hash table is unchanged, for type-safety you'll need to use struct rhltable instead of struct rhashtable. All the existing interfaces have been duplicated for rhlist, including the hash table walker. One missing feature is nulls marking because AFAIK the only potential user of it does not need duplicate objects. Should anyone need this it shouldn't be too hard to add. Signed-off-by: Herbert Xu Acked-by: Thomas Graf Signed-off-by: David S. 
Miller --- include/linux/rhashtable.h | 491 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 377 insertions(+), 114 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index fd82584acd48..5c132d3188be 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -1,7 +1,7 @@ /* * Resizable, Scalable, Concurrent Hash Table * - * Copyright (c) 2015 Herbert Xu + * Copyright (c) 2015-2016 Herbert Xu * Copyright (c) 2014-2015 Thomas Graf * Copyright (c) 2008-2014 Patrick McHardy * @@ -53,6 +53,11 @@ struct rhash_head { struct rhash_head __rcu *next; }; +struct rhlist_head { + struct rhash_head rhead; + struct rhlist_head __rcu *next; +}; + /** * struct bucket_table - Table of hash buckets * @size: Number of hash buckets @@ -137,6 +142,7 @@ struct rhashtable_params { * @key_len: Key length for hashfn * @elasticity: Maximum chain length before rehash * @p: Configuration parameters + * @rhlist: True if this is an rhltable * @run_work: Deferred worker to expand/shrink asynchronously * @mutex: Mutex to protect current/future table swapping * @lock: Spin lock to protect walker list @@ -147,11 +153,20 @@ struct rhashtable { unsigned int key_len; unsigned int elasticity; struct rhashtable_params p; + bool rhlist; struct work_struct run_work; struct mutex mutex; spinlock_t lock; }; +/** + * struct rhltable - Hash table with duplicate objects in a list + * @ht: Underlying rhtable + */ +struct rhltable { + struct rhashtable ht; +}; + /** * struct rhashtable_walker - Hash table walker * @list: List entry on list of walkers @@ -163,9 +178,10 @@ struct rhashtable_walker { }; /** - * struct rhashtable_iter - Hash table iterator, fits into netlink cb + * struct rhashtable_iter - Hash table iterator * @ht: Table to iterate through * @p: Current pointer + * @list: Current hash list pointer * @walker: Associated rhashtable walker * @slot: Current slot * @skip: Number of entries to skip in slot @@ -173,6 
+189,7 @@ struct rhashtable_walker { struct rhashtable_iter { struct rhashtable *ht; struct rhash_head *p; + struct rhlist_head *list; struct rhashtable_walker walker; unsigned int slot; unsigned int skip; @@ -339,13 +356,11 @@ static inline int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, int rhashtable_init(struct rhashtable *ht, const struct rhashtable_params *params); +int rhltable_init(struct rhltable *hlt, + const struct rhashtable_params *params); -struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht, - const void *key, - struct rhash_head *obj, - struct bucket_table *old_tbl, - void **data); -int rhashtable_insert_rehash(struct rhashtable *ht, struct bucket_table *tbl); +void *rhashtable_insert_slow(struct rhashtable *ht, const void *key, + struct rhash_head *obj); void rhashtable_walk_enter(struct rhashtable *ht, struct rhashtable_iter *iter); @@ -507,6 +522,31 @@ void rhashtable_destroy(struct rhashtable *ht); rht_for_each_entry_rcu_continue(tpos, pos, (tbl)->buckets[hash],\ tbl, hash, member) +/** + * rhl_for_each_rcu - iterate over rcu hash table list + * @pos: the &struct rlist_head to use as a loop cursor. + * @list: the head of the list + * + * This hash chain list-traversal primitive should be used on the + * list returned by rhltable_lookup. + */ +#define rhl_for_each_rcu(pos, list) \ + for (pos = list; pos; pos = rcu_dereference_raw(pos->next)) + +/** + * rhl_for_each_entry_rcu - iterate over rcu hash table list of given type + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct rlist_head to use as a loop cursor. + * @list: the head of the list + * @member: name of the &struct rlist_head within the hashable struct. + * + * This hash chain list-traversal primitive should be used on the + * list returned by rhltable_lookup. 
+ */ +#define rhl_for_each_entry_rcu(tpos, pos, list, member) \ + for (pos = list; pos && rht_entry(tpos, pos, member); \ + pos = rcu_dereference_raw(pos->next)) + static inline int rhashtable_compare(struct rhashtable_compare_arg *arg, const void *obj) { @@ -516,18 +556,8 @@ static inline int rhashtable_compare(struct rhashtable_compare_arg *arg, return memcmp(ptr + ht->p.key_offset, arg->key, ht->p.key_len); } -/** - * rhashtable_lookup_fast - search hash table, inlined version - * @ht: hash table - * @key: the pointer to the key - * @params: hash table parameters - * - * Computes the hash value for the key and traverses the bucket chain looking - * for a entry with an identical key. The first matching entry is returned. - * - * Returns the first entry on which the compare function returned true. - */ -static inline void *rhashtable_lookup_fast( +/* Internal function, do not use. */ +static inline struct rhash_head *__rhashtable_lookup( struct rhashtable *ht, const void *key, const struct rhashtable_params params) { @@ -539,8 +569,6 @@ static inline void *rhashtable_lookup_fast( struct rhash_head *he; unsigned int hash; - rcu_read_lock(); - tbl = rht_dereference_rcu(ht->tbl, ht); restart: hash = rht_key_hashfn(ht, tbl, key, params); @@ -549,8 +577,7 @@ restart: params.obj_cmpfn(&arg, rht_obj(ht, he)) : rhashtable_compare(&arg, rht_obj(ht, he))) continue; - rcu_read_unlock(); - return rht_obj(ht, he); + return he; } /* Ensure we see any new tables. */ @@ -559,96 +586,165 @@ restart: tbl = rht_dereference_rcu(tbl->future_tbl, ht); if (unlikely(tbl)) goto restart; - rcu_read_unlock(); return NULL; } +/** + * rhashtable_lookup - search hash table + * @ht: hash table + * @key: the pointer to the key + * @params: hash table parameters + * + * Computes the hash value for the key and traverses the bucket chain looking + * for a entry with an identical key. The first matching entry is returned. + * + * This must only be called under the RCU read lock. 
+ * + * Returns the first entry on which the compare function returned true. + */ +static inline void *rhashtable_lookup( + struct rhashtable *ht, const void *key, + const struct rhashtable_params params) +{ + struct rhash_head *he = __rhashtable_lookup(ht, key, params); + + return he ? rht_obj(ht, he) : NULL; +} + +/** + * rhashtable_lookup_fast - search hash table, without RCU read lock + * @ht: hash table + * @key: the pointer to the key + * @params: hash table parameters + * + * Computes the hash value for the key and traverses the bucket chain looking + * for a entry with an identical key. The first matching entry is returned. + * + * Only use this function when you have other mechanisms guaranteeing + * that the object won't go away after the RCU read lock is released. + * + * Returns the first entry on which the compare function returned true. + */ +static inline void *rhashtable_lookup_fast( + struct rhashtable *ht, const void *key, + const struct rhashtable_params params) +{ + void *obj; + + rcu_read_lock(); + obj = rhashtable_lookup(ht, key, params); + rcu_read_unlock(); + + return obj; +} + +/** + * rhltable_lookup - search hash list table + * @hlt: hash table + * @key: the pointer to the key + * @params: hash table parameters + * + * Computes the hash value for the key and traverses the bucket chain looking + * for a entry with an identical key. All matching entries are returned + * in a list. + * + * This must only be called under the RCU read lock. + * + * Returns the list of entries that match the given key. + */ +static inline struct rhlist_head *rhltable_lookup( + struct rhltable *hlt, const void *key, + const struct rhashtable_params params) +{ + struct rhash_head *he = __rhashtable_lookup(&hlt->ht, key, params); + + return he ? container_of(he, struct rhlist_head, rhead) : NULL; +} + /* Internal function, please use rhashtable_insert_fast() instead. 
This * function returns the existing element already in hashes in there is a clash, * otherwise it returns an error via ERR_PTR(). */ static inline void *__rhashtable_insert_fast( struct rhashtable *ht, const void *key, struct rhash_head *obj, - const struct rhashtable_params params) + const struct rhashtable_params params, bool rhlist) { struct rhashtable_compare_arg arg = { .ht = ht, .key = key, }; - struct bucket_table *tbl, *new_tbl; + struct rhash_head __rcu **pprev; + struct bucket_table *tbl; struct rhash_head *head; spinlock_t *lock; - unsigned int elasticity; unsigned int hash; - void *data = NULL; - int err; + int elasticity; + void *data; -restart: rcu_read_lock(); tbl = rht_dereference_rcu(ht->tbl, ht); + hash = rht_head_hashfn(ht, tbl, obj, params); + lock = rht_bucket_lock(tbl, hash); + spin_lock_bh(lock); - /* All insertions must grab the oldest table containing - * the hashed bucket that is yet to be rehashed. - */ - for (;;) { - hash = rht_head_hashfn(ht, tbl, obj, params); - lock = rht_bucket_lock(tbl, hash); - spin_lock_bh(lock); - - if (tbl->rehash <= hash) - break; - + if (unlikely(rht_dereference_bucket(tbl->future_tbl, tbl, hash))) { +slow_path: spin_unlock_bh(lock); - tbl = rht_dereference_rcu(tbl->future_tbl, ht); + rcu_read_unlock(); + return rhashtable_insert_slow(ht, key, obj); } - new_tbl = rht_dereference_rcu(tbl->future_tbl, ht); - if (unlikely(new_tbl)) { - tbl = rhashtable_insert_slow(ht, key, obj, new_tbl, &data); - if (!IS_ERR_OR_NULL(tbl)) - goto slow_path; + elasticity = ht->elasticity; + pprev = &tbl->buckets[hash]; + rht_for_each(head, tbl, hash) { + struct rhlist_head *plist; + struct rhlist_head *list; + + elasticity--; + if (!key || + (params.obj_cmpfn ? 
+ params.obj_cmpfn(&arg, rht_obj(ht, head)) : + rhashtable_compare(&arg, rht_obj(ht, head)))) + continue; + + data = rht_obj(ht, head); - err = PTR_ERR(tbl); - if (err == -EEXIST) - err = 0; + if (!rhlist) + goto out; - goto out; - } - err = -E2BIG; - if (unlikely(rht_grow_above_max(ht, tbl))) - goto out; + list = container_of(obj, struct rhlist_head, rhead); + plist = container_of(head, struct rhlist_head, rhead); - if (unlikely(rht_grow_above_100(ht, tbl))) { -slow_path: - spin_unlock_bh(lock); - err = rhashtable_insert_rehash(ht, tbl); - rcu_read_unlock(); - if (err) - return ERR_PTR(err); + RCU_INIT_POINTER(list->next, plist); + head = rht_dereference_bucket(head->next, tbl, hash); + RCU_INIT_POINTER(list->rhead.next, head); + rcu_assign_pointer(*pprev, obj); - goto restart; + goto good; } - err = 0; - elasticity = ht->elasticity; - rht_for_each(head, tbl, hash) { - if (key && - unlikely(!(params.obj_cmpfn ? - params.obj_cmpfn(&arg, rht_obj(ht, head)) : - rhashtable_compare(&arg, rht_obj(ht, head))))) { - data = rht_obj(ht, head); - goto out; - } - if (!--elasticity) - goto slow_path; - } + if (elasticity <= 0) + goto slow_path; + + data = ERR_PTR(-E2BIG); + if (unlikely(rht_grow_above_max(ht, tbl))) + goto out; + + if (unlikely(rht_grow_above_100(ht, tbl))) + goto slow_path; head = rht_dereference_bucket(tbl->buckets[hash], tbl, hash); RCU_INIT_POINTER(obj->next, head); + if (rhlist) { + struct rhlist_head *list; + + list = container_of(obj, struct rhlist_head, rhead); + RCU_INIT_POINTER(list->next, NULL); + } rcu_assign_pointer(tbl->buckets[hash], obj); @@ -656,11 +752,14 @@ slow_path: if (rht_grow_above_75(ht, tbl)) schedule_work(&ht->run_work); +good: + data = NULL; + out: spin_unlock_bh(lock); rcu_read_unlock(); - return err ? 
ERR_PTR(err) : data; + return data; } /** @@ -685,13 +784,65 @@ static inline int rhashtable_insert_fast( { void *ret; - ret = __rhashtable_insert_fast(ht, NULL, obj, params); + ret = __rhashtable_insert_fast(ht, NULL, obj, params, false); if (IS_ERR(ret)) return PTR_ERR(ret); return ret == NULL ? 0 : -EEXIST; } +/** + * rhltable_insert_key - insert object into hash list table + * @hlt: hash list table + * @key: the pointer to the key + * @list: pointer to hash list head inside object + * @params: hash table parameters + * + * Will take a per bucket spinlock to protect against mutual mutations + * on the same bucket. Multiple insertions may occur in parallel unless + * they map to the same bucket lock. + * + * It is safe to call this function from atomic context. + * + * Will trigger an automatic deferred table resizing if the size grows + * beyond the watermark indicated by grow_decision() which can be passed + * to rhashtable_init(). + */ +static inline int rhltable_insert_key( + struct rhltable *hlt, const void *key, struct rhlist_head *list, + const struct rhashtable_params params) +{ + return PTR_ERR(__rhashtable_insert_fast(&hlt->ht, key, &list->rhead, + params, true)); +} + +/** + * rhltable_insert - insert object into hash list table + * @hlt: hash list table + * @list: pointer to hash list head inside object + * @params: hash table parameters + * + * Will take a per bucket spinlock to protect against mutual mutations + * on the same bucket. Multiple insertions may occur in parallel unless + * they map to the same bucket lock. + * + * It is safe to call this function from atomic context. + * + * Will trigger an automatic deferred table resizing if the size grows + * beyond the watermark indicated by grow_decision() which can be passed + * to rhashtable_init(). 
+ */ +static inline int rhltable_insert( + struct rhltable *hlt, struct rhlist_head *list, + const struct rhashtable_params params) +{ + const char *key = rht_obj(&hlt->ht, &list->rhead); + + key += params.key_offset; + + return rhltable_insert_key(hlt, key, list, params); +} + /** * rhashtable_lookup_insert_fast - lookup and insert object into hash table * @ht: hash table @@ -722,7 +873,8 @@ static inline int rhashtable_lookup_insert_fast( BUG_ON(ht->p.obj_hashfn); - ret = __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params); + ret = __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params, + false); if (IS_ERR(ret)) return PTR_ERR(ret); @@ -759,7 +911,7 @@ static inline int rhashtable_lookup_insert_key( BUG_ON(!ht->p.obj_hashfn || !key); - ret = __rhashtable_insert_fast(ht, key, obj, params); + ret = __rhashtable_insert_fast(ht, key, obj, params, false); if (IS_ERR(ret)) return PTR_ERR(ret); @@ -783,13 +935,14 @@ static inline void *rhashtable_lookup_get_insert_key( { BUG_ON(!ht->p.obj_hashfn || !key); - return __rhashtable_insert_fast(ht, key, obj, params); + return __rhashtable_insert_fast(ht, key, obj, params, false); } /* Internal function, please use rhashtable_remove_fast() instead */ -static inline int __rhashtable_remove_fast( +static inline int __rhashtable_remove_fast_one( struct rhashtable *ht, struct bucket_table *tbl, - struct rhash_head *obj, const struct rhashtable_params params) + struct rhash_head *obj, const struct rhashtable_params params, + bool rhlist) { struct rhash_head __rcu **pprev; struct rhash_head *he; @@ -804,39 +957,66 @@ static inline int __rhashtable_remove_fast( pprev = &tbl->buckets[hash]; rht_for_each(he, tbl, hash) { + struct rhlist_head *list; + + list = container_of(he, struct rhlist_head, rhead); + if (he != obj) { + struct rhlist_head __rcu **lpprev; + pprev = &he->next; - continue; + + if (!rhlist) + continue; + + do { + lpprev = &list->next; + list = rht_dereference_bucket(list->next, + tbl, hash); + 
} while (list && obj != &list->rhead); + + if (!list) + continue; + + list = rht_dereference_bucket(list->next, tbl, hash); + RCU_INIT_POINTER(*lpprev, list); + err = 0; + break; } - rcu_assign_pointer(*pprev, obj->next); - err = 0; + obj = rht_dereference_bucket(obj->next, tbl, hash); + err = 1; + + if (rhlist) { + list = rht_dereference_bucket(list->next, tbl, hash); + if (list) { + RCU_INIT_POINTER(list->rhead.next, obj); + obj = &list->rhead; + err = 0; + } + } + + rcu_assign_pointer(*pprev, obj); break; } spin_unlock_bh(lock); + if (err > 0) { + atomic_dec(&ht->nelems); + if (unlikely(ht->p.automatic_shrinking && + rht_shrink_below_30(ht, tbl))) + schedule_work(&ht->run_work); + err = 0; + } + return err; } -/** - * rhashtable_remove_fast - remove object from hash table - * @ht: hash table - * @obj: pointer to hash head inside object - * @params: hash table parameters - * - * Since the hash chain is single linked, the removal operation needs to - * walk the bucket chain upon removal. The removal operation is thus - * considerable slow if the hash table is not correctly sized. - * - * Will automatically shrink the table via rhashtable_expand() if the - * shrink_decision function specified at rhashtable_init() returns true. - * - * Returns zero on success, -ENOENT if the entry could not be found. - */ -static inline int rhashtable_remove_fast( +/* Internal function, please use rhashtable_remove_fast() instead */ +static inline int __rhashtable_remove_fast( struct rhashtable *ht, struct rhash_head *obj, - const struct rhashtable_params params) + const struct rhashtable_params params, bool rhlist) { struct bucket_table *tbl; int err; @@ -850,24 +1030,60 @@ static inline int rhashtable_remove_fast( * visible then that guarantees the entry to still be in * the old tbl if it exists. 
*/ - while ((err = __rhashtable_remove_fast(ht, tbl, obj, params)) && + while ((err = __rhashtable_remove_fast_one(ht, tbl, obj, params, + rhlist)) && (tbl = rht_dereference_rcu(tbl->future_tbl, ht))) ; - if (err) - goto out; - - atomic_dec(&ht->nelems); - if (unlikely(ht->p.automatic_shrinking && - rht_shrink_below_30(ht, tbl))) - schedule_work(&ht->run_work); - -out: rcu_read_unlock(); return err; } +/** + * rhashtable_remove_fast - remove object from hash table + * @ht: hash table + * @obj: pointer to hash head inside object + * @params: hash table parameters + * + * Since the hash chain is single linked, the removal operation needs to + * walk the bucket chain upon removal. The removal operation is thus + * considerable slow if the hash table is not correctly sized. + * + * Will automatically shrink the table via rhashtable_expand() if the + * shrink_decision function specified at rhashtable_init() returns true. + * + * Returns zero on success, -ENOENT if the entry could not be found. + */ +static inline int rhashtable_remove_fast( + struct rhashtable *ht, struct rhash_head *obj, + const struct rhashtable_params params) +{ + return __rhashtable_remove_fast(ht, obj, params, false); +} + +/** + * rhltable_remove - remove object from hash list table + * @hlt: hash list table + * @list: pointer to hash list head inside object + * @params: hash table parameters + * + * Since the hash chain is single linked, the removal operation needs to + * walk the bucket chain upon removal. The removal operation is thus + * considerable slow if the hash table is not correctly sized. + * + * Will automatically shrink the table via rhashtable_expand() if the + * shrink_decision function specified at rhashtable_init() returns true. + * + * Returns zero on success, -ENOENT if the entry could not be found. 
+ */ +static inline int rhltable_remove( + struct rhltable *hlt, struct rhlist_head *list, + const struct rhashtable_params params) +{ + return __rhashtable_remove_fast(&hlt->ht, &list->rhead, params, true); +} + /* Internal function, please use rhashtable_replace_fast() instead */ static inline int __rhashtable_replace_fast( struct rhashtable *ht, struct bucket_table *tbl, @@ -958,4 +1174,51 @@ static inline int rhashtable_walk_init(struct rhashtable *ht, return 0; } +/** + * rhltable_walk_enter - Initialise an iterator + * @hlt: Table to walk over + * @iter: Hash table Iterator + * + * This function prepares a hash table walk. + * + * Note that if you restart a walk after rhashtable_walk_stop you + * may see the same object twice. Also, you may miss objects if + * there are removals in between rhashtable_walk_stop and the next + * call to rhashtable_walk_start. + * + * For a completely stable walk you should construct your own data + * structure outside the hash table. + * + * This function may sleep so you must not call it from interrupt + * context or with spin locks held. + * + * You must call rhashtable_walk_exit after this function returns. + */ +static inline void rhltable_walk_enter(struct rhltable *hlt, + struct rhashtable_iter *iter) +{ + return rhashtable_walk_enter(&hlt->ht, iter); +} + +/** + * rhltable_free_and_destroy - free elements and destroy hash list table + * @hlt: the hash list table to destroy + * @free_fn: callback to release resources of element + * @arg: pointer passed to free_fn + * + * See documentation for rhashtable_free_and_destroy. 
+ */ +static inline void rhltable_free_and_destroy(struct rhltable *hlt, + void (*free_fn)(void *ptr, + void *arg), + void *arg) +{ + return rhashtable_free_and_destroy(&hlt->ht, free_fn, arg); +} + +static inline void rhltable_destroy(struct rhltable *hlt) +{ + return rhltable_free_and_destroy(hlt, NULL, NULL); +} + #endif /* _LINUX_RHASHTABLE_H */ -- cgit v1.2.3 From 36bbef52c7eb646ed6247055a2acd3851e317857 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 20 Sep 2016 00:26:13 +0200 Subject: bpf: direct packet write and access for helpers for clsact progs This work implements direct packet access for helpers and direct packet write in a similar fashion as already available for XDP types via commits 4acf6c0b84c9 ("bpf: enable direct packet data write for xdp progs") and 6841de8b0d03 ("bpf: allow helpers access the packet directly"), and as a complementary feature to the already available direct packet read for tc (cls/act) programs. For enabling this, we need to introduce two helpers, bpf_skb_pull_data() and bpf_csum_update(). The first is generally needed for both, read and write, because they would otherwise only be limited to the current linear skb head. Usually, when the data_end test fails, programs just bail out, or, in the direct read case, use bpf_skb_load_bytes() as an alternative to overcome this limitation. If such data sits in non-linear parts, we can just pull them in once with the new helper, retest and eventually access them. At the same time, this also makes sure the skb is uncloned, which is, of course, a necessary condition for direct write. As this needs to be an invariant for the write part only, the verifier detects writes and adds a prologue that is calling bpf_skb_pull_data() to effectively unclone the skb from the very beginning in case it is indeed cloned. The heuristic makes use of a similar trick that was done in 233577a22089 ("net: filter: constify detection of pkt_type_offset"). 
This comes at zero cost for other programs that do not use the direct write feature. Should a program use this feature only sparsely and has read access for the most part with, for example, drop return codes, then such write action can be delegated to a tail called program for mitigating this cost of potential uncloning to a late point in time where it would have been paid similarly with the bpf_skb_store_bytes() as well. The advantage of direct write is that the writes are inlined, whereas the helper cannot make any length assumptions and thus needs to generate a call to memcpy() even for small sizes; the cost of the helper call itself, with its sanity checks, is also avoided. Plus, when direct read is already used, we don't need to cache or perform rechecks on the data boundaries (due to verifier invalidating previous checks for helpers that change skb->data), so more complex programs using rewrites can benefit from switching to direct read plus write. For direct packet access to helpers, we save the otherwise needed copy into a temp struct sitting on stack memory when use-case allows. Both facilities are enabled via may_access_direct_pkt_data() in verifier. For now, we limit this to map helpers and csum_diff, and can successively enable other helpers where we find it makes sense. Helpers that definitely cannot be allowed for this are those part of bpf_helper_changes_skb_data() since they can change underlying data, and those that write into memory as this could happen for packet typed args when still cloned. bpf_csum_update() helper accommodates for the fact that we need to fixup checksum_complete when using direct write instead of bpf_skb_store_bytes(), meaning the programs can use available helpers like bpf_csum_diff(), and implement csum_add(), csum_sub(), csum_block_add(), csum_block_sub() equivalents in eBPF together with the new helper. A usage example will be provided for iproute2's examples/bpf/ directory.
Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 4 +++- include/linux/skbuff.h | 14 ++++++++++++-- 2 files changed, 15 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9a904f63f8c1..5691fdc83819 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -96,6 +96,7 @@ enum bpf_return_type { struct bpf_func_proto { u64 (*func)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); bool gpl_only; + bool pkt_access; enum bpf_return_type ret_type; enum bpf_arg_type arg1_type; enum bpf_arg_type arg2_type; @@ -151,7 +152,8 @@ struct bpf_verifier_ops { */ bool (*is_valid_access)(int off, int size, enum bpf_access_type type, enum bpf_reg_type *reg_type); - + int (*gen_prologue)(struct bpf_insn *insn, bool direct_write, + const struct bpf_prog *prog); u32 (*convert_ctx_access)(enum bpf_access_type type, int dst_reg, int src_reg, int ctx_off, struct bpf_insn *insn, struct bpf_prog *prog); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 4c5662f05bda..c6dab3f7457c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -676,13 +676,23 @@ struct sk_buff { */ kmemcheck_bitfield_begin(flags1); __u16 queue_mapping; + +/* if you move cloned around you also must adapt those constants */ +#ifdef __BIG_ENDIAN_BITFIELD +#define CLONED_MASK (1 << 7) +#else +#define CLONED_MASK 1 +#endif +#define CLONED_OFFSET() offsetof(struct sk_buff, __cloned_offset) + + __u8 __cloned_offset[0]; __u8 cloned:1, nohdr:1, fclone:2, peeked:1, head_frag:1, - xmit_more:1; - /* one bit hole */ + xmit_more:1, + __unused:1; /* one bit hole */ kmemcheck_bitfield_end(flags1); /* fields enclosed in headers_start/headers_end are copied -- cgit v1.2.3 From a4f1f9ac8153e22869b6408832b5a9bb9c762bf6 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 19 Sep 2016 23:39:09 -0400 Subject: lib/win_minmax: windowed min or max estimator This commit introduces a 
generic library to estimate either the min or max value of a time-varying variable over a recent time window. This is code originally from Kathleen Nichols. The current form of the code is from Van Jacobson. A single struct minmax_sample will track the estimated windowed-max value of the series if you call minmax_running_max() or the estimated windowed-min value of the series if you call minmax_running_min(). Nearly equivalent code is already in place for minimum RTT estimation in the TCP stack. This commit extracts that code and generalizes it to handle both min and max. Moving the code here reduces the footprint and complexity of the TCP code base and makes the filter generally available for other parts of the codebase, including an upcoming TCP congestion control module. This library works well for time series where the measurements are smoothly increasing or decreasing. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/linux/win_minmax.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 include/linux/win_minmax.h (limited to 'include/linux') diff --git a/include/linux/win_minmax.h b/include/linux/win_minmax.h new file mode 100644 index 000000000000..56569604278f --- /dev/null +++ b/include/linux/win_minmax.h @@ -0,0 +1,37 @@ +/** + * lib/minmax.c: windowed min/max tracker by Kathleen Nichols. 
+ * + */ +#ifndef MINMAX_H +#define MINMAX_H + +#include + +/* A single data point for our parameterized min-max tracker */ +struct minmax_sample { + u32 t; /* time measurement was taken */ + u32 v; /* value measured */ +}; + +/* State for the parameterized min-max tracker */ +struct minmax { + struct minmax_sample s[3]; +}; + +static inline u32 minmax_get(const struct minmax *m) +{ + return m->s[0].v; +} + +static inline u32 minmax_reset(struct minmax *m, u32 t, u32 meas) +{ + struct minmax_sample val = { .t = t, .v = meas }; + + m->s[2] = m->s[1] = m->s[0] = val; + return m->s[0].v; +} + +u32 minmax_running_max(struct minmax *m, u32 win, u32 t, u32 meas); +u32 minmax_running_min(struct minmax *m, u32 win, u32 t, u32 meas); + +#endif -- cgit v1.2.3 From 6403389211e1f4d40ed963fe47a96fce1a3ba7a9 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 19 Sep 2016 23:39:10 -0400 Subject: tcp: use windowed min filter library for TCP min_rtt estimation Refactor the TCP min_rtt code to reuse the new win_minmax library in lib/win_minmax.c to simplify the TCP code. This is a pure refactor: the functionality is exactly the same. We just moved the windowed min code to make TCP easier to read and maintain, and to allow other parts of the kernel to use the windowed min/max filter code. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. 
Miller --- include/linux/tcp.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index c723a465125d..6433cc8b4667 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -19,6 +19,7 @@ #include +#include #include #include #include @@ -234,9 +235,7 @@ struct tcp_sock { u32 mdev_max_us; /* maximal mdev for the last rtt period */ u32 rttvar_us; /* smoothed mdev_max */ u32 rtt_seq; /* sequence number to update rttvar */ - struct rtt_meas { - u32 rtt, ts; /* RTT in usec and sampling time in jiffies. */ - } rtt_min[3]; + struct minmax rtt_min; u32 packets_out; /* Packets which are "in flight" */ u32 retrans_out; /* Retransmitted packets out */ -- cgit v1.2.3 From 0682e6902a52aca7caf6ad42551b16ea0f87bc31 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 19 Sep 2016 23:39:13 -0400 Subject: tcp: count packets marked lost for a TCP connection Count the number of packets that a TCP connection marks lost. Congestion control modules can use this loss rate information for more intelligent decisions about how fast to send. Specifically, this is used in TCP BBR policer detection. BBR uses a high packet loss rate as one signal in its policer detection and policer bandwidth estimation algorithm. The BBR policer detection algorithm cannot simply track retransmits, because a retransmit can be (and often is) an indicator of packets lost long, long ago. This is particularly true in a long CA_Loss period that repairs the initial massive losses when a policer kicks in. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. 
Miller --- include/linux/tcp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 6433cc8b4667..38590fbc0ac5 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -267,6 +267,7 @@ struct tcp_sock { * receiver in Recovery. */ u32 prr_out; /* Total number of pkts sent during Recovery. */ u32 delivered; /* Total data packets delivered incl. rexmits */ + u32 lost; /* Total data packets lost incl. rexmits */ u32 rcv_wnd; /* Current receiver window */ u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ -- cgit v1.2.3 From b9f64820fb226a4e8ab10591f46cecd91ca56b30 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 19 Sep 2016 23:39:14 -0400 Subject: tcp: track data delivery rate for a TCP connection This patch generates data delivery rate (throughput) samples on a per-ACK basis. These rate samples can be used by congestion control modules, and specifically will be used by TCP BBR in later patches in this series. Key state: tp->delivered: Tracks the total number of data packets (original or not) delivered so far. This is an already-existing field. tp->delivered_mstamp: the last time tp->delivered was updated. Algorithm: A rate sample is calculated as (d1 - d0)/(t1 - t0) on a per-ACK basis: d1: the current tp->delivered after processing the ACK t1: the current time after processing the ACK d0: the prior tp->delivered when the acked skb was transmitted t0: the prior tp->delivered_mstamp when the acked skb was transmitted When an skb is transmitted, we snapshot d0 and t0 in its control block in tcp_rate_skb_sent(). When an ACK arrives, it may SACK and ACK some skbs. For each SACKed or ACKed skb, tcp_rate_skb_delivered() updates the rate_sample struct to reflect the latest (d0, t0). Finally, tcp_rate_gen() generates a rate sample by storing (d1 - d0) in rs->delivered and (t1 - t0) in rs->interval_us. 
One caveat: if an skb was sent with no packets in flight, then tp->delivered_mstamp may be either invalid (if the connection is starting) or outdated (if the connection was idle). In that case, we'll re-stamp tp->delivered_mstamp. At first glance it seems t0 should always be the time when an skb was transmitted, but actually this could over-estimate the rate due to phase mismatch between transmit and ACK events. To track the delivery rate, we ensure that if packets are in flight then t0 and t1 are times at which packets were marked delivered. If the initial and final RTTs are different then one may be corrupted by some sort of noise. The noise we see most often is sending gaps caused by delayed, compressed, or stretched acks. This either affects both RTTs equally or artificially reduces the final RTT. We approach this by recording the info we need to compute the initial RTT (duration of the "send phase" of the window) when we recorded the associated inflight. Then, for a filter to avoid bandwidth overestimates, we generalize the per-sample bandwidth computation from: bw = delivered / ack_phase_rtt to the following: bw = delivered / max(send_phase_rtt, ack_phase_rtt) In large-scale experiments, this filtering approach incorporating send_phase_rtt is effective at avoiding bandwidth overestimates due to ACK compression or stretched ACKs. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/linux/tcp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 38590fbc0ac5..c50e6aec005a 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -268,6 +268,8 @@ struct tcp_sock { u32 prr_out; /* Total number of pkts sent during Recovery. */ u32 delivered; /* Total data packets delivered incl. 
rexmits */ u32 lost; /* Total data packets lost incl. rexmits */ + struct skb_mstamp first_tx_mstamp; /* start of window send phase */ + struct skb_mstamp delivered_mstamp; /* time we reached "delivered" */ u32 rcv_wnd; /* Current receiver window */ u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ -- cgit v1.2.3 From d7722e8570fc0f1e003cee7cf37694041828918b Mon Sep 17 00:00:00 2001 From: Soheil Hassas Yeganeh Date: Mon, 19 Sep 2016 23:39:15 -0400 Subject: tcp: track application-limited rate samples This commit adds code to track whether the delivery rate represented by each rate_sample was limited by the application. Upon each transmit, we store in the is_app_limited field in the skb a boolean bit indicating whether there is a known "bubble in the pipe": a point in the rate sample interval where the sender was application-limited, and did not transmit even though the cwnd and pacing rate allowed it. This logic marks the flow app-limited on a write if *all* of the following are true: 1) There is less than 1 MSS of unsent data in the write queue available to transmit. 2) There is no packet in the sender's queues (e.g. in fq or the NIC tx queue). 3) The connection is not limited by cwnd. 4) There are no lost packets to retransmit. The tcp_rate_check_app_limited() code in tcp_rate.c determines whether the connection is application-limited at the moment. If the flow is application-limited, it sets the tp->app_limited field. If the flow is application-limited then that means there is effectively a "bubble" of silence in the pipe now, and this silence will be reflected in a lower bandwidth sample for any rate samples from now until we get an ACK indicating this bubble has exited the pipe: specifically, until we get an ACK for the next packet we transmit. When we send every skb we record in scb->tx.is_app_limited whether the resulting rate sample will be application-limited. 
The code in tcp_rate_gen() checks to see when it is safe to mark all known application-limited bubbles of silence as having exited the pipe. It does this by checking to see when the delivered count moves past the tp->app_limited marker. At this point it zeroes the tp->app_limited marker, as all known bubbles are out of the pipe. We make room for the tx.is_app_limited bit in the skb by borrowing a bit from the in_flight field used by NV to record the number of bytes in flight. The receive window in the TCP header is 16 bits, and the max receive window scaling shift factor is 14 (RFC 1323). So the max receive window offered by the TCP protocol is 2^(16+14) = 2^30. So we only need 30 bits for the tx.in_flight used by NV. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index c50e6aec005a..fdcd00ffcb66 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -268,6 +268,7 @@ struct tcp_sock { u32 prr_out; /* Total number of pkts sent during Recovery. */ u32 delivered; /* Total data packets delivered incl. rexmits */ u32 lost; /* Total data packets lost incl. rexmits */ + u32 app_limited; /* limited until "delivered" reaches this val */ struct skb_mstamp first_tx_mstamp; /* start of window send phase */ struct skb_mstamp delivered_mstamp; /* time we reached "delivered" */ -- cgit v1.2.3 From eb8329e0a04db0061f714f033b4454326ba147f4 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Mon, 19 Sep 2016 23:39:16 -0400 Subject: tcp: export data delivery rate This commit export two new fields in struct tcp_info: tcpi_delivery_rate: The most recent goodput, as measured by tcp_rate_gen(). 
If the socket is limited by the sending application (e.g., no data to send), it reports the highest measurement instead of the most recent. The unit is bytes per second (like other rate fields in tcp_info). tcpi_delivery_rate_app_limited: A boolean indicating if the goodput was measured when the socket's throughput was limited by the sending application. This delivery rate information can be useful for applications that want to know the current throughput the TCP connection is seeing, e.g. adaptive bitrate video streaming. It can also be very useful for debugging or troubleshooting. Signed-off-by: Van Jacobson Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/linux/tcp.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index fdcd00ffcb66..a17ae7b85218 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -213,7 +213,8 @@ struct tcp_sock { u8 reord; /* reordering detected */ } rack; u16 advmss; /* Advertised MSS */ - u8 unused; + u8 rate_app_limited:1, /* rate_{delivered,interval_us} limited? */ + unused:7; u8 nonagle : 4,/* Disable Nagle algorithm? 
*/ thin_lto : 1,/* Use linear timeouts for thin streams */ thin_dupack : 1,/* Fast retransmit on first dupack */ @@ -271,6 +272,8 @@ struct tcp_sock { u32 app_limited; /* limited until "delivered" reaches this val */ struct skb_mstamp first_tx_mstamp; /* start of window send phase */ struct skb_mstamp delivered_mstamp; /* time we reached "delivered" */ + u32 rate_delivered; /* saved rate sample: packets delivered */ + u32 rate_interval_us; /* saved rate sample: time elapsed */ u32 rcv_wnd; /* Current receiver window */ u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ -- cgit v1.2.3 From 332ae8e2f6ecda5e50c5c62ed62894963e3a83f5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 21 Sep 2016 11:43:53 +0100 Subject: net: cls_bpf: add hardware offload This patch adds hardware offload capability to cls_bpf classifier, similar to what have been done with U32 and flower. Signed-off-by: Jakub Kicinski Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a10d8d18ce19..69f242c71865 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -789,6 +789,7 @@ enum { TC_SETUP_CLSU32, TC_SETUP_CLSFLOWER, TC_SETUP_MATCHALL, + TC_SETUP_CLSBPF, }; struct tc_cls_u32_offload; @@ -800,6 +801,7 @@ struct tc_to_netdev { struct tc_cls_u32_offload *cls_u32; struct tc_cls_flower_offload *cls_flower; struct tc_cls_matchall_offload *cls_mall; + struct tc_cls_bpf_offload *cls_bpf; }; }; -- cgit v1.2.3 From 58e2af8b3a6b587e4ac8414343581da4349d3c0f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 21 Sep 2016 11:43:57 +0100 Subject: bpf: expose internal verfier structures Move verifier's internal structures to a header file and prefix their names with bpf_ to avoid potential namespace conflicts. Those structures will soon be used by external analyzers. 
Signed-off-by: Jakub Kicinski Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf_verifier.h | 79 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 include/linux/bpf_verifier.h (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h new file mode 100644 index 000000000000..9457a22fc6e0 --- /dev/null +++ b/include/linux/bpf_verifier.h @@ -0,0 +1,79 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#ifndef _LINUX_BPF_VERIFIER_H +#define _LINUX_BPF_VERIFIER_H 1 + +#include /* for enum bpf_reg_type */ +#include /* for MAX_BPF_STACK */ + +struct bpf_reg_state { + enum bpf_reg_type type; + union { + /* valid when type == CONST_IMM | PTR_TO_STACK | UNKNOWN_VALUE */ + s64 imm; + + /* valid when type == PTR_TO_PACKET* */ + struct { + u32 id; + u16 off; + u16 range; + }; + + /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE | + * PTR_TO_MAP_VALUE_OR_NULL + */ + struct bpf_map *map_ptr; + }; +}; + +enum bpf_stack_slot_type { + STACK_INVALID, /* nothing was stored in this stack slot */ + STACK_SPILL, /* register spilled into stack */ + STACK_MISC /* BPF program wrote some data into this slot */ +}; + +#define BPF_REG_SIZE 8 /* size of eBPF register in bytes */ + +/* state of the program: + * type of all registers and stack info + */ +struct bpf_verifier_state { + struct bpf_reg_state regs[MAX_BPF_REG]; + u8 stack_slot_type[MAX_BPF_STACK]; + struct bpf_reg_state spilled_regs[MAX_BPF_STACK / BPF_REG_SIZE]; +}; + +/* linked list of verifier states used to prune search */ +struct bpf_verifier_state_list { + struct bpf_verifier_state state; + struct bpf_verifier_state_list *next; +}; + +struct bpf_insn_aux_data { + 
enum bpf_reg_type ptr_type; /* pointer type for load/store insns */ +}; + +#define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ + +/* single container for all structs + * one verifier_env per bpf_check() call + */ +struct bpf_verifier_env { + struct bpf_prog *prog; /* eBPF program being verified */ + struct bpf_verifier_stack_elem *head; /* stack of verifier states to be processed */ + int stack_size; /* number of states to be processed */ + struct bpf_verifier_state cur_state; /* current verifier state */ + struct bpf_verifier_state_list **explored_states; /* search pruning optimization */ + struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ + u32 used_map_cnt; /* number of used maps */ + u32 id_gen; /* used to generate unique reg IDs */ + bool allow_ptr_leaks; + bool seen_direct_write; + struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ +}; + +#endif /* _LINUX_BPF_VERIFIER_H */ -- cgit v1.2.3 From 13a27dfc669724564aafa2699976ee756029fed2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 21 Sep 2016 11:43:58 +0100 Subject: bpf: enable non-core use of the verfier Advanced JIT compilers and translators may want to use eBPF verifier as a base for parsers or to perform custom checks and validations. Add ability for external users to invoke the verifier and provide callbacks to be invoked for every instruction checked. For now only the most basic callback for per-instruction pre-interpretation checks is added. More advanced users may also like to have per-instruction post callback and state comparison callback. Signed-off-by: Jakub Kicinski Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- include/linux/bpf_verifier.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 9457a22fc6e0..c5cb661712c9 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -59,6 +59,12 @@ struct bpf_insn_aux_data { #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ +struct bpf_verifier_env; +struct bpf_ext_analyzer_ops { + int (*insn_hook)(struct bpf_verifier_env *env, + int insn_idx, int prev_insn_idx); +}; + /* single container for all structs * one verifier_env per bpf_check() call */ @@ -68,6 +74,8 @@ struct bpf_verifier_env { int stack_size; /* number of states to be processed */ struct bpf_verifier_state cur_state; /* current verifier state */ struct bpf_verifier_state_list **explored_states; /* search pruning optimization */ + const struct bpf_ext_analyzer_ops *analyzer_ops; /* external analyzer ops */ + void *analyzer_priv; /* pointer to external analyzer's private data */ struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ u32 used_map_cnt; /* number of used maps */ u32 id_gen; /* used to generate unique reg IDs */ @@ -76,4 +84,7 @@ struct bpf_verifier_env { struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ }; +int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, + void *priv); + #endif /* _LINUX_BPF_VERIFIER_H */ -- cgit v1.2.3 From bfca4c520f7ea78138ddccea2de18dc062b0fefd Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Mon, 19 Sep 2016 19:11:09 +0300 Subject: net: skbuff: Export __skb_vlan_pop This exports the functionality of extracting the tag from the payload, without moving next vlan tag into hw accel tag. Signed-off-by: Shmulik Ladkani Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c6dab3f7457c..9bf60b556bd2 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3085,6 +3085,7 @@ bool skb_gso_validate_mtu(const struct sk_buff *skb, unsigned int mtu); struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features); struct sk_buff *skb_vlan_untag(struct sk_buff *skb); int skb_ensure_writable(struct sk_buff *skb, int write_len); +int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci); int skb_vlan_pop(struct sk_buff *skb); int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci); struct sk_buff *pskb_extract(struct sk_buff *skb, int off, int to_copy, -- cgit v1.2.3 From efee95f42b5dddedcaff0a0eaa44e170fc7522e8 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Tue, 20 Sep 2016 19:25:58 -0400 Subject: ptp_clock: future-proofing drivers against PTP subsystem becoming optional Drivers must be ready to accept NULL from ptp_clock_register() if the PTP clock subsystem is configured out. This patch documents that and ensures that all drivers cope well with a NULL return. Signed-off-by: Nicolas Pitre Reviewed-by: Eugenia Emantayev Acked-by: Richard Cochran Acked-by: Edward Cree Signed-off-by: David S. Miller --- include/linux/ptp_clock_kernel.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index 6b15e168148a..5ad54fc66cf0 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -127,6 +127,11 @@ struct ptp_clock; * * @info: Structure describing the new clock. * @parent: Pointer to the parent device of the new clock. + * + * Returns a valid pointer on success or PTR_ERR on failure. 
If PHC + * support is missing at the configuration level, this function + * returns NULL, and drivers are expected to gracefully handle that + * case separately. */ extern struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info, -- cgit v1.2.3 From 77f2efcbdd7133466060198e02c6e8a170c3cd14 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 22 Sep 2016 00:29:01 +0100 Subject: rxrpc: Add ktime_sub_ms() Add a ktime_sub_ms() to go with ktime_add_ms() and co. for use in AF_RXRPC RTT determination. Signed-off-by: David Howells --- include/linux/ktime.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ktime.h b/include/linux/ktime.h index 2b6a204bd8d4..aa118bad1407 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -231,6 +231,11 @@ static inline ktime_t ktime_sub_us(const ktime_t kt, const u64 usec) return ktime_sub_ns(kt, usec * NSEC_PER_USEC); } +static inline ktime_t ktime_sub_ms(const ktime_t kt, const u64 msec) +{ + return ktime_sub_ns(kt, msec * NSEC_PER_MSEC); +} + extern ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs); /** -- cgit v1.2.3 From 572de608e36279f249c9a6350f142e69f23dacab Mon Sep 17 00:00:00 2001 From: Sean Wang Date: Thu, 22 Sep 2016 10:33:54 +0800 Subject: net: ethernet: mediatek: add extension of phy-mode for TRGMII adds PHY-mode "trgmii" as an extension for the operation mode of the PHY interface for PHY_INTERFACE_MODE_TRGMII. and adds a variable trgmii inside mtk_mac as the indication to make the difference between the MAC connected to internal switch or connected to external PHY by the given configuration on the board and then to perform the corresponding setup on TRGMII hardware module. Signed-off-by: Sean Wang Cc: Florian Fainelli Signed-off-by: David S. 
Miller --- include/linux/phy.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 2d24b283aa2d..e25f1830fbcf 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -80,6 +80,7 @@ typedef enum { PHY_INTERFACE_MODE_XGMII, PHY_INTERFACE_MODE_MOCA, PHY_INTERFACE_MODE_QSGMII, + PHY_INTERFACE_MODE_TRGMII, PHY_INTERFACE_MODE_MAX, } phy_interface_t; @@ -123,6 +124,8 @@ static inline const char *phy_modes(phy_interface_t interface) return "moca"; case PHY_INTERFACE_MODE_QSGMII: return "qsgmii"; + case PHY_INTERFACE_MODE_TRGMII: + return "trgmii"; default: return "unknown"; } -- cgit v1.2.3 From 7c3d21c8153c6bfb5690e35e086b0522c42442d9 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Thu, 22 Sep 2016 12:11:13 +0300 Subject: net/mlx4_core: Preparation for VF vlan protocol 802.1ad Check device capability to support VF vlan protocol 802.1ad mode. Add vport attribute vlan protocol. Init vport vlan protocol by default to 802.1Q. Add update QP support for VF vlan protocol 802.1ad. Add func capability vlan_offload_disable to disable all vlan HW acceleration on VF while the VF is set to VF vlan protocol 802.1ad mode. No change in VF vlan protocol 802.1Q (VST) mode. Signed-off-by: Moshe Shemesh Signed-off-by: Tariq Toukan Signed-off-by: David S. 
Miller --- include/linux/mlx4/device.h | 3 +++ include/linux/mlx4/qp.h | 2 ++ 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 42da3552f7cb..59b50d3eedb4 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -221,6 +221,7 @@ enum { MLX4_DEV_CAP_FLAG2_ROCE_V1_V2 = 1ULL << 33, MLX4_DEV_CAP_FLAG2_DMFS_UC_MC_SNIFFER = 1ULL << 34, MLX4_DEV_CAP_FLAG2_DIAG_PER_PORT = 1ULL << 35, + MLX4_DEV_CAP_FLAG2_SVLAN_BY_QP = 1ULL << 36, }; enum { @@ -1371,6 +1372,8 @@ int mlx4_SET_PORT_fcs_check(struct mlx4_dev *dev, u8 port, int mlx4_SET_PORT_VXLAN(struct mlx4_dev *dev, u8 port, u8 steering, int enable); int set_phv_bit(struct mlx4_dev *dev, u8 port, int new_val); int get_phv_bit(struct mlx4_dev *dev, u8 port, int *phv); +int mlx4_get_is_vlan_offload_disabled(struct mlx4_dev *dev, u8 port, + bool *vlan_offload_disabled); int mlx4_find_cached_mac(struct mlx4_dev *dev, u8 port, u64 mac, int *idx); int mlx4_find_cached_vlan(struct mlx4_dev *dev, u8 port, u16 vid, int *idx); int mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index); diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h index deaa2217214d..b4ee8f62ce8d 100644 --- a/include/linux/mlx4/qp.h +++ b/include/linux/mlx4/qp.h @@ -160,6 +160,7 @@ struct mlx4_qp_path { enum { /* fl */ MLX4_FL_CV = 1 << 6, + MLX4_FL_SV = 1 << 5, MLX4_FL_ETH_HIDE_CQE_VLAN = 1 << 2, MLX4_FL_ETH_SRC_CHECK_MC_LB = 1 << 1, MLX4_FL_ETH_SRC_CHECK_UC_LB = 1 << 0, @@ -267,6 +268,7 @@ enum { MLX4_UPD_QP_PATH_MASK_FVL_RX = 16 + 32, MLX4_UPD_QP_PATH_MASK_ETH_SRC_CHECK_UC_LB = 18 + 32, MLX4_UPD_QP_PATH_MASK_ETH_SRC_CHECK_MC_LB = 19 + 32, + MLX4_UPD_QP_PATH_MASK_SV = 22 + 32, }; enum { /* param3 */ -- cgit v1.2.3 From 79aab093a0b5370d7fc4e99df75996f4744dc03f Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Thu, 22 Sep 2016 12:11:15 +0300 Subject: net: Update API for VF vlan protocol 802.1ad support Introduce new rtnl UAPI 
that exposes a list of vlans per VF, giving the ability for user-space application to specify it for the VF, as an option to support 802.1ad. We adjusted IP Link tool to support this option. For future use cases, the new UAPI supports multiple vlans. For now we limit the list size to a single vlan in kernel. Add IFLA_VF_VLAN_LIST in addition to IFLA_VF_VLAN to keep backward compatibility with older versions of IP Link tool. Add a vlan protocol parameter to the ndo_set_vf_vlan callback. We kept 802.1Q as the drivers' default vlan protocol. Suitable ip link tool command examples: Set vf vlan protocol 802.1ad: ip link set eth0 vf 1 vlan 100 proto 802.1ad Set vf to VST (802.1Q) mode: ip link set eth0 vf 1 vlan 100 proto 802.1Q Or by omitting the new parameter ip link set eth0 vf 1 vlan 100 Signed-off-by: Moshe Shemesh Signed-off-by: Tariq Toukan Signed-off-by: David S. Miller --- include/linux/if_link.h | 1 + include/linux/netdevice.h | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_link.h b/include/linux/if_link.h index f923d15b432c..0b17c585b5cd 100644 --- a/include/linux/if_link.h +++ b/include/linux/if_link.h @@ -25,5 +25,6 @@ struct ifla_vf_info { __u32 max_tx_rate; __u32 rss_query_en; __u32 trusted; + __be16 vlan_proto; }; #endif /* _LINUX_IF_LINK_H */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 69f242c71865..1e8a5c734d72 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -946,7 +946,8 @@ struct netdev_xdp { * * SR-IOV management functions. 
* int (*ndo_set_vf_mac)(struct net_device *dev, int vf, u8* mac); - * int (*ndo_set_vf_vlan)(struct net_device *dev, int vf, u16 vlan, u8 qos); + * int (*ndo_set_vf_vlan)(struct net_device *dev, int vf, u16 vlan, + * u8 qos, __be16 proto); * int (*ndo_set_vf_rate)(struct net_device *dev, int vf, int min_tx_rate, * int max_tx_rate); * int (*ndo_set_vf_spoofchk)(struct net_device *dev, int vf, bool setting); @@ -1187,7 +1188,8 @@ struct net_device_ops { int (*ndo_set_vf_mac)(struct net_device *dev, int queue, u8 *mac); int (*ndo_set_vf_vlan)(struct net_device *dev, - int queue, u16 vlan, u8 qos); + int queue, u16 vlan, + u8 qos, __be16 proto); int (*ndo_set_vf_rate)(struct net_device *dev, int vf, int min_tx_rate, int max_tx_rate); -- cgit v1.2.3 From b42959dc35a533a531dd698b581193a65a5da831 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Thu, 22 Sep 2016 12:11:16 +0300 Subject: net/mlx4: Add VF vlan protocol 802.1ad support Move the vf to VST 802.1ad mode (mlx4 VST QinQ mode) by setting vf vlan protocol to 802.1ad. VST 802.1ad mode in mlx4, is used for STAG strip/insertion by PF, while the CTAG is set by the VF. Read current vlan protocol as part of the vf configuration state. Upon setting vf vlan protocol to 802.1ad, we use a mechanism of handshake to verify that both the vf and the pf driver version support it. The handshake uses the command QUERY_FUNC_CAP: - The vf sets a pre-defined support bit in input modifier. - A pf that supports the feature sends the request to the vf through a pre-defined field in the output mailbox. - In case vf does not support the feature, the pf will fail the control command (in this case, IP link tool command to set the vf vlan protocol to 802.1ad). No change in VST 802.1Q mode. Signed-off-by: Moshe Shemesh Signed-off-by: Tariq Toukan Signed-off-by: David S. 
Miller --- include/linux/mlx4/cmd.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx4/cmd.h b/include/linux/mlx4/cmd.h index 116b284bc4ce..1f3568694a57 100644 --- a/include/linux/mlx4/cmd.h +++ b/include/linux/mlx4/cmd.h @@ -309,7 +309,8 @@ int mlx4_get_vf_stats(struct mlx4_dev *dev, int port, int vf_idx, struct ifla_vf_stats *vf_stats); u32 mlx4_comm_get_version(void); int mlx4_set_vf_mac(struct mlx4_dev *dev, int port, int vf, u64 mac); -int mlx4_set_vf_vlan(struct mlx4_dev *dev, int port, int vf, u16 vlan, u8 qos); +int mlx4_set_vf_vlan(struct mlx4_dev *dev, int port, int vf, u16 vlan, + u8 qos, __be16 proto); int mlx4_set_vf_rate(struct mlx4_dev *dev, int port, int vf, int min_tx_rate, int max_tx_rate); int mlx4_set_vf_spoofchk(struct mlx4_dev *dev, int port, int vf, bool setting); -- cgit v1.2.3 From fe72926b792e52ab00abfa81a201805bfb2247d6 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 21 Sep 2016 11:35:02 -0400 Subject: netfilter: call nf_hook_state_init with rcu_read_lock held This makes things simpler because we can store the head of the list in the nf_state structure without worrying about concurrent add/delete of hook elements from the list. A future commit will make use of this to implement a simpler linked-list. 
Signed-off-by: Florian Westphal Signed-off-by: Aaron Conole Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 8 +++++++- include/linux/netfilter_ingress.h | 1 + 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 9230f9aee896..ad444f0b4ed0 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -174,10 +174,16 @@ static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook, if (!list_empty(hook_list)) { struct nf_hook_state state; + int ret; + /* We may already have this, but read-locks nest anyway */ + rcu_read_lock(); nf_hook_state_init(&state, hook_list, hook, thresh, pf, indev, outdev, sk, net, okfn); - return nf_hook_slow(skb, &state); + + ret = nf_hook_slow(skb, &state); + rcu_read_unlock(); + return ret; } return 1; } diff --git a/include/linux/netfilter_ingress.h b/include/linux/netfilter_ingress.h index 5fcd375ef175..6965ba09eba7 100644 --- a/include/linux/netfilter_ingress.h +++ b/include/linux/netfilter_ingress.h @@ -14,6 +14,7 @@ static inline bool nf_hook_ingress_active(const struct sk_buff *skb) return !list_empty(&skb->dev->nf_hooks_ingress); } +/* caller must hold rcu_read_lock */ static inline int nf_hook_ingress(struct sk_buff *skb) { struct nf_hook_state state; -- cgit v1.2.3 From e3b37f11e6e4e6b6f02cc762f182ce233d2c1c9d Mon Sep 17 00:00:00 2001 From: Aaron Conole Date: Wed, 21 Sep 2016 11:35:07 -0400 Subject: netfilter: replace list_head with single linked list The netfilter hook list never uses the prev pointer, and so can be trimmed to be a simple singly-linked list. In addition to having a more light weight structure for hook traversal, struct net becomes 5568 bytes (down from 6400) and struct net_device becomes 2176 bytes (down from 2240). 
Signed-off-by: Aaron Conole Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netdevice.h | 2 +- include/linux/netfilter.h | 63 +++++++++++++++++++++------------------ include/linux/netfilter_ingress.h | 17 +++++++---- 3 files changed, 47 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 67bb978470dc..41f49f5ab62a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1783,7 +1783,7 @@ struct net_device { #endif struct netdev_queue __rcu *ingress_queue; #ifdef CONFIG_NETFILTER_INGRESS - struct list_head nf_hooks_ingress; + struct nf_hook_entry __rcu *nf_hooks_ingress; #endif unsigned char broadcast[MAX_ADDR_LEN]; diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index ad444f0b4ed0..44e20dac98a9 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -55,12 +55,34 @@ struct nf_hook_state { struct net_device *out; struct sock *sk; struct net *net; - struct list_head *hook_list; + struct nf_hook_entry __rcu *hook_entries; int (*okfn)(struct net *, struct sock *, struct sk_buff *); }; +typedef unsigned int nf_hookfn(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state); +struct nf_hook_ops { + struct list_head list; + + /* User fills in from here down. */ + nf_hookfn *hook; + struct net_device *dev; + void *priv; + u_int8_t pf; + unsigned int hooknum; + /* Hooks are ordered in ascending priority. 
*/ + int priority; +}; + +struct nf_hook_entry { + struct nf_hook_entry __rcu *next; + struct nf_hook_ops ops; + const struct nf_hook_ops *orig_ops; +}; + static inline void nf_hook_state_init(struct nf_hook_state *p, - struct list_head *hook_list, + struct nf_hook_entry *hook_entry, unsigned int hook, int thresh, u_int8_t pf, struct net_device *indev, @@ -76,26 +98,11 @@ static inline void nf_hook_state_init(struct nf_hook_state *p, p->out = outdev; p->sk = sk; p->net = net; - p->hook_list = hook_list; + RCU_INIT_POINTER(p->hook_entries, hook_entry); p->okfn = okfn; } -typedef unsigned int nf_hookfn(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state); - -struct nf_hook_ops { - struct list_head list; - /* User fills in from here down. */ - nf_hookfn *hook; - struct net_device *dev; - void *priv; - u_int8_t pf; - unsigned int hooknum; - /* Hooks are ordered in ascending priority. */ - int priority; -}; struct nf_sockopt_ops { struct list_head list; @@ -161,7 +168,8 @@ static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook, int (*okfn)(struct net *, struct sock *, struct sk_buff *), int thresh) { - struct list_head *hook_list; + struct nf_hook_entry *hook_head; + int ret = 1; #ifdef HAVE_JUMP_LABEL if (__builtin_constant_p(pf) && @@ -170,22 +178,19 @@ static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook, return 1; #endif - hook_list = &net->nf.hooks[pf][hook]; - - if (!list_empty(hook_list)) { + rcu_read_lock(); + hook_head = rcu_dereference(net->nf.hooks[pf][hook]); + if (hook_head) { struct nf_hook_state state; - int ret; - /* We may already have this, but read-locks nest anyway */ - rcu_read_lock(); - nf_hook_state_init(&state, hook_list, hook, thresh, + nf_hook_state_init(&state, hook_head, hook, thresh, pf, indev, outdev, sk, net, okfn); ret = nf_hook_slow(skb, &state); - rcu_read_unlock(); - return ret; } - return 1; + rcu_read_unlock(); + + return ret; } static inline int nf_hook(u_int8_t pf, unsigned int hook, struct 
net *net, diff --git a/include/linux/netfilter_ingress.h b/include/linux/netfilter_ingress.h index 6965ba09eba7..33e37fb41d5d 100644 --- a/include/linux/netfilter_ingress.h +++ b/include/linux/netfilter_ingress.h @@ -11,23 +11,30 @@ static inline bool nf_hook_ingress_active(const struct sk_buff *skb) if (!static_key_false(&nf_hooks_needed[NFPROTO_NETDEV][NF_NETDEV_INGRESS])) return false; #endif - return !list_empty(&skb->dev->nf_hooks_ingress); + return rcu_access_pointer(skb->dev->nf_hooks_ingress); } /* caller must hold rcu_read_lock */ static inline int nf_hook_ingress(struct sk_buff *skb) { + struct nf_hook_entry *e = rcu_dereference(skb->dev->nf_hooks_ingress); struct nf_hook_state state; - nf_hook_state_init(&state, &skb->dev->nf_hooks_ingress, - NF_NETDEV_INGRESS, INT_MIN, NFPROTO_NETDEV, - skb->dev, NULL, NULL, dev_net(skb->dev), NULL); + /* Must recheck the ingress hook head, in the event it became NULL + * after the check in nf_hook_ingress_active evaluated to true. + */ + if (unlikely(!e)) + return 0; + + nf_hook_state_init(&state, e, NF_NETDEV_INGRESS, INT_MIN, + NFPROTO_NETDEV, skb->dev, NULL, NULL, + dev_net(skb->dev), NULL); return nf_hook_slow(skb, &state); } static inline void nf_hook_ingress_init(struct net_device *dev) { - INIT_LIST_HEAD(&dev->nf_hooks_ingress); + RCU_INIT_POINTER(dev->nf_hooks_ingress, NULL); } #else /* CONFIG_NETFILTER_INGRESS */ static inline int nf_hook_ingress_active(struct sk_buff *skb) -- cgit v1.2.3 From 484611357c19f9e19ef742ebef4505a07d243cc9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 28 Sep 2016 10:54:32 -0400 Subject: bpf: allow access into map value arrays Suppose you have a map array value that is something like this struct foo { unsigned iter; int array[SOME_CONSTANT]; }; You can easily insert this into an array, but you cannot modify the contents of foo->array[] after the fact. This is because we have no way to verify we won't go off the end of the array at verification time. 
This patch provides a start for this work. We accomplish this by keeping track of a minimum and maximum value a register could be while we're checking the code. Then at the time we try to do an access into a MAP_VALUE we verify that the maximum offset into that region is a valid access into that memory region. So in practice, code such as this unsigned index = 0; if (foo->iter >= SOME_CONSTANT) foo->iter = index; else index = foo->iter++; foo->array[index] = bar; would be allowed, as we can verify that index will always be between 0 and SOME_CONSTANT-1. If you wish to use signed values you'll have to have an extra check to make sure the index isn't less than 0, or do something like index %= SOME_CONSTANT. Signed-off-by: Josef Bacik Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 7 +++++++ include/linux/bpf_verifier.h | 12 ++++++++++++ 2 files changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5691fdc83819..c201017b5730 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -139,6 +139,13 @@ enum bpf_reg_type { */ PTR_TO_PACKET, PTR_TO_PACKET_END, /* skb->data + headlen */ + + /* PTR_TO_MAP_VALUE_ADJ is used for doing pointer math inside of a map + * elem value. We only allow this if we can statically verify that + * accesses from this register are going to fall within the size of the + * map element. + */ + PTR_TO_MAP_VALUE_ADJ, }; struct bpf_prog; diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c5cb661712c9..7035b997aaa5 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -10,8 +10,19 @@ #include /* for enum bpf_reg_type */ #include /* for MAX_BPF_STACK */ + /* Just some arbitrary values so we can safely do math without overflowing and + * are obviously wrong for any sort of memory access.
+ */ +#define BPF_REGISTER_MAX_RANGE (1024 * 1024 * 1024) +#define BPF_REGISTER_MIN_RANGE -(1024 * 1024 * 1024) + struct bpf_reg_state { enum bpf_reg_type type; + /* + * Used to determine if any memory access using this register will + * result in a bad access. + */ + u64 min_value, max_value; union { /* valid when type == CONST_IMM | PTR_TO_STACK | UNKNOWN_VALUE */ s64 imm; @@ -81,6 +92,7 @@ struct bpf_verifier_env { u32 id_gen; /* used to generate unique reg IDs */ bool allow_ptr_leaks; bool seen_direct_write; + bool varlen_map_value_access; struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ }; -- cgit v1.2.3 From bd11f0741fa5a2c296629898ad07759dd12b35bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= Date: Tue, 27 Sep 2016 23:57:58 -0700 Subject: ipv6 addrconf: implement RFC7559 router solicitation backoff MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This implements: https://tools.ietf.org/html/rfc7559 Backoff is performed according to RFC3315 section 14: https://tools.ietf.org/html/rfc3315#section-14 We allow setting /proc/sys/net/ipv6/conf/*/router_solicitations to a negative value meaning an unlimited number of retransmits, and we make this the new default (in line with the RFC). We also add a new setting: /proc/sys/net/ipv6/conf/*/router_solicitation_max_interval defaulting to 1 hour (per RFC recommendation). Signed-off-by: Maciej Żenczykowski Acked-by: Erik Kline Signed-off-by: David S.
Miller --- include/linux/ipv6.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index c6dbcd84a2c7..7e9a789be5e0 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -18,6 +18,7 @@ struct ipv6_devconf { __s32 dad_transmits; __s32 rtr_solicits; __s32 rtr_solicit_interval; + __s32 rtr_solicit_max_interval; __s32 rtr_solicit_delay; __s32 force_mld_version; __s32 mldv1_unsolicited_report_interval; -- cgit v1.2.3 From 0a7fb11c23c0fb8f5ad37f285f40348f1ab9ccbd Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Sat, 1 Oct 2016 21:59:55 +0300 Subject: qed: Add Light L2 support Other protocols beside the networking driver need the ability of passing some L2 traffic, usually [although not limited] for the purpose of some management traffic. Signed-off-by: Yuval Mintz Signed-off-by: Ram Amrani Signed-off-by: David S. Miller --- include/linux/qed/qed_if.h | 1 + include/linux/qed/qed_ll2_if.h | 139 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 140 insertions(+) create mode 100644 include/linux/qed/qed_ll2_if.h (limited to 'include/linux') diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index e4546abcea08..c2d74e8785cf 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -627,6 +627,7 @@ enum DP_MODULE { QED_MSG_SP = 0x100000, QED_MSG_STORAGE = 0x200000, QED_MSG_CXT = 0x800000, + QED_MSG_LL2 = 0x1000000, QED_MSG_ILT = 0x2000000, QED_MSG_ROCE = 0x4000000, QED_MSG_DEBUG = 0x8000000, diff --git a/include/linux/qed/qed_ll2_if.h b/include/linux/qed/qed_ll2_if.h new file mode 100644 index 000000000000..fd75c265dba3 --- /dev/null +++ b/include/linux/qed/qed_ll2_if.h @@ -0,0 +1,139 @@ +/* QLogic qed NIC Driver + * + * Copyright (c) 2015 QLogic Corporation + * + * This software is available under the terms of the GNU General Public License + * (GPL) Version 2, available from the file COPYING in the main directory of + * this source tree. 
+ */ + +#ifndef _QED_LL2_IF_H +#define _QED_LL2_IF_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct qed_ll2_stats { + u64 gsi_invalid_hdr; + u64 gsi_invalid_pkt_length; + u64 gsi_unsupported_pkt_typ; + u64 gsi_crcchksm_error; + + u64 packet_too_big_discard; + u64 no_buff_discard; + + u64 rcv_ucast_bytes; + u64 rcv_mcast_bytes; + u64 rcv_bcast_bytes; + u64 rcv_ucast_pkts; + u64 rcv_mcast_pkts; + u64 rcv_bcast_pkts; + + u64 sent_ucast_bytes; + u64 sent_mcast_bytes; + u64 sent_bcast_bytes; + u64 sent_ucast_pkts; + u64 sent_mcast_pkts; + u64 sent_bcast_pkts; +}; + +#define QED_LL2_UNUSED_HANDLE (0xff) + +struct qed_ll2_cb_ops { + int (*rx_cb)(void *, struct sk_buff *, u32, u32); + int (*tx_cb)(void *, struct sk_buff *, bool); +}; + +struct qed_ll2_params { + u16 mtu; + bool drop_ttl0_packets; + bool rx_vlan_stripping; + u8 tx_tc; + bool frags_mapped; + u8 ll2_mac_address[ETH_ALEN]; +}; + +struct qed_ll2_ops { +/** + * @brief start - initializes ll2 + * + * @param cdev + * @param params - protocol driver configuration for the ll2. + * + * @return 0 on success, otherwise error value. + */ + int (*start)(struct qed_dev *cdev, struct qed_ll2_params *params); + +/** + * @brief stop - stops the ll2 + * + * @param cdev + * + * @return 0 on success, otherwise error value. + */ + int (*stop)(struct qed_dev *cdev); + +/** + * @brief start_xmit - transmits an skb over the ll2 interface + * + * @param cdev + * @param skb + * + * @return 0 on success, otherwise error value. + */ + int (*start_xmit)(struct qed_dev *cdev, struct sk_buff *skb); + +/** + * @brief register_cb_ops - protocol driver register the callback for Rx/Tx + * packets. Should be called before `start'. + * + * @param cdev + * @param cookie - to be passed to the callback functions. + * @param ops - the callback functions to register for Rx / Tx. + * + * @return void - this callback has no return value.
+ */ + void (*register_cb_ops)(struct qed_dev *cdev, + const struct qed_ll2_cb_ops *ops, + void *cookie); + +/** + * @brief get LL2 related statistics + * + * @param cdev + * @param stats - pointer to struct that would be filled with stats + * + * @return 0 on success, error otherwise. + */ + int (*get_stats)(struct qed_dev *cdev, struct qed_ll2_stats *stats); +}; + +#ifdef CONFIG_QED_LL2 +int qed_ll2_alloc_if(struct qed_dev *); +void qed_ll2_dealloc_if(struct qed_dev *); +#else +static const struct qed_ll2_ops qed_ll2_ops_pass = { + .start = NULL, + .stop = NULL, + .start_xmit = NULL, + .register_cb_ops = NULL, + .get_stats = NULL, +}; + +static inline int qed_ll2_alloc_if(struct qed_dev *cdev) +{ + return 0; +} + +static inline void qed_ll2_dealloc_if(struct qed_dev *cdev) +{ +} +#endif +#endif -- cgit v1.2.3 From cee9fbd8e2e9e713cd8bf227c6492fd8854de74b Mon Sep 17 00:00:00 2001 From: Ram Amrani Date: Sat, 1 Oct 2016 21:59:56 +0300 Subject: qede: Add qedr framework Adds a skeletal implementation of the qede RoCE driver - The qedr has some dependencies on the state of the underlying base interface. This adds some logic required with mutual registrations and the ability to pass updates on 'interesting' events. Signed-off-by: Ram Amrani Signed-off-by: Yuval Mintz Signed-off-by: David S.
Miller --- include/linux/qed/qed_if.h | 3 +- include/linux/qed/qede_roce.h | 88 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+), 2 deletions(-) create mode 100644 include/linux/qed/qede_roce.h (limited to 'include/linux') diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index c2d74e8785cf..e313742b571d 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -260,11 +260,10 @@ struct qed_dev_info { /* MFW version */ u32 mfw_rev; - bool rdma_supported; - u32 flash_size; u8 mf_mode; bool tx_switching; + bool rdma_supported; }; enum qed_sb_type { diff --git a/include/linux/qed/qede_roce.h b/include/linux/qed/qede_roce.h new file mode 100644 index 000000000000..99fbe6d55acb --- /dev/null +++ b/include/linux/qed/qede_roce.h @@ -0,0 +1,88 @@ +/* QLogic qedr NIC Driver + * Copyright (c) 2015-2016 QLogic Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef QEDE_ROCE_H +#define QEDE_ROCE_H + +struct qedr_dev; +struct qed_dev; +struct qede_dev; + +enum qede_roce_event { + QEDE_UP, + QEDE_DOWN, + QEDE_CHANGE_ADDR, + QEDE_CLOSE +}; + +struct qede_roce_event_work { + struct list_head list; + struct work_struct work; + void *ptr; + enum qede_roce_event event; +}; + +struct qedr_driver { + unsigned char name[32]; + + struct qedr_dev* (*add)(struct qed_dev *, struct pci_dev *, + struct net_device *); + + void (*remove)(struct qedr_dev *); + void (*notify)(struct qedr_dev *, enum qede_roce_event); +}; + +/* APIs for RoCE driver to register callback handlers, + * which will be invoked when device is added, removed, ifup, ifdown + */ +int qede_roce_register_driver(struct qedr_driver *drv); +void qede_roce_unregister_driver(struct qedr_driver *drv); + +bool qede_roce_supported(struct qede_dev *dev); + +#if IS_ENABLED(CONFIG_INFINIBAND_QEDR) +int qede_roce_dev_add(struct qede_dev *dev); +void qede_roce_dev_event_open(struct qede_dev *dev); +void qede_roce_dev_event_close(struct qede_dev *dev); +void qede_roce_dev_remove(struct qede_dev *dev); +void qede_roce_event_changeaddr(struct qede_dev *qedr); +#else +static inline int qede_roce_dev_add(struct qede_dev *dev) +{ + return 0; +} + +static inline void qede_roce_dev_event_open(struct qede_dev *dev) {} +static inline void qede_roce_dev_event_close(struct qede_dev *dev) {} +static inline void qede_roce_dev_remove(struct qede_dev *dev) {} +static inline void qede_roce_event_changeaddr(struct qede_dev *qedr) {} +#endif +#endif -- cgit v1.2.3 From 51ff17251c9c2c2e71974149d22bc73ea09c27cc Mon Sep 17 00:00:00 2001 From: Ram Amrani Date: Sat, 1 Oct 2016 21:59:57 +0300 Subject: qed: Add support for RoCE 
hw init This adds the backbone required for the various HW initializations which are necessary for the qedr driver - FW notification, resource initializations, etc. Signed-off-by: Ram Amrani Signed-off-by: Yuval Mintz Signed-off-by: David S. Miller --- include/linux/qed/common_hsi.h | 1 + include/linux/qed/qed_if.h | 5 +- include/linux/qed/qed_roce_if.h | 345 ++++++++++++++++++++++++++++++++++++++++ include/linux/qed/rdma_common.h | 1 + 4 files changed, 351 insertions(+), 1 deletion(-) create mode 100644 include/linux/qed/qed_roce_if.h (limited to 'include/linux') diff --git a/include/linux/qed/common_hsi.h b/include/linux/qed/common_hsi.h index 19027635df0d..734deb094618 100644 --- a/include/linux/qed/common_hsi.h +++ b/include/linux/qed/common_hsi.h @@ -674,6 +674,7 @@ union event_ring_data { struct iscsi_eqe_data iscsi_info; struct malicious_vf_eqe_data malicious_vf; struct initial_cleanup_eqe_data vf_init_cleanup; + struct regpair roce_handle; }; /* Event Ring Entry */ diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index e313742b571d..f9ae903bbb84 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -34,6 +34,8 @@ enum dcbx_protocol_type { DCBX_MAX_PROTOCOL_TYPE }; +#define QED_ROCE_PROTOCOL_INDEX (3) + #ifdef CONFIG_DCB #define QED_LLDP_CHASSIS_ID_STAT_LEN 4 #define QED_LLDP_PORT_ID_STAT_LEN 4 @@ -268,6 +270,7 @@ struct qed_dev_info { enum qed_sb_type { QED_SB_TYPE_L2_QUEUE, + QED_SB_TYPE_CNQ, }; enum qed_protocol { @@ -628,7 +631,7 @@ enum DP_MODULE { QED_MSG_CXT = 0x800000, QED_MSG_LL2 = 0x1000000, QED_MSG_ILT = 0x2000000, - QED_MSG_ROCE = 0x4000000, + QED_MSG_RDMA = 0x4000000, QED_MSG_DEBUG = 0x8000000, /* to be added...up to 0x8000000 */ }; diff --git a/include/linux/qed/qed_roce_if.h b/include/linux/qed/qed_roce_if.h new file mode 100644 index 000000000000..0f7d5275e515 --- /dev/null +++ b/include/linux/qed/qed_roce_if.h @@ -0,0 +1,345 @@ +/* QLogic qed NIC Driver + * Copyright (c) 2015-2016 QLogic Corporation +
* + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and /or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef _QED_ROCE_IF_H +#define _QED_ROCE_IF_H +#include +#include +#include +#include +#include +#include +#include +#include + +#define QED_RDMA_MAX_CNQ_SIZE (0xFFFF) + +/* rdma interface */ +enum qed_rdma_tid_type { + QED_RDMA_TID_REGISTERED_MR, + QED_RDMA_TID_FMR, + QED_RDMA_TID_MW_TYPE1, + QED_RDMA_TID_MW_TYPE2A +}; + +struct qed_rdma_events { + void *context; + void (*affiliated_event)(void *context, u8 fw_event_code, + void *fw_handle); + void (*unaffiliated_event)(void *context, u8 event_code); +}; + +struct qed_rdma_device { + u32 vendor_id; + u32 vendor_part_id; + u32 hw_ver; + u64 fw_ver; + + u64 node_guid; + u64 sys_image_guid; + + u8 max_cnq; + u8 max_sge; + u8 max_srq_sge; + u16 max_inline; + u32 max_wqe; + u32 max_srq_wqe; + u8 max_qp_resp_rd_atomic_resc; + u8 max_qp_req_rd_atomic_resc; + u64 max_dev_resp_rd_atomic_resc; + u32 max_cq; + u32 max_qp; + u32 max_srq; + u32 max_mr; + u64 max_mr_size; + u32 max_cqe; + u32 max_mw; + u32 max_fmr; + u32 max_mr_mw_fmr_pbl; + u64 max_mr_mw_fmr_size; + u32 max_pd; + u32 max_ah; + u8 max_pkey; + u16 max_srq_wr; + u8 max_stats_queues; + u32 dev_caps; + + /* Ability to support RNR-NAK generation */ + +#define QED_RDMA_DEV_CAP_RNR_NAK_MASK 0x1 +#define QED_RDMA_DEV_CAP_RNR_NAK_SHIFT 0 + /* Ability to support shutdown port */ +#define QED_RDMA_DEV_CAP_SHUTDOWN_PORT_MASK 0x1 +#define QED_RDMA_DEV_CAP_SHUTDOWN_PORT_SHIFT 1 + /* Ability to support port active event */ +#define QED_RDMA_DEV_CAP_PORT_ACTIVE_EVENT_MASK 0x1 +#define QED_RDMA_DEV_CAP_PORT_ACTIVE_EVENT_SHIFT 2 + /* Ability to support port change event */ +#define QED_RDMA_DEV_CAP_PORT_CHANGE_EVENT_MASK 0x1 +#define QED_RDMA_DEV_CAP_PORT_CHANGE_EVENT_SHIFT 3 + /* Ability to support system image GUID */ +#define QED_RDMA_DEV_CAP_SYS_IMAGE_MASK 0x1 +#define QED_RDMA_DEV_CAP_SYS_IMAGE_SHIFT 4 + /* Ability to support bad P_Key counter */ +#define QED_RDMA_DEV_CAP_BAD_PKEY_CNT_MASK 0x1 +#define QED_RDMA_DEV_CAP_BAD_PKEY_CNT_SHIFT 5 + /* Ability to support
atomic operations */ +#define QED_RDMA_DEV_CAP_ATOMIC_OP_MASK 0x1 +#define QED_RDMA_DEV_CAP_ATOMIC_OP_SHIFT 6 +#define QED_RDMA_DEV_CAP_RESIZE_CQ_MASK 0x1 +#define QED_RDMA_DEV_CAP_RESIZE_CQ_SHIFT 7 + /* Ability to support modifying the maximum number of + * outstanding work requests per QP + */ +#define QED_RDMA_DEV_CAP_RESIZE_MAX_WR_MASK 0x1 +#define QED_RDMA_DEV_CAP_RESIZE_MAX_WR_SHIFT 8 + /* Ability to support automatic path migration */ +#define QED_RDMA_DEV_CAP_AUTO_PATH_MIG_MASK 0x1 +#define QED_RDMA_DEV_CAP_AUTO_PATH_MIG_SHIFT 9 + /* Ability to support the base memory management extensions */ +#define QED_RDMA_DEV_CAP_BASE_MEMORY_EXT_MASK 0x1 +#define QED_RDMA_DEV_CAP_BASE_MEMORY_EXT_SHIFT 10 +#define QED_RDMA_DEV_CAP_BASE_QUEUE_EXT_MASK 0x1 +#define QED_RDMA_DEV_CAP_BASE_QUEUE_EXT_SHIFT 11 + /* Ability to support multiple page sizes per memory region */ +#define QED_RDMA_DEV_CAP_MULTI_PAGE_PER_MR_EXT_MASK 0x1 +#define QED_RDMA_DEV_CAP_MULTI_PAGE_PER_MR_EXT_SHIFT 12 + /* Ability to support block list physical buffer list */ +#define QED_RDMA_DEV_CAP_BLOCK_MODE_MASK 0x1 +#define QED_RDMA_DEV_CAP_BLOCK_MODE_SHIFT 13 + /* Ability to support zero based virtual addresses */ +#define QED_RDMA_DEV_CAP_ZBVA_MASK 0x1 +#define QED_RDMA_DEV_CAP_ZBVA_SHIFT 14 + /* Ability to support local invalidate fencing */ +#define QED_RDMA_DEV_CAP_LOCAL_INV_FENCE_MASK 0x1 +#define QED_RDMA_DEV_CAP_LOCAL_INV_FENCE_SHIFT 15 + /* Ability to support Loopback on QP */ +#define QED_RDMA_DEV_CAP_LB_INDICATOR_MASK 0x1 +#define QED_RDMA_DEV_CAP_LB_INDICATOR_SHIFT 16 + u64 page_size_caps; + u8 dev_ack_delay; + u32 reserved_lkey; + u32 bad_pkey_counter; + struct qed_rdma_events events; +}; + +enum qed_port_state { + QED_RDMA_PORT_UP, + QED_RDMA_PORT_DOWN, +}; + +enum qed_roce_capability { + QED_ROCE_V1 = 1 << 0, + QED_ROCE_V2 = 1 << 1, +}; + +struct qed_rdma_port { + enum qed_port_state port_state; + int link_speed; + u64 max_msg_size; + u8 source_gid_table_len; + void *source_gid_table_ptr; + u8
pkey_table_len; + void *pkey_table_ptr; + u32 pkey_bad_counter; + enum qed_roce_capability capability; +}; + +struct qed_rdma_cnq_params { + u8 num_pbl_pages; + u64 pbl_ptr; +}; + +/* The CQ Mode affects the CQ doorbell transaction size. + * 64/32 bit machines should configure to 32/16 bits respectively. + */ +enum qed_rdma_cq_mode { + QED_RDMA_CQ_MODE_16_BITS, + QED_RDMA_CQ_MODE_32_BITS, +}; + +struct qed_roce_dcqcn_params { + u8 notification_point; + u8 reaction_point; + + /* fields for notification point */ + u32 cnp_send_timeout; + + /* fields for reaction point */ + u32 rl_bc_rate; + u16 rl_max_rate; + u16 rl_r_ai; + u16 rl_r_hai; + u16 dcqcn_g; + u32 dcqcn_k_us; + u32 dcqcn_timeout_us; +}; + +struct qed_rdma_start_in_params { + struct qed_rdma_events *events; + struct qed_rdma_cnq_params cnq_pbl_list[128]; + u8 desired_cnq; + enum qed_rdma_cq_mode cq_mode; + struct qed_roce_dcqcn_params dcqcn_params; + u16 max_mtu; + u8 mac_addr[ETH_ALEN]; + u8 iwarp_flags; +}; + +struct qed_rdma_add_user_out_params { + u16 dpi; + u64 dpi_addr; + u64 dpi_phys_addr; + u32 dpi_size; +}; + +enum roce_mode { + ROCE_V1, + ROCE_V2_IPV4, + ROCE_V2_IPV6, + MAX_ROCE_MODE +}; + +union qed_gid { + u8 bytes[16]; + u16 words[8]; + u32 dwords[4]; + u64 qwords[2]; + u32 ipv4_addr; +}; + +struct qed_rdma_register_tid_in_params { + u32 itid; + enum qed_rdma_tid_type tid_type; + u8 key; + u16 pd; + bool local_read; + bool local_write; + bool remote_read; + bool remote_write; + bool remote_atomic; + bool mw_bind; + u64 pbl_ptr; + bool pbl_two_level; + u8 pbl_page_size_log; + u8 page_size_log; + u32 fbo; + u64 length; + u64 vaddr; + bool zbva; + bool phy_mr; + bool dma_mr; + + bool dif_enabled; + u64 dif_error_addr; + u64 dif_runt_addr; +}; + +struct qed_rdma_create_srq_in_params { + u64 pbl_base_addr; + u64 prod_pair_addr; + u16 num_pages; + u16 pd_id; + u16 page_size; +}; + +struct qed_rdma_create_srq_out_params { + u16 srq_id; +}; + +struct qed_rdma_destroy_srq_in_params { + u16 srq_id; +}; + 
+struct qed_rdma_modify_srq_in_params { + u32 wqe_limit; + u16 srq_id; +}; + +struct qed_rdma_stats_out_params { + u64 sent_bytes; + u64 sent_pkts; + u64 rcv_bytes; + u64 rcv_pkts; +}; + +struct qed_rdma_counters_out_params { + u64 pd_count; + u64 max_pd; + u64 dpi_count; + u64 max_dpi; + u64 cq_count; + u64 max_cq; + u64 qp_count; + u64 max_qp; + u64 tid_count; + u64 max_tid; +}; + +#define QED_ROCE_TX_HEAD_FAILURE (1) +#define QED_ROCE_TX_FRAG_FAILURE (2) + +enum qed_rdma_type { + QED_RDMA_TYPE_ROCE, +}; + +struct qed_dev_rdma_info { + struct qed_dev_info common; + enum qed_rdma_type rdma_type; +}; + +struct qed_rdma_ops { + const struct qed_common_ops *common; + + int (*fill_dev_info)(struct qed_dev *cdev, + struct qed_dev_rdma_info *info); + void *(*rdma_get_rdma_ctx)(struct qed_dev *cdev); + + int (*rdma_init)(struct qed_dev *dev, + struct qed_rdma_start_in_params *iparams); + + int (*rdma_add_user)(void *rdma_cxt, + struct qed_rdma_add_user_out_params *oparams); + + void (*rdma_remove_user)(void *rdma_cxt, u16 dpi); + int (*rdma_stop)(void *rdma_cxt); + struct qed_rdma_device* (*rdma_query_device)(void *rdma_cxt); + int (*rdma_get_start_sb)(struct qed_dev *cdev); + int (*rdma_get_min_cnq_msix)(struct qed_dev *cdev); + void (*rdma_cnq_prod_update)(void *rdma_cxt, u8 cnq_index, u16 prod); + int (*rdma_get_rdma_int)(struct qed_dev *cdev, + struct qed_int_info *info); + int (*rdma_set_rdma_int)(struct qed_dev *cdev, u16 cnt); +}; + +const struct qed_rdma_ops *qed_get_rdma_ops(void); + +#endif diff --git a/include/linux/qed/rdma_common.h b/include/linux/qed/rdma_common.h index 187991c1f439..7663725faa94 100644 --- a/include/linux/qed/rdma_common.h +++ b/include/linux/qed/rdma_common.h @@ -28,6 +28,7 @@ #define RDMA_MAX_PDS (64 * 1024) #define RDMA_NUM_STATISTIC_COUNTERS MAX_NUM_VPORTS +#define RDMA_NUM_STATISTIC_COUNTERS_BB MAX_NUM_VPORTS_BB #define RDMA_TASK_TYPE (PROTOCOLID_ROCE) -- cgit v1.2.3 From c295f86e60f5ba67f0f4bba2bb2c22b3cbf01ec1 Mon Sep 17 00:00:00 
2001 From: Ram Amrani Date: Sat, 1 Oct 2016 21:59:58 +0300 Subject: qed: PD,PKEY and CQ verb support Add support for the configurations of the protection domain and completion queues. Signed-off-by: Ram Amrani Signed-off-by: Yuval Mintz Signed-off-by: David S. Miller --- include/linux/qed/qed_roce_if.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'include/linux') diff --git a/include/linux/qed/qed_roce_if.h b/include/linux/qed/qed_roce_if.h index 0f7d5275e515..b559b1c9e76d 100644 --- a/include/linux/qed/qed_roce_if.h +++ b/include/linux/qed/qed_roce_if.h @@ -263,6 +263,19 @@ struct qed_rdma_register_tid_in_params { u64 dif_runt_addr; }; +struct qed_rdma_create_cq_in_params { + u32 cq_handle_lo; + u32 cq_handle_hi; + u32 cq_size; + u16 dpi; + bool pbl_two_level; + u64 pbl_ptr; + u16 pbl_num_pages; + u8 pbl_page_size_log; + u8 cnq_id; + u16 int_timeout; +}; + struct qed_rdma_create_srq_in_params { u64 pbl_base_addr; u64 prod_pair_addr; @@ -271,6 +284,14 @@ struct qed_rdma_create_srq_in_params { u16 page_size; }; +struct qed_rdma_destroy_cq_in_params { + u16 icid; +}; + +struct qed_rdma_destroy_cq_out_params { + u16 num_cq_notif; +}; + struct qed_rdma_create_srq_out_params { u16 srq_id; }; @@ -332,12 +353,21 @@ struct qed_rdma_ops { void (*rdma_remove_user)(void *rdma_cxt, u16 dpi); int (*rdma_stop)(void *rdma_cxt); struct qed_rdma_device* (*rdma_query_device)(void *rdma_cxt); + struct qed_rdma_port* (*rdma_query_port)(void *rdma_cxt); int (*rdma_get_start_sb)(struct qed_dev *cdev); int (*rdma_get_min_cnq_msix)(struct qed_dev *cdev); void (*rdma_cnq_prod_update)(void *rdma_cxt, u8 cnq_index, u16 prod); int (*rdma_get_rdma_int)(struct qed_dev *cdev, struct qed_int_info *info); int (*rdma_set_rdma_int)(struct qed_dev *cdev, u16 cnt); + int (*rdma_alloc_pd)(void *rdma_cxt, u16 *pd); + void (*rdma_dealloc_pd)(void *rdma_cxt, u16 pd); + int (*rdma_create_cq)(void *rdma_cxt, + struct qed_rdma_create_cq_in_params *params, + u16 *icid); + 
int (*rdma_destroy_cq)(void *rdma_cxt, + struct qed_rdma_destroy_cq_in_params *iparams, + struct qed_rdma_destroy_cq_out_params *oparams); }; const struct qed_rdma_ops *qed_get_rdma_ops(void); -- cgit v1.2.3 From f109394033521862f2558df93d9afc4dfa829c6a Mon Sep 17 00:00:00 2001 From: Ram Amrani Date: Sat, 1 Oct 2016 21:59:59 +0300 Subject: qed: Add support for QP verbs Add support for the slowpath configurations of Queue Pair verbs which adds, deletes, modifies and queries Queue Pairs. Signed-off-by: Ram Amrani Signed-off-by: Yuval Mintz Signed-off-by: David S. Miller --- include/linux/qed/qed_roce_if.h | 144 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 144 insertions(+) (limited to 'include/linux') diff --git a/include/linux/qed/qed_roce_if.h b/include/linux/qed/qed_roce_if.h index b559b1c9e76d..02321e3b1716 100644 --- a/include/linux/qed/qed_roce_if.h +++ b/include/linux/qed/qed_roce_if.h @@ -43,6 +43,17 @@ #define QED_RDMA_MAX_CNQ_SIZE (0xFFFF) /* rdma interface */ + +enum qed_roce_qp_state { + QED_ROCE_QP_STATE_RESET, + QED_ROCE_QP_STATE_INIT, + QED_ROCE_QP_STATE_RTR, + QED_ROCE_QP_STATE_RTS, + QED_ROCE_QP_STATE_SQD, + QED_ROCE_QP_STATE_ERR, + QED_ROCE_QP_STATE_SQE +}; + enum qed_rdma_tid_type { QED_RDMA_TID_REGISTERED_MR, QED_RDMA_TID_FMR, @@ -292,6 +303,128 @@ struct qed_rdma_destroy_cq_out_params { u16 num_cq_notif; }; +struct qed_rdma_create_qp_in_params { + u32 qp_handle_lo; + u32 qp_handle_hi; + u32 qp_handle_async_lo; + u32 qp_handle_async_hi; + bool use_srq; + bool signal_all; + bool fmr_and_reserved_lkey; + u16 pd; + u16 dpi; + u16 sq_cq_id; + u16 sq_num_pages; + u64 sq_pbl_ptr; + u8 max_sq_sges; + u16 rq_cq_id; + u16 rq_num_pages; + u64 rq_pbl_ptr; + u16 srq_id; + u8 stats_queue; +}; + +struct qed_rdma_create_qp_out_params { + u32 qp_id; + u16 icid; + void *rq_pbl_virt; + dma_addr_t rq_pbl_phys; + void *sq_pbl_virt; + dma_addr_t sq_pbl_phys; +}; + +struct qed_rdma_modify_qp_in_params { + u32 modify_flags; +#define 
QED_RDMA_MODIFY_QP_VALID_NEW_STATE_MASK 0x1 +#define QED_RDMA_MODIFY_QP_VALID_NEW_STATE_SHIFT 0 +#define QED_ROCE_MODIFY_QP_VALID_PKEY_MASK 0x1 +#define QED_ROCE_MODIFY_QP_VALID_PKEY_SHIFT 1 +#define QED_RDMA_MODIFY_QP_VALID_RDMA_OPS_EN_MASK 0x1 +#define QED_RDMA_MODIFY_QP_VALID_RDMA_OPS_EN_SHIFT 2 +#define QED_ROCE_MODIFY_QP_VALID_DEST_QP_MASK 0x1 +#define QED_ROCE_MODIFY_QP_VALID_DEST_QP_SHIFT 3 +#define QED_ROCE_MODIFY_QP_VALID_ADDRESS_VECTOR_MASK 0x1 +#define QED_ROCE_MODIFY_QP_VALID_ADDRESS_VECTOR_SHIFT 4 +#define QED_ROCE_MODIFY_QP_VALID_RQ_PSN_MASK 0x1 +#define QED_ROCE_MODIFY_QP_VALID_RQ_PSN_SHIFT 5 +#define QED_ROCE_MODIFY_QP_VALID_SQ_PSN_MASK 0x1 +#define QED_ROCE_MODIFY_QP_VALID_SQ_PSN_SHIFT 6 +#define QED_RDMA_MODIFY_QP_VALID_MAX_RD_ATOMIC_REQ_MASK 0x1 +#define QED_RDMA_MODIFY_QP_VALID_MAX_RD_ATOMIC_REQ_SHIFT 7 +#define QED_RDMA_MODIFY_QP_VALID_MAX_RD_ATOMIC_RESP_MASK 0x1 +#define QED_RDMA_MODIFY_QP_VALID_MAX_RD_ATOMIC_RESP_SHIFT 8 +#define QED_ROCE_MODIFY_QP_VALID_ACK_TIMEOUT_MASK 0x1 +#define QED_ROCE_MODIFY_QP_VALID_ACK_TIMEOUT_SHIFT 9 +#define QED_ROCE_MODIFY_QP_VALID_RETRY_CNT_MASK 0x1 +#define QED_ROCE_MODIFY_QP_VALID_RETRY_CNT_SHIFT 10 +#define QED_ROCE_MODIFY_QP_VALID_RNR_RETRY_CNT_MASK 0x1 +#define QED_ROCE_MODIFY_QP_VALID_RNR_RETRY_CNT_SHIFT 11 +#define QED_ROCE_MODIFY_QP_VALID_MIN_RNR_NAK_TIMER_MASK 0x1 +#define QED_ROCE_MODIFY_QP_VALID_MIN_RNR_NAK_TIMER_SHIFT 12 +#define QED_ROCE_MODIFY_QP_VALID_E2E_FLOW_CONTROL_EN_MASK 0x1 +#define QED_ROCE_MODIFY_QP_VALID_E2E_FLOW_CONTROL_EN_SHIFT 13 +#define QED_ROCE_MODIFY_QP_VALID_ROCE_MODE_MASK 0x1 +#define QED_ROCE_MODIFY_QP_VALID_ROCE_MODE_SHIFT 14 + + enum qed_roce_qp_state new_state; + u16 pkey; + bool incoming_rdma_read_en; + bool incoming_rdma_write_en; + bool incoming_atomic_en; + bool e2e_flow_control_en; + u32 dest_qp; + bool lb_indication; + u16 mtu; + u8 traffic_class_tos; + u8 hop_limit_ttl; + u32 flow_label; + union qed_gid sgid; + union qed_gid dgid; + u16 udp_src_port; + + u16 vlan_id; + 
+ u32 rq_psn; + u32 sq_psn; + u8 max_rd_atomic_resp; + u8 max_rd_atomic_req; + u32 ack_timeout; + u8 retry_cnt; + u8 rnr_retry_cnt; + u8 min_rnr_nak_timer; + bool sqd_async; + u8 remote_mac_addr[6]; + u8 local_mac_addr[6]; + bool use_local_mac; + enum roce_mode roce_mode; +}; + +struct qed_rdma_query_qp_out_params { + enum qed_roce_qp_state state; + u32 rq_psn; + u32 sq_psn; + bool draining; + u16 mtu; + u32 dest_qp; + bool incoming_rdma_read_en; + bool incoming_rdma_write_en; + bool incoming_atomic_en; + bool e2e_flow_control_en; + union qed_gid sgid; + union qed_gid dgid; + u32 flow_label; + u8 hop_limit_ttl; + u8 traffic_class_tos; + u32 timeout; + u8 rnr_retry; + u8 retry_cnt; + u8 min_rnr_nak_timer; + u16 pkey_index; + u8 max_rd_atomic; + u8 max_dest_rd_atomic; + bool sqd_async; +}; + struct qed_rdma_create_srq_out_params { u16 srq_id; }; @@ -368,6 +501,17 @@ struct qed_rdma_ops { int (*rdma_destroy_cq)(void *rdma_cxt, struct qed_rdma_destroy_cq_in_params *iparams, struct qed_rdma_destroy_cq_out_params *oparams); + struct qed_rdma_qp * + (*rdma_create_qp)(void *rdma_cxt, + struct qed_rdma_create_qp_in_params *iparams, + struct qed_rdma_create_qp_out_params *oparams); + + int (*rdma_modify_qp)(void *roce_cxt, struct qed_rdma_qp *qp, + struct qed_rdma_modify_qp_in_params *iparams); + + int (*rdma_query_qp)(void *rdma_cxt, struct qed_rdma_qp *qp, + struct qed_rdma_query_qp_out_params *oparams); + int (*rdma_destroy_qp)(void *rdma_cxt, struct qed_rdma_qp *qp); }; const struct qed_rdma_ops *qed_get_rdma_ops(void); -- cgit v1.2.3 From ee8eaea30b1368680f4d2f873bc14e1d7b57d021 Mon Sep 17 00:00:00 2001 From: Ram Amrani Date: Sat, 1 Oct 2016 22:00:00 +0300 Subject: qed: Add support for memory registration verbs Add slowpath configuration support for user, dma and memory regions registration. Signed-off-by: Ram Amrani Signed-off-by: Yuval Mintz Signed-off-by: David S. 
Miller --- include/linux/qed/qed_roce_if.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/qed/qed_roce_if.h b/include/linux/qed/qed_roce_if.h index 02321e3b1716..0b6df6eedcf1 100644 --- a/include/linux/qed/qed_roce_if.h +++ b/include/linux/qed/qed_roce_if.h @@ -512,6 +512,12 @@ struct qed_rdma_ops { int (*rdma_query_qp)(void *rdma_cxt, struct qed_rdma_qp *qp, struct qed_rdma_query_qp_out_params *oparams); int (*rdma_destroy_qp)(void *rdma_cxt, struct qed_rdma_qp *qp); + int + (*rdma_register_tid)(void *rdma_cxt, + struct qed_rdma_register_tid_in_params *iparams); + int (*rdma_deregister_tid)(void *rdma_cxt, u32 itid); + int (*rdma_alloc_tid)(void *rdma_cxt, u32 *itid); + void (*rdma_free_tid)(void *rdma_cxt, u32 itid); }; const struct qed_rdma_ops *qed_get_rdma_ops(void); -- cgit v1.2.3 From abd49676c70793ee0a251bc3d8fe1604f9303210 Mon Sep 17 00:00:00 2001 From: Ram Amrani Date: Sat, 1 Oct 2016 22:00:01 +0300 Subject: qed: Add RoCE ll2 & GSI support Add the RoCE-specific LL2 logic [as well as GSI support] over the 'generic' LL2 interface. Signed-off-by: Ram Amrani Signed-off-by: Yuval Mintz Signed-off-by: David S. 
Miller --- include/linux/qed/qed_roce_if.h | 79 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) (limited to 'include/linux') diff --git a/include/linux/qed/qed_roce_if.h b/include/linux/qed/qed_roce_if.h index 0b6df6eedcf1..53047d3fa678 100644 --- a/include/linux/qed/qed_roce_if.h +++ b/include/linux/qed/qed_roce_if.h @@ -39,6 +39,16 @@ #include #include #include +#include + +enum qed_roce_ll2_tx_dest { + /* Light L2 TX Destination to the Network */ + QED_ROCE_LL2_TX_DEST_NW, + + /* Light L2 TX Destination to the Loopback */ + QED_ROCE_LL2_TX_DEST_LB, + QED_ROCE_LL2_TX_DEST_MAX +}; #define QED_RDMA_MAX_CNQ_SIZE (0xFFFF) @@ -461,6 +471,61 @@ struct qed_rdma_counters_out_params { #define QED_ROCE_TX_HEAD_FAILURE (1) #define QED_ROCE_TX_FRAG_FAILURE (2) +struct qed_roce_ll2_header { + void *vaddr; + dma_addr_t baddr; + size_t len; +}; + +struct qed_roce_ll2_buffer { + dma_addr_t baddr; + size_t len; +}; + +struct qed_roce_ll2_packet { + struct qed_roce_ll2_header header; + int n_seg; + struct qed_roce_ll2_buffer payload[RDMA_MAX_SGE_PER_SQ_WQE]; + int roce_mode; + enum qed_roce_ll2_tx_dest tx_dest; +}; + +struct qed_roce_ll2_tx_params { + int reserved; +}; + +struct qed_roce_ll2_rx_params { + u16 vlan_id; + u8 smac[ETH_ALEN]; + int rc; +}; + +struct qed_roce_ll2_cbs { + void (*tx_cb)(void *pdev, struct qed_roce_ll2_packet *pkt); + + void (*rx_cb)(void *pdev, struct qed_roce_ll2_packet *pkt, + struct qed_roce_ll2_rx_params *params); +}; + +struct qed_roce_ll2_params { + u16 max_rx_buffers; + u16 max_tx_buffers; + u16 mtu; + u8 mac_address[ETH_ALEN]; + struct qed_roce_ll2_cbs cbs; + void *cb_cookie; +}; + +struct qed_roce_ll2_info { + u8 handle; + struct qed_roce_ll2_cbs cbs; + u8 mac_address[ETH_ALEN]; + void *cb_cookie; + + /* Lock to protect ll2 */ + struct mutex lock; +}; + enum qed_rdma_type { QED_RDMA_TYPE_ROCE, }; @@ -518,6 +583,20 @@ struct qed_rdma_ops { int (*rdma_deregister_tid)(void *rdma_cxt, u32 itid); int (*rdma_alloc_tid)(void 
*rdma_cxt, u32 *itid); void (*rdma_free_tid)(void *rdma_cxt, u32 itid); + int (*roce_ll2_start)(struct qed_dev *cdev, + struct qed_roce_ll2_params *params); + int (*roce_ll2_stop)(struct qed_dev *cdev); + int (*roce_ll2_tx)(struct qed_dev *cdev, + struct qed_roce_ll2_packet *packet, + struct qed_roce_ll2_tx_params *params); + int (*roce_ll2_post_rx_buffer)(struct qed_dev *cdev, + struct qed_roce_ll2_buffer *buf, + u64 cookie, u8 notify_fw); + int (*roce_ll2_set_mac_filter)(struct qed_dev *cdev, + u8 *old_mac_address, + u8 *new_mac_address); + int (*roce_ll2_stats)(struct qed_dev *cdev, + struct qed_ll2_stats *stats); }; const struct qed_rdma_ops *qed_get_rdma_ops(void); -- cgit v1.2.3