From 96f87ee1811306d0c8cf94b8c37b0e4f725b01d1 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 8 Jan 2019 16:07:23 +0200 Subject: RDMA: Clean structures from CONFIG_INFINIBAND_ON_DEMAND_PAGING CONFIG_INFINIBAND_ON_DEMAND_PAGING is used in general structures to micro-optimize the memory footprint. Remove it, so it will allow us to simplify various ODP device flows. Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index a3ceed3a040a..3ddd199ba602 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1504,12 +1504,10 @@ struct ib_ucontext { bool cleanup_retryable; -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING void (*invalidate_range)(struct ib_umem_odp *umem_odp, unsigned long start, unsigned long end); struct mutex per_mm_list_lock; struct list_head per_mm_list; -#endif struct ib_rdmacg_object cg_obj; /* -- cgit v1.2.3 From 13859d5df418ea535926e2b57c29d5161c522b9d Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 8 Jan 2019 16:07:26 +0200 Subject: RDMA/mlx5: Embed into the code flow the ODP config option Convert various places to more readable code, which embeds CONFIG_INFINIBAND_ON_DEMAND_PAGING into the code flow. Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/rdma/ib_umem_odp.h | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index 0b1446fe2fab..d3725cf13ecd 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -83,6 +83,19 @@ static inline struct ib_umem_odp *to_ib_umem_odp(struct ib_umem *umem) return container_of(umem, struct ib_umem_odp, umem); } +/* + * The lower 2 bits of the DMA address signal the R/W permissions for + * the entry. 
To upgrade the permissions, provide the appropriate + * bitmask to the map_dma_pages function. + * + * Be aware that upgrading a mapped address might result in change of + * the DMA address for the page. + */ +#define ODP_READ_ALLOWED_BIT (1<<0ULL) +#define ODP_WRITE_ALLOWED_BIT (1<<1ULL) + +#define ODP_DMA_ADDR_MASK (~(ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) + #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING struct ib_ucontext_per_mm { @@ -107,19 +120,6 @@ struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext_per_mm *per_mm, unsigned long addr, size_t size); void ib_umem_odp_release(struct ib_umem_odp *umem_odp); -/* - * The lower 2 bits of the DMA address signal the R/W permissions for - * the entry. To upgrade the permissions, provide the appropriate - * bitmask to the map_dma_pages function. - * - * Be aware that upgrading a mapped address might result in change of - * the DMA address for the page. - */ -#define ODP_READ_ALLOWED_BIT (1<<0ULL) -#define ODP_WRITE_ALLOWED_BIT (1<<1ULL) - -#define ODP_DMA_ADDR_MASK (~(ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) - int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 start_offset, u64 bcnt, u64 access_mask, unsigned long current_seq); -- cgit v1.2.3 From 0ada768517dafa1504ef5986ba04f118b7436960 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 8 Jan 2019 16:07:27 +0200 Subject: RDMA/mlx5: Delete declaration of already removed function The implementation of mlx5_core_page_fault_resume() was removed in commit d5d284b829a6 ("{net,IB}/mlx5: Move Page fault EQ and ODP logic to RDMA"). This patch removes declaration too. 
Fixes: d5d284b829a6 ("{net,IB}/mlx5: Move Page fault EQ and ODP logic to RDMA") Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/linux/mlx5/driver.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 54299251d40d..b6f5839f129a 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -939,10 +939,6 @@ int mlx5_query_odp_caps(struct mlx5_core_dev *dev, struct mlx5_odp_caps *odp_caps); int mlx5_core_query_ib_ppcnt(struct mlx5_core_dev *dev, u8 port_num, void *out, size_t sz); -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING -int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 token, - u32 wq_num, u8 type, int error); -#endif int mlx5_init_rl_table(struct mlx5_core_dev *dev); void mlx5_cleanup_rl_table(struct mlx5_core_dev *dev); -- cgit v1.2.3 From b0ea0fa5435f9df7213a9af098558f7dd584d8e8 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 9 Jan 2019 11:15:16 +0200 Subject: IB/{core,hw}: Have ib_umem_get extract the ib_ucontext from ib_udata ib_umem_get() can only be called in a method callback, which always has a udata parameter. This allows ib_umem_get() to derive the ucontext pointer directly from the udata without requiring the drivers to find it in some way or another. 
Signed-off-by: Jason Gunthorpe Signed-off-by: Shamir Rabinovitch --- include/rdma/ib_umem.h | 8 +++++--- include/rdma/ib_verbs.h | 1 + 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index 5d3755ec5afa..73af05db04c7 100644 --- a/include/rdma/ib_umem.h +++ b/include/rdma/ib_umem.h @@ -36,6 +36,7 @@ #include #include #include +#include struct ib_ucontext; struct ib_umem_odp; @@ -80,7 +81,7 @@ static inline size_t ib_umem_num_pages(struct ib_umem *umem) #ifdef CONFIG_INFINIBAND_USER_MEM -struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, +struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr, size_t size, int access, int dmasync); void ib_umem_release(struct ib_umem *umem); int ib_umem_page_count(struct ib_umem *umem); @@ -91,9 +92,10 @@ int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset, #include -static inline struct ib_umem *ib_umem_get(struct ib_ucontext *context, +static inline struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr, size_t size, - int access, int dmasync) { + int access, int dmasync) +{ return ERR_PTR(-EINVAL); } static inline void ib_umem_release(struct ib_umem *umem) { } diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 3ddd199ba602..aa1f126d3383 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -4200,6 +4200,7 @@ void rdma_roce_rescan_device(struct ib_device *ibdev); struct ib_ucontext *ib_uverbs_get_ucontext_file(struct ib_uverbs_file *ufile); +struct ib_ucontext *rdma_get_ucontext(struct ib_udata *udata); int uverbs_destroy_def_handler(struct uverbs_attr_bundle *attrs); -- cgit v1.2.3 From ea4baf7f116a18382df331db2123d98bc1c3cd83 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 18 Dec 2018 14:28:30 +0200 Subject: RDMA: Rename port_callback to init_port Most provider routines are callback routines which ib core invokes. 
_callback suffix doesn't convey information about when such callback is invoked. Therefore, rename port_callback to init_port. Additionally, store the init_port function pointer in ib_device_ops, so that it can be accessed in subsequent patches when binding rdma device to net namespace. Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 10 +++++++--- include/rdma/rdma_vt.h | 3 --- 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index aa1f126d3383..1d1902fd9f87 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2504,6 +2504,12 @@ struct ib_device_ops { */ int (*get_hw_stats)(struct ib_device *device, struct rdma_hw_stats *stats, u8 port, int index); + /* + * This function is called once for each port when a ib device is + * registered. + */ + int (*init_port)(struct ib_device *device, u8 port_num, + struct kobject *port_sysfs); }; struct ib_device { @@ -2620,9 +2626,7 @@ void ib_dealloc_device(struct ib_device *device); void ib_get_device_fw_str(struct ib_device *device, char *str); -int ib_register_device(struct ib_device *device, const char *name, - int (*port_callback)(struct ib_device *, u8, - struct kobject *)); +int ib_register_device(struct ib_device *device, const char *name); void ib_unregister_device(struct ib_device *device); int ib_register_client (struct ib_client *client); diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index dd0ed8048bb4..acb3bc96dfa7 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -250,9 +250,6 @@ struct rvt_driver_provided { */ void (*do_send)(struct rvt_qp *qp); - /* Passed to ib core registration. Callback to create syfs files */ - int (*port_callback)(struct ib_device *, u8, struct kobject *); - /* * Returns a pointer to the undelying hardware's PCI device. 
This is * used to display information as to what hardware is being referenced -- cgit v1.2.3 From 54747231150f0dddf68f2ee29ec2970fcc433909 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 18 Dec 2018 14:15:56 +0200 Subject: RDMA: Introduce and use rdma_device_to_ibdev() Introduce and use rdma_device_to_ibdev() API for those drivers which are registering one sysfs group and also use it in ib_core. In a subsequent patch, the device->provider_ibdev one-to-one mapping no longer holds true while accessing sysfs entries. Therefore, introduce an API rdma_device_to_ibdev() that provides such information. Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 1d1902fd9f87..94b6e1dd4dab 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -4241,4 +4241,27 @@ rdma_set_device_sysfs_group(struct ib_device *dev, dev->groups[1] = group; } +/** + * rdma_device_to_ibdev - Get ib_device pointer from device pointer + * + * @device: device pointer for which ib_device pointer to retrieve + * + * rdma_device_to_ibdev() retrieves ib_device pointer from device. + * + */ +static inline struct ib_device *rdma_device_to_ibdev(struct device *device) +{ + return container_of(device, struct ib_device, dev); +} + +/** + * rdma_device_to_drv_device - Helper macro to reach back to driver's + * ib_device holder structure from device pointer. + * + * NOTE: New drivers should not make use of this API; This API is only for + * existing drivers who have exposed sysfs entries using + * rdma_set_device_sysfs_group().
+ */ +#define rdma_device_to_drv_device(dev, drv_dev_struct, ibdev_member) \ + container_of(rdma_device_to_ibdev(dev), drv_dev_struct, ibdev_member) #endif /* IB_VERBS_H */ -- cgit v1.2.3 From 7527a7b157d1191b23562ed70154ae93bd65f845 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Thu, 17 Jan 2019 20:14:15 +0200 Subject: IB/core: Simplify rdma cgroup registration RDMA cgroup registration routine always returns success, so simplify function to be void and run clang formatter over whole CONFIG_CGROUP_RDMA part of core_priv.h. This reduces unwinding error path for regular registration and future net namespace change functionality for rdma device. Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Acked-by: Tejun Heo Signed-off-by: Jason Gunthorpe --- include/linux/cgroup_rdma.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/cgroup_rdma.h b/include/linux/cgroup_rdma.h index e94290b29e99..ef1bae2983f3 100644 --- a/include/linux/cgroup_rdma.h +++ b/include/linux/cgroup_rdma.h @@ -39,7 +39,7 @@ struct rdmacg_device { * APIs for RDMA/IB stack to publish when a device wants to * participate in resource accounting */ -int rdmacg_register_device(struct rdmacg_device *device); +void rdmacg_register_device(struct rdmacg_device *device); void rdmacg_unregister_device(struct rdmacg_device *device); /* APIs for RDMA/IB stack to charge/uncharge pool specific resources */ -- cgit v1.2.3 From 534fd7aac56a7994d16032f32123def9923e339f Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 13 Jan 2019 16:01:17 +0200 Subject: IB/mlx5: Manage indirection mkey upon DEVX flow for ODP Manage indirection mkey upon DEVX flow to support ODP. To support a page fault event on the indirection mkey it needs to be part of the device mkey radix tree. Both the creation and the deletion flows for a DEVX object which is indirection mkey were adapted to handle that.
Signed-off-by: Yishai Hadas Reviewed-by: Artemy Kovalyov Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/linux/mlx5/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index b6f5839f129a..619d6fee96a1 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -364,6 +364,7 @@ struct mlx5_core_sig_ctx { enum { MLX5_MKEY_MR = 1, MLX5_MKEY_MW, + MLX5_MKEY_INDIRECT_DEVX, }; struct mlx5_core_mkey { -- cgit v1.2.3 From da6a496a34f2fdcab14362cdc5068aac385e7b47 Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Tue, 22 Jan 2019 09:16:08 +0200 Subject: IB/mlx5: Ranges in implicit ODP MR inherit its write access A sub-range in ODP implicit MR should take its write permission from the MR and not be set always to allow. Fixes: d07d1d70ce1a ("IB/umem: Update on demand page (ODP) support") Signed-off-by: Moni Shoua Reviewed-by: Artemy Kovalyov Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/rdma/ib_umem_odp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index d3725cf13ecd..d0024f53626e 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -116,7 +116,7 @@ struct ib_ucontext_per_mm { }; int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access); -struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext_per_mm *per_mm, +struct ib_umem_odp *ib_alloc_odp_umem(struct ib_umem_odp *root_umem, unsigned long addr, size_t size); void ib_umem_odp_release(struct ib_umem_odp *umem_odp); -- cgit v1.2.3 From 61b2fe3c62e5269408e264b2348f96467246d537 Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Tue, 22 Jan 2019 09:16:09 +0200 Subject: IB/mlx5: Remove dead code When CONFIG_INFINIBAND_ON_DEMAND_PAGING is not set there is no caller to ib_alloc_odp_umem() so let's remove it. 
Signed-off-by: Moni Shoua Reviewed-by: Artemy Kovalyov Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/rdma/ib_umem_odp.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index d0024f53626e..dadc96dea39c 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -169,12 +169,6 @@ static inline int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) return -EINVAL; } -static inline struct ib_umem_odp * -ib_alloc_odp_umem(struct ib_ucontext *context, unsigned long addr, size_t size) -{ - return ERR_PTR(-EINVAL); -} - static inline void ib_umem_odp_release(struct ib_umem_odp *umem_odp) {} #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ -- cgit v1.2.3 From 6bf8f22aea0ddd93af822aed8afeeee4acdf7694 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 22 Jan 2019 08:29:56 +0200 Subject: IB/mlx5: Introduce MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD Introduce MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD and its initial implementation. This object is from type class FD and will be used to read DEVX async commands completion. The core layer should allow the driver to set object from type FD in a safe mode, this option was added with a matching comment in place. 
Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/rdma/uverbs_types.h | 1 + include/uapi/rdma/mlx5_user_ioctl_cmds.h | 9 +++++++++ 2 files changed, 10 insertions(+) (limited to 'include') diff --git a/include/rdma/uverbs_types.h b/include/rdma/uverbs_types.h index acb1bfa3cc99..175d761695e1 100644 --- a/include/rdma/uverbs_types.h +++ b/include/rdma/uverbs_types.h @@ -157,6 +157,7 @@ struct uverbs_obj_fd_type { extern const struct uverbs_obj_type_class uverbs_idr_class; extern const struct uverbs_obj_type_class uverbs_fd_class; +void uverbs_close_fd(struct file *f); #define UVERBS_BUILD_BUG_ON(cond) (sizeof(char[1 - 2 * !!(cond)]) - \ sizeof(char)) diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index b8d121d457f1..6ceae29d77cd 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -113,11 +113,20 @@ enum mlx5_ib_devx_umem_methods { MLX5_IB_METHOD_DEVX_UMEM_DEREG, }; +enum mlx5_ib_devx_async_cmd_fd_alloc_attrs { + MLX5_IB_ATTR_DEVX_ASYNC_CMD_FD_ALLOC_HANDLE = (1U << UVERBS_ID_NS_SHIFT), +}; + +enum mlx5_ib_devx_async_cmd_fd_methods { + MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC = (1U << UVERBS_ID_NS_SHIFT), +}; + enum mlx5_ib_objects { MLX5_IB_OBJECT_DEVX = (1U << UVERBS_ID_NS_SHIFT), MLX5_IB_OBJECT_DEVX_OBJ, MLX5_IB_OBJECT_DEVX_UMEM, MLX5_IB_OBJECT_FLOW_MATCHER, + MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD, }; enum mlx5_ib_flow_matcher_create_attrs { -- cgit v1.2.3 From a124edba26270697540f1058bfcd490c1c65b116 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 22 Jan 2019 08:29:57 +0200 Subject: IB/mlx5: Introduce async DEVX obj query API Introduce async DEVX obj query API to get the command response back to user space once it's ready without blocking when calling the firmware. The event's data includes a header with some meta data then the firmware output command data. 
The header includes: - The input 'wr_id' to let application recognizing the response. The input FD attribute is used to have the event data ready on. Downstream patches from this series will implement the file ops to let application read it. Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/mlx5_user_ioctl_cmds.h | 9 +++++++++ include/uapi/rdma/mlx5_user_ioctl_verbs.h | 5 +++++ 2 files changed, 14 insertions(+) (limited to 'include') diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index 6ceae29d77cd..8149d224030b 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -84,6 +84,14 @@ enum mlx5_ib_devx_obj_query_attrs { MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_OUT, }; +enum mlx5_ib_devx_obj_query_async_attrs { + MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_HANDLE = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_CMD_IN, + MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_FD, + MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_WR_ID, + MLX5_IB_ATTR_DEVX_OBJ_QUERY_ASYNC_OUT_LEN, +}; + enum mlx5_ib_devx_query_eqn_attrs { MLX5_IB_ATTR_DEVX_QUERY_EQN_USER_VEC = (1U << UVERBS_ID_NS_SHIFT), MLX5_IB_ATTR_DEVX_QUERY_EQN_DEV_EQN, @@ -94,6 +102,7 @@ enum mlx5_ib_devx_obj_methods { MLX5_IB_METHOD_DEVX_OBJ_DESTROY, MLX5_IB_METHOD_DEVX_OBJ_MODIFY, MLX5_IB_METHOD_DEVX_OBJ_QUERY, + MLX5_IB_METHOD_DEVX_OBJ_ASYNC_QUERY, }; enum mlx5_ib_devx_umem_reg_attrs { diff --git a/include/uapi/rdma/mlx5_user_ioctl_verbs.h b/include/uapi/rdma/mlx5_user_ioctl_verbs.h index 4ef62c0e8452..4a701033b93f 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_verbs.h +++ b/include/uapi/rdma/mlx5_user_ioctl_verbs.h @@ -51,5 +51,10 @@ enum mlx5_ib_uapi_flow_action_packet_reformat_type { MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L3_TUNNEL = 0x3, }; +struct mlx5_ib_uapi_devx_async_cmd_hdr { + __aligned_u64 wr_id; + __u8 out_data[]; +}; + #endif -- cgit v1.2.3 From 
0b5cb3300ae59ed7e93b465dfa2384a6a4df8eb4 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 22 Jan 2019 10:25:20 -0800 Subject: RDMA/srp: Increase max_segment_size The default behavior of the SCSI core is to set the block layer request queue parameter max_segment_size to 64 KB. That means that elements of scatterlists are limited to 64 KB. Since RDMA adapters support larger sizes, increase max_segment_size for the SRP initiator. Notes: - The SCSI max_segment_size parameter was introduced in kernel v5.0. See also commit 50c2e9107f17 ("scsi: introduce a max_segment_size host_template parameters"). - Some other block drivers already set max_segment_size to UINT_MAX, e.g. nbd and rbd. Signed-off-by: Bart Van Assche Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 94b6e1dd4dab..71ea144ec823 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -3715,6 +3715,19 @@ static inline unsigned int ib_sg_dma_len(struct ib_device *dev, return sg_dma_len(sg); } +/** + * ib_dma_max_seg_size - Return the size limit of a single DMA transfer + * @dev: The device to query + * + * The returned value represents a size in bytes. + */ +static inline unsigned int ib_dma_max_seg_size(struct ib_device *dev) +{ + struct device_dma_parameters *p = dev->dma_device->dma_parms; + + return p ? 
p->max_segment_size : UINT_MAX; +} + /** * ib_dma_sync_single_for_cpu - Prepare DMA region to be accessed by CPU * @dev: The device for which the DMA address was created -- cgit v1.2.3 From 459cc69fa4c17caf21de596693d8a07170820a58 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 30 Jan 2019 12:49:11 +0200 Subject: RDMA: Provide safe ib_alloc_device() function All callers to ib_alloc_device() provide a larger size than struct ib_device and rely on the fact that struct ib_device is embedded in their driver specific structure as the first member. Provide a safer variant of ib_alloc_device() that checks and enforces this approach to make sure the drivers are using it right. Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 71ea144ec823..a1a1e710642c 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2621,7 +2621,13 @@ struct ib_client { struct list_head list; }; -struct ib_device *ib_alloc_device(size_t size); +struct ib_device *_ib_alloc_device(size_t size); +#define ib_alloc_device(drv_struct, member) \ + container_of(_ib_alloc_device(sizeof(struct drv_struct) + \ + BUILD_BUG_ON_ZERO(offsetof( \ + struct drv_struct, member))), \ + struct drv_struct, member) + void ib_dealloc_device(struct ib_device *device); void ib_get_device_fw_str(struct ib_device *device, char *str); -- cgit v1.2.3 From 6780c4fa9d6e091b2f206ac429a40e2e8d2e45f3 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Tue, 22 Jan 2019 10:08:22 +0200 Subject: RDMA: Add indication for in kernel API support to IB device Drivers that do not provide kernel verbs support should not be used by ib kernel clients at all. In case a device does not implement all mandatory verbs for kverbs usage mark it as a non kverbs provider and prevent its usage for all clients except for uverbs. 
The device is marked as a non kverbs provider using the 'kverbs_provider' flag which should only be set by the core code. The clients can choose whether kverbs are requested for its usage using the 'no_kverbs_req' flag which is currently set for uverbs only. This patch allows drivers to remove mandatory verbs stubs and simply set the callbacks to NULL. The IB device will be registered as a non-kverbs provider. Note that verbs that are required for the device registration process must be implemented. Signed-off-by: Gal Pressman Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index a1a1e710642c..4183a03b46b5 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2565,6 +2565,8 @@ struct ib_device { __be64 node_guid; u32 local_dma_lkey; u16 is_switch:1; + /* Indicates kernel verbs support, should not be used in drivers */ + u16 kverbs_provider:1; u8 node_type; u8 phys_port_cnt; struct ib_device_attr attrs; @@ -2619,6 +2621,9 @@ struct ib_client { const struct sockaddr *addr, void *client_data); struct list_head list; + + /* kverbs are not required by the client */ + u8 no_kverbs_req:1; }; struct ib_device *_ib_alloc_device(size_t size); -- cgit v1.2.3 From 0ad699c0edc97a864177679dd67f2ccd73b07cb7 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 30 Jan 2019 12:48:58 +0200 Subject: RDMA/core: Simplify restrack interface In the current implementation, we have one restrack root per-device and all users are simply providing it directly. Let's simplify the interface and have callers provide the ib_device and internally access the restrack_root. 
Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/rdma/restrack.h | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h index 8f179be9d9a9..f756fc48eee5 100644 --- a/include/rdma/restrack.h +++ b/include/rdma/restrack.h @@ -49,6 +49,7 @@ enum rdma_restrack_type { }; #define RDMA_RESTRACK_HASH_BITS 8 +struct ib_device; struct rdma_restrack_entry; /** @@ -122,25 +123,9 @@ struct rdma_restrack_entry { bool user; }; -/** - * rdma_restrack_init() - initialize resource tracking - * @res: resource tracking root - */ -void rdma_restrack_init(struct rdma_restrack_root *res); - -/** - * rdma_restrack_clean() - clean resource tracking - * @res: resource tracking root - */ -void rdma_restrack_clean(struct rdma_restrack_root *res); - -/** - * rdma_restrack_count() - the current usage of specific object - * @res: resource entry - * @type: actual type of object to operate - * @ns: PID namespace - */ -int rdma_restrack_count(struct rdma_restrack_root *res, +void rdma_restrack_init(struct ib_device *dev); +void rdma_restrack_clean(struct ib_device *dev); +int rdma_restrack_count(struct ib_device *dev, enum rdma_restrack_type type, struct pid_namespace *ns); -- cgit v1.2.3 From 02da37509705d3ba6a58fe4799a0caf6b4baecb0 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 30 Jan 2019 12:49:02 +0200 Subject: RDMA/core: Use the ops infrastructure to keep all callbacks in one place As preparation to hide rdma_restrack_root, refactor the code to use the ops structure instead of a special callback which is hidden in rdma_restrack_root. 
Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 5 +++++ include/rdma/restrack.h | 7 ------- 2 files changed, 5 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 4183a03b46b5..5fc3be884444 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2510,6 +2510,11 @@ struct ib_device_ops { */ int (*init_port)(struct ib_device *device, u8 port_num, struct kobject *port_sysfs); + /** + * Allows rdma drivers to add their own restrack attributes. + */ + int (*fill_res_entry)(struct sk_buff *msg, + struct rdma_restrack_entry *entry); }; struct ib_device { diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h index f756fc48eee5..cc66cc7a11d3 100644 --- a/include/rdma/restrack.h +++ b/include/rdma/restrack.h @@ -65,13 +65,6 @@ struct rdma_restrack_root { * @hash: global database for all resources per-device */ DECLARE_HASHTABLE(hash, RDMA_RESTRACK_HASH_BITS); - /** - * @fill_res_entry: driver-specific fill function - * - * Allows rdma drivers to add their own restrack attributes. - */ - int (*fill_res_entry)(struct sk_buff *msg, - struct rdma_restrack_entry *entry); }; /** -- cgit v1.2.3 From ddf922c31fedd19c5b89a269c35e5c8b68c64327 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 19:21:01 -0800 Subject: IB/hfi1, IB/rdmavt: Allow for extending of QP's s_ack_queue The OPFN protocol uses the COMPARE_SWAP request to exchange data between the requester and the responder and therefore needs to be stored in the QP's s_ack_queue when the request is received on the responder side. However, because the user does not know anything about the OPFN protocol, this extra entry in the queue cannot be advertised to the user. This patch adds an extra entry in a QP's s_ack_queue. 
Reviewed-by: Mike Marciniszyn Signed-off-by: Mitko Haralanov Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- include/rdma/rdma_vt.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index acb3bc96dfa7..168e40be183c 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -182,6 +182,7 @@ struct rvt_driver_params { u32 max_mad_size; u8 qos_shift; u8 max_rdma_atomic; + u8 extra_rdma_atomic; u8 reserved_operations; }; @@ -519,7 +520,14 @@ static inline unsigned rvt_get_npkeys(struct rvt_dev_info *rdi) */ static inline unsigned int rvt_max_atomic(struct rvt_dev_info *rdi) { - return rdi->dparms.max_rdma_atomic + 1; + return rdi->dparms.max_rdma_atomic + + rdi->dparms.extra_rdma_atomic + 1; +} + +static inline unsigned int rvt_size_atomic(struct rvt_dev_info *rdi) +{ + return rdi->dparms.max_rdma_atomic + + rdi->dparms.extra_rdma_atomic; } /* -- cgit v1.2.3 From da82334219bc386ef7ea5b4b185a339a973dd513 Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Tue, 22 Jan 2019 08:48:41 +0200 Subject: IB/core: Allocate a bit for SRQ ODP support The ODP support matrix is per operation and per transport. The support for each transport (RC, UD, etc.) is described with a bit field. ODP for SRQ WQEs is considered a different kind of support from ODP for RQ WQs and therefore needs a different capability bit. 
Signed-off-by: Moni Shoua Reviewed-by: Majd Dibbiny Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 5fc3be884444..5eefdea62831 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -268,6 +268,7 @@ enum ib_odp_transport_cap_bits { IB_ODP_SUPPORT_WRITE = 1 << 2, IB_ODP_SUPPORT_READ = 1 << 3, IB_ODP_SUPPORT_ATOMIC = 1 << 4, + IB_ODP_SUPPORT_SRQ_RECV = 1 << 5, }; struct ib_odp_caps { -- cgit v1.2.3 From 52a72e2a395fa3c5ab5df41058a8511e87215730 Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Tue, 22 Jan 2019 08:48:42 +0200 Subject: IB/uverbs: Expose XRC ODP device capabilities Expose XRC ODP capabilities as part of the extended device capabilities. Signed-off-by: Moni Shoua Reviewed-by: Majd Dibbiny Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 1 + include/uapi/rdma/ib_user_verbs.h | 2 ++ 2 files changed, 3 insertions(+) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 5eefdea62831..8219c07340a9 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -277,6 +277,7 @@ struct ib_odp_caps { uint32_t rc_odp_caps; uint32_t uc_odp_caps; uint32_t ud_odp_caps; + uint32_t xrc_odp_caps; } per_transport_caps; }; diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 480d9a60b68e..0474c7400268 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -270,6 +270,8 @@ struct ib_uverbs_ex_query_device_resp { struct ib_uverbs_tm_caps tm_caps; struct ib_uverbs_cq_moderation_caps cq_moderation_caps; __aligned_u64 max_dm_size; + __u32 xrc_odp_caps; + __u32 reserved; }; struct ib_uverbs_query_port { -- cgit v1.2.3 From a163afc88556e099271a7b423295bc5176fcecce Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 31 Jan 2019 08:30:34 
-0800 Subject: IB/core: Remove ib_sg_dma_address() and ib_sg_dma_len() Keeping single line wrapper functions is not useful. Hence remove the ib_sg_dma_address() and ib_sg_dma_len() functions. This patch does not change any functionality. Signed-off-by: Bart Van Assche Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 27 --------------------------- 1 file changed, 27 deletions(-) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 8219c07340a9..f7e8709e48cd 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -3705,33 +3705,6 @@ static inline void ib_dma_unmap_sg_attrs(struct ib_device *dev, { dma_unmap_sg_attrs(dev->dma_device, sg, nents, direction, dma_attrs); } -/** - * ib_sg_dma_address - Return the DMA address from a scatter/gather entry - * @dev: The device for which the DMA addresses were created - * @sg: The scatter/gather entry - * - * Note: this function is obsolete. To do: change all occurrences of - * ib_sg_dma_address() into sg_dma_address(). - */ -static inline u64 ib_sg_dma_address(struct ib_device *dev, - struct scatterlist *sg) -{ - return sg_dma_address(sg); -} - -/** - * ib_sg_dma_len - Return the DMA length from a scatter/gather entry - * @dev: The device for which the DMA addresses were created - * @sg: The scatter/gather entry - * - * Note: this function is obsolete. To do: change all occurrences of - * ib_sg_dma_len() into sg_dma_len(). - */ -static inline unsigned int ib_sg_dma_len(struct ib_device *dev, - struct scatterlist *sg) -{ - return sg_dma_len(sg); -} /** * ib_dma_max_seg_size - Return the size limit of a single DMA transfer -- cgit v1.2.3 From 668aa15b5bf87f156ec805cb7348c785c56b82ab Mon Sep 17 00:00:00 2001 From: Kamal Heib Date: Tue, 29 Jan 2019 12:08:50 +0200 Subject: RDMA/rxe: Improve loopback marking Currently a packet is marked for loopback only if the source and destination addresses equals. 
This is not enough when multiple gids are present in rxe device's gid table and the traffic is from one gid to another. Fix it by marking the packet for loopback if the destination MAC address is equal to the source MAC address. Signed-off-by: Kamal Heib Reviewed-by: Yuval Shaia Tested-by: Yuval Shaia Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/rdma_user_rxe.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/rdma/rdma_user_rxe.h b/include/uapi/rdma/rdma_user_rxe.h index 44ef6a3b7afc..aae2e696bb38 100644 --- a/include/uapi/rdma/rdma_user_rxe.h +++ b/include/uapi/rdma/rdma_user_rxe.h @@ -58,8 +58,7 @@ struct rxe_global_route { struct rxe_av { __u8 port_num; __u8 network_type; - __u16 reserved1; - __u32 reserved2; + __u8 dmac[6]; struct rxe_global_route grh; union { struct sockaddr_in _sockaddr_in; -- cgit v1.2.3 From f76903d574b26bc596951a5c5e757eb02c67abbd Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Tue, 29 Jan 2019 13:33:11 -0800 Subject: RDMA/IWPM: refactor the IWPM message attribute names In order to add new IWPM_NL attributes, the enums for the IWPM commands attributes are refactored such that a new attribute can be added without breaking ABI version 3. Instead of sharing nl attribute enums for both request and response messages, we create separate enums for each IWPM message request and reply. This allows us to extend any given IWPM message by adding new attributes for just that message. These new enums are created, though, in a way to avoid breaking ABI version 3. 
Signed-off-by: Steve Wise Reviewed-by: Tatyana Nikolova Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/rdma_netlink.h | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 2e18b77a817f..42d53e182d5f 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -83,13 +83,20 @@ enum { IWPM_NLA_MANAGE_MAPPING_UNSPEC = 0, IWPM_NLA_MANAGE_MAPPING_SEQ, IWPM_NLA_MANAGE_ADDR, - IWPM_NLA_MANAGE_MAPPED_LOC_ADDR, + IWPM_NLA_MANAGE_MAPPING_MAX +}; + +enum { + IWPM_NLA_RMANAGE_MAPPING_UNSPEC = 0, + IWPM_NLA_RMANAGE_MAPPING_SEQ, + IWPM_NLA_RMANAGE_ADDR, + IWPM_NLA_RMANAGE_MAPPED_LOC_ADDR, + /* The following maintains bisectability of rdma-core */ + IWPM_NLA_MANAGE_MAPPED_LOC_ADDR = IWPM_NLA_RMANAGE_MAPPED_LOC_ADDR, IWPM_NLA_RMANAGE_MAPPING_ERR, IWPM_NLA_RMANAGE_MAPPING_MAX }; -#define IWPM_NLA_MANAGE_MAPPING_MAX 3 -#define IWPM_NLA_QUERY_MAPPING_MAX 4 #define IWPM_NLA_MAPINFO_SEND_MAX 3 enum { @@ -97,6 +104,14 @@ enum { IWPM_NLA_QUERY_MAPPING_SEQ, IWPM_NLA_QUERY_LOCAL_ADDR, IWPM_NLA_QUERY_REMOTE_ADDR, + IWPM_NLA_QUERY_MAPPING_MAX, +}; + +enum { + IWPM_NLA_RQUERY_MAPPING_UNSPEC = 0, + IWPM_NLA_RQUERY_MAPPING_SEQ, + IWPM_NLA_RQUERY_LOCAL_ADDR, + IWPM_NLA_RQUERY_REMOTE_ADDR, IWPM_NLA_RQUERY_MAPPED_LOC_ADDR, IWPM_NLA_RQUERY_MAPPED_REM_ADDR, IWPM_NLA_RQUERY_MAPPING_ERR, -- cgit v1.2.3 From b0bad9ad514fc1dd8890f1749f5d2425a73270e3 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Tue, 29 Jan 2019 13:33:16 -0800 Subject: RDMA/IWPM: Support no port mapping requirements A soft iwarp driver that uses the host TCP stack via a kernel mode socket does not need port mapping. In fact, if the port map daemon, iwpmd, is running, then iwpmd must not try and create/bind a socket to the actual port for a soft iwarp connection, since the driver already has that socket bound. 
Yet if the soft iwarp driver wants to interoperate with hard iwarp devices that -are- using port mapping, then the soft iwarp driver's mappings still need to be maintained and advertised by the iwpm protocol. This patch enhances the rdma driver<->iwcm interface to allow an iwarp driver to specify that it does not want port mapping. The iwpm kernel<->iwpmd interface is also enhanced to pass up this information on map requests. Care is taken to interoperate with the current iwpmd version (ABI version 3) and only use the new NL attributes if iwpmd supports ABI version 4. The ABI version define has also been created in rdma_netlink.h so both kernel and user code can share it. The iwcm and iwpmd negotiate the ABI version to use with a new HELLO netlink message. Signed-off-by: Steve Wise Reviewed-by: Tatyana Nikolova Signed-off-by: Jason Gunthorpe --- include/rdma/iw_cm.h | 13 +++++++++++++ include/rdma/iw_portmap.h | 15 ++++++++++++++- include/uapi/rdma/rdma_netlink.h | 24 ++++++++++++++++++++++++ 3 files changed, 51 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/rdma/iw_cm.h b/include/rdma/iw_cm.h index 5cd7701db148..48512abd3162 100644 --- a/include/rdma/iw_cm.h +++ b/include/rdma/iw_cm.h @@ -105,6 +105,18 @@ struct iw_cm_conn_param { u32 qpn; }; +enum iw_flags { + + /* + * This flag allows the iwcm and iwpmd to still advertise + * mappings but the real and mapped port numbers are the + * same. Further, iwpmd will not bind any user socket to + * reserve the port. This is required for soft iwarp + * to play in the port mapped iwarp space. 
+ */ + IW_F_NO_PORT_MAP = (1 << 0), +}; + struct iw_cm_verbs { void (*add_ref)(struct ib_qp *qp); @@ -127,6 +139,7 @@ struct iw_cm_verbs { int (*destroy_listen)(struct iw_cm_id *cm_id); char ifname[IFNAMSIZ]; + enum iw_flags driver_flags; }; /** diff --git a/include/rdma/iw_portmap.h b/include/rdma/iw_portmap.h index fda31673a562..84fac196ef80 100644 --- a/include/rdma/iw_portmap.h +++ b/include/rdma/iw_portmap.h @@ -58,6 +58,7 @@ struct iwpm_sa_data { struct sockaddr_storage mapped_loc_addr; struct sockaddr_storage rem_addr; struct sockaddr_storage mapped_rem_addr; + u32 flags; }; /** @@ -205,9 +206,11 @@ int iwpm_get_remote_info(struct sockaddr_storage *mapped_loc_addr, * @local_addr: Local ip/tcp address * @mapped_addr: Mapped local ip/tcp address * @nl_client: The index of the netlink client + * @map_flags: IWPM mapping flags */ int iwpm_create_mapinfo(struct sockaddr_storage *local_addr, - struct sockaddr_storage *mapped_addr, u8 nl_client); + struct sockaddr_storage *mapped_addr, u8 nl_client, + u32 map_flags); /** * iwpm_remove_mapinfo - Remove local and mapped IPv4/IPv6 address @@ -221,4 +224,14 @@ int iwpm_create_mapinfo(struct sockaddr_storage *local_addr, int iwpm_remove_mapinfo(struct sockaddr_storage *local_addr, struct sockaddr_storage *mapped_addr); +/** + * iwpm_hello_cb - Process a hello message from iwpmd + * + * @skb: + * @cb: Contains the received message (payload and netlink header) + * + * Using the received port mapper pid, send the kernel's abi_version + * after adjusting it to support the iwpmd version. 
+ */ +int iwpm_hello_cb(struct sk_buff *skb, struct netlink_callback *cb); #endif /* _IW_PORTMAP_H */ diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 42d53e182d5f..0f5263767fb4 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -35,6 +35,19 @@ enum { RDMA_NL_RDMA_CM_NUM_ATTR, }; +/* The minimum version that the iwpm kernel supports */ +#define IWPM_UABI_VERSION_MIN 3 + +/* The latest version that the iwpm kernel supports */ +#define IWPM_UABI_VERSION 4 + +/* iwarp port mapper message flags */ +enum { + + /* Do not map the port for this IWPM request */ + IWPM_FLAGS_NO_PORT_MAP = (1 << 0), +}; + /* iwarp port mapper op-codes */ enum { RDMA_NL_IWPM_REG_PID = 0, @@ -45,6 +58,7 @@ enum { RDMA_NL_IWPM_HANDLE_ERR, RDMA_NL_IWPM_MAPINFO, RDMA_NL_IWPM_MAPINFO_NUM, + RDMA_NL_IWPM_HELLO, RDMA_NL_IWPM_NUM_OPS }; @@ -83,6 +97,7 @@ enum { IWPM_NLA_MANAGE_MAPPING_UNSPEC = 0, IWPM_NLA_MANAGE_MAPPING_SEQ, IWPM_NLA_MANAGE_ADDR, + IWPM_NLA_MANAGE_FLAGS, IWPM_NLA_MANAGE_MAPPING_MAX }; @@ -98,12 +113,14 @@ enum { }; #define IWPM_NLA_MAPINFO_SEND_MAX 3 +#define IWPM_NLA_REMOVE_MAPPING_MAX 3 enum { IWPM_NLA_QUERY_MAPPING_UNSPEC = 0, IWPM_NLA_QUERY_MAPPING_SEQ, IWPM_NLA_QUERY_LOCAL_ADDR, IWPM_NLA_QUERY_REMOTE_ADDR, + IWPM_NLA_QUERY_FLAGS, IWPM_NLA_QUERY_MAPPING_MAX, }; @@ -129,6 +146,7 @@ enum { IWPM_NLA_MAPINFO_UNSPEC = 0, IWPM_NLA_MAPINFO_LOCAL_ADDR, IWPM_NLA_MAPINFO_MAPPED_ADDR, + IWPM_NLA_MAPINFO_FLAGS, IWPM_NLA_MAPINFO_MAX }; @@ -147,6 +165,12 @@ enum { IWPM_NLA_ERR_MAX }; +enum { + IWPM_NLA_HELLO_UNSPEC = 0, + IWPM_NLA_HELLO_ABI_VERSION, + IWPM_NLA_HELLO_MAX +}; + /* * Local service operations: * RESOLVE - The client requests the local service to resolve a path. 
-- cgit v1.2.3 From a78e8723a50530d15faa25cc0b6f009bcd251c20 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 16 Jan 2019 09:55:41 +0200 Subject: RDMA/cma: Remove CM_ID statistics provided by rdma-cm module Netlink statistics exported by rdma-cm never had any working user space component published to the mailing list or to any open source project. Canvassing various proprietary users, and the original requester, we find that there are no real users of this interface. This patch simply removes all occurrences of RDMA CM netlink in favour of the modern nldev implementation, which provides the same information and is accompanied by a widely used user space component. Signed-off-by: Leon Romanovsky Reviewed-by: Steve Wise Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/rdma_netlink.h | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 0f5263767fb4..3a9e681e4257 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -5,8 +5,7 @@ #include enum { - RDMA_NL_RDMA_CM = 1, - RDMA_NL_IWCM, + RDMA_NL_IWCM = 2, RDMA_NL_RSVD, RDMA_NL_LS, /* RDMA Local Services */ RDMA_NL_NLDEV, /* RDMA device interface */ @@ -14,8 +13,7 @@ enum { }; enum { - RDMA_NL_GROUP_CM = 1, - RDMA_NL_GROUP_IWPM, + RDMA_NL_GROUP_IWPM = 2, RDMA_NL_GROUP_LS, RDMA_NL_NUM_GROUPS }; @@ -24,17 +22,6 @@ enum { #define RDMA_NL_GET_OP(type) (type & ((1 << 10) - 1)) #define RDMA_NL_GET_TYPE(client, op) ((client << 10) + op) -enum { - RDMA_NL_RDMA_CM_ID_STATS = 0, - RDMA_NL_RDMA_CM_NUM_OPS -}; - -enum { - RDMA_NL_RDMA_CM_ATTR_SRC_ADDR = 1, - RDMA_NL_RDMA_CM_ATTR_DST_ADDR, - RDMA_NL_RDMA_CM_NUM_ATTR, -}; - /* The minimum version that the iwpm kernel supports */ #define IWPM_UABI_VERSION_MIN 3 -- cgit v1.2.3 From a2bfd708b17adb6e597e70d4eca824667f2d4e3c Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Tue, 5 Feb 2019 11:33:22 -0800 Subject: RDMA/iwpm: move
kdoc comments to functions Move the iwpm kdoc comments from the prototype declarations to above the function bodies. There are no functional changes in this patch. Signed-off-by: Steve Wise Signed-off-by: Jason Gunthorpe --- include/rdma/iw_portmap.h | 149 ---------------------------------------------- 1 file changed, 149 deletions(-) (limited to 'include') diff --git a/include/rdma/iw_portmap.h b/include/rdma/iw_portmap.h index 84fac196ef80..b9fee7feeeb5 100644 --- a/include/rdma/iw_portmap.h +++ b/include/rdma/iw_portmap.h @@ -61,177 +61,28 @@ struct iwpm_sa_data { u32 flags; }; -/** - * iwpm_init - Allocate resources for the iwarp port mapper - * - * Should be called when network interface goes up. - */ int iwpm_init(u8); - -/** - * iwpm_exit - Deallocate resources for the iwarp port mapper - * - * Should be called when network interface goes down. - */ int iwpm_exit(u8); - -/** - * iwpm_valid_pid - Check if the userspace iwarp port mapper pid is valid - * - * Returns true if the pid is greater than zero, otherwise returns false - */ int iwpm_valid_pid(void); - -/** - * iwpm_register_pid - Send a netlink query to userspace - * to get the iwarp port mapper pid - * @pm_msg: Contains driver info to send to the userspace port mapper - * @nl_client: The index of the netlink client - */ int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client); - -/** - * iwpm_add_mapping - Send a netlink add mapping request to - * the userspace port mapper - * @pm_msg: Contains the local ip/tcp address info to send - * @nl_client: The index of the netlink client - * - * If the request is successful, the pm_msg stores - * the port mapper response (mapped address info) - */ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client); - -/** - * iwpm_add_and_query_mapping - Send a netlink add and query mapping request - * to the userspace port mapper - * @pm_msg: Contains the local and remote ip/tcp address info to send - * @nl_client: The index of the netlink client - * - * 
If the request is successful, the pm_msg stores the - * port mapper response (mapped local and remote address info) - */ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client); - -/** - * iwpm_remove_mapping - Send a netlink remove mapping request - * to the userspace port mapper - * - * @local_addr: Local ip/tcp address to remove - * @nl_client: The index of the netlink client - */ int iwpm_remove_mapping(struct sockaddr_storage *local_addr, u8 nl_client); - -/** - * iwpm_register_pid_cb - Process the port mapper response to - * iwpm_register_pid query - * @skb: - * @cb: Contains the received message (payload and netlink header) - * - * If successful, the function receives the userspace port mapper pid - * which is used in future communication with the port mapper - */ int iwpm_register_pid_cb(struct sk_buff *, struct netlink_callback *); - -/** - * iwpm_add_mapping_cb - Process the port mapper response to - * iwpm_add_mapping request - * @skb: - * @cb: Contains the received message (payload and netlink header) - */ int iwpm_add_mapping_cb(struct sk_buff *, struct netlink_callback *); - -/** - * iwpm_add_and_query_mapping_cb - Process the port mapper response to - * iwpm_add_and_query_mapping request - * @skb: - * @cb: Contains the received message (payload and netlink header) - */ int iwpm_add_and_query_mapping_cb(struct sk_buff *, struct netlink_callback *); - -/** - * iwpm_remote_info_cb - Process remote connecting peer address info, which - * the port mapper has received from the connecting peer - * - * @cb: Contains the received message (payload and netlink header) - * - * Stores the IPv4/IPv6 address info in a hash table - */ int iwpm_remote_info_cb(struct sk_buff *, struct netlink_callback *); - -/** - * iwpm_mapping_error_cb - Process port mapper notification for error - * - * @skb: - * @cb: Contains the received message (payload and netlink header) - */ int iwpm_mapping_error_cb(struct sk_buff *, struct netlink_callback *); - -/** - * 
iwpm_mapping_info_cb - Process a notification that the userspace - * port mapper daemon is started - * @skb: - * @cb: Contains the received message (payload and netlink header) - * - * Using the received port mapper pid, send all the local mapping - * info records to the userspace port mapper - */ int iwpm_mapping_info_cb(struct sk_buff *, struct netlink_callback *); - -/** - * iwpm_ack_mapping_info_cb - Process the port mapper ack for - * the provided local mapping info records - * @skb: - * @cb: Contains the received message (payload and netlink header) - */ int iwpm_ack_mapping_info_cb(struct sk_buff *, struct netlink_callback *); - -/** - * iwpm_get_remote_info - Get the remote connecting peer address info - * - * @mapped_loc_addr: Mapped local address of the listening peer - * @mapped_rem_addr: Mapped remote address of the connecting peer - * @remote_addr: To store the remote address of the connecting peer - * @nl_client: The index of the netlink client - * - * The remote address info is retrieved and provided to the client in - * the remote_addr. 
After that it is removed from the hash table - */ int iwpm_get_remote_info(struct sockaddr_storage *mapped_loc_addr, struct sockaddr_storage *mapped_rem_addr, struct sockaddr_storage *remote_addr, u8 nl_client); - -/** - * iwpm_create_mapinfo - Store local and mapped IPv4/IPv6 address - * info in a hash table - * @local_addr: Local ip/tcp address - * @mapped_addr: Mapped local ip/tcp address - * @nl_client: The index of the netlink client - * @map_flags: IWPM mapping flags - */ int iwpm_create_mapinfo(struct sockaddr_storage *local_addr, struct sockaddr_storage *mapped_addr, u8 nl_client, u32 map_flags); - -/** - * iwpm_remove_mapinfo - Remove local and mapped IPv4/IPv6 address - * info from the hash table - * @local_addr: Local ip/tcp address - * @mapped_addr: Mapped local ip/tcp address - * - * Returns err code if mapping info is not found in the hash table, - * otherwise returns 0 - */ int iwpm_remove_mapinfo(struct sockaddr_storage *local_addr, struct sockaddr_storage *mapped_addr); -/** - * iwpm_hello_cb - Process a hello message from iwpmd - * - * @skb: - * @cb: Contains the received message (payload and netlink header) - * - * Using the received port mapper pid, send the kernel's abi_version - * after adjusting it to support the iwpmd version. - */ int iwpm_hello_cb(struct sk_buff *skb, struct netlink_callback *cb); #endif /* _IW_PORTMAP_H */ -- cgit v1.2.3 From 385156c5f2a61834666f079ee66338f177c65c28 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 19:29:44 -0800 Subject: IB/hfi: Move RC functions into a header file This patch moves some RC helper functions into a header file so that they can be called from both RC and TID RDMA functions. In addition, a common function for rewinding a request is created in rdmavt so that it can be shared between qib and hfi1 driver. 
Reviewed-by: Mike Marciniszyn Signed-off-by: Mitko Haralanov Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- include/rdma/rdmavt_qp.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index cbafb1878669..56a9221378d9 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -628,6 +628,16 @@ __be32 rvt_compute_aeth(struct rvt_qp *qp); */ void rvt_get_credit(struct rvt_qp *qp, u32 aeth); +/** + * rvt_restart_sge - rewind the sge state for a wqe + * @ss: the sge state pointer + * @wqe: the wqe to rewind + * @len: the data length from the start of the wqe in bytes + * + * Returns the remaining data length. + */ +u32 rvt_restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe, u32 len); + /** * @qp - the qp pair * @len - the length -- cgit v1.2.3 From 838b6fd2d9ca29998869e4d1ecf4566efe807666 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 19:30:07 -0800 Subject: IB/hfi1: TID RDMA RcvArray programming and TID allocation TID entries are used by hfi1 hardware to receive data payload from incoming packets directly into a user buffer and thus avoid data copying by software. This patch implements the functions for TID allocation, freeing, and programming TID RcvArray entries in hardware for kernel clients. TID entries are managed via lists of TID groups similar to PSM. Furthermore, to track TID resource allocation for each request, software flows are also allocated and freed as needed. Since software flows consume large amount of memory for tracking TID allocation and freeing, it is generally desirable to allocate them dynamically in the send queue and only for TID RDMA requests, but pre-allocate them for receive queue because the send queue could have thousands of entries while the receive queue has only a limited number of entries. 
Signed-off-by: Mitko Haralanov Signed-off-by: Ashutosh Dixit Signed-off-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- include/rdma/rdmavt_qp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 56a9221378d9..9095a0b71250 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -174,6 +174,7 @@ struct rvt_swqe { u32 lpsn; /* last packet sequence number */ u32 ssn; /* send sequence number */ u32 length; /* total length of data in sg_list */ + void *priv; /* driver dependent field */ struct rvt_sge sg_list[0]; }; @@ -235,6 +236,7 @@ struct rvt_ack_entry { u32 lpsn; u8 opcode; u8 sent; + void *priv; }; #define RC_QP_SCALING_INTERVAL 5 -- cgit v1.2.3 From 742a3826cf82395e304df99f6494d04b0dd03a84 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 19:30:40 -0800 Subject: IB/hfi1: Add functions to build TID RDMA READ request This patch adds the helper functions to build the TID RDMA READ request on the requester side. The key is to allocate TID resources (TID flow and TID entries) and send the resource information to the responder side along with the read request. Since the TID resources are limited, each TID RDMA READ request has to be split into segments with a default segment size of 256K. A software flow is allocated to track the data transaction for each segment. The work request opcode, packet opcode, and packet formats for TID RDMA READ protocol are also defined in this patch. 
Reviewed-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- include/rdma/ib_hdrs.h | 9 +++++++- include/rdma/tid_rdma_defs.h | 52 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 1 deletion(-) create mode 100644 include/rdma/tid_rdma_defs.h (limited to 'include') diff --git a/include/rdma/ib_hdrs.h b/include/rdma/ib_hdrs.h index 6e35416170a3..58a0a0f99e7f 100644 --- a/include/rdma/ib_hdrs.h +++ b/include/rdma/ib_hdrs.h @@ -1,5 +1,5 @@ /* - * Copyright(c) 2016 Intel Corporation. + * Copyright(c) 2016 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -100,6 +100,8 @@ struct ib_atomic_eth { __be64 compare_data; /* potentially unaligned */ } __packed; +#include <rdma/tid_rdma_defs.h> + union ib_ehdrs { struct { __be32 deth[2]; @@ -117,6 +119,11 @@ union ib_ehdrs { __be32 aeth; __be32 ieth; struct ib_atomic_eth atomic_eth; + /* TID RDMA headers */ + union { + struct tid_rdma_read_req r_req; + struct tid_rdma_read_resp r_rsp; + } tid_rdma; } __packed; struct ib_other_headers { diff --git a/include/rdma/tid_rdma_defs.h b/include/rdma/tid_rdma_defs.h new file mode 100644 index 000000000000..1c431ea32b52 --- /dev/null +++ b/include/rdma/tid_rdma_defs.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ +/* + * Copyright(c) 2018 Intel Corporation.
+ * + */ + +#ifndef TID_RDMA_DEFS_H +#define TID_RDMA_DEFS_H + +#include + +struct tid_rdma_read_req { + __le32 kdeth0; + __le32 kdeth1; + struct ib_reth reth; + __be32 tid_flow_psn; + __be32 tid_flow_qp; + __be32 verbs_qp; +}; + +struct tid_rdma_read_resp { + __le32 kdeth0; + __le32 kdeth1; + __be32 aeth; + __be32 reserved[4]; + __be32 verbs_psn; + __be32 verbs_qp; +}; + +/* + * TID RDMA Opcodes + */ +#define IB_OPCODE_TID_RDMA 0xe0 +enum { + IB_OPCODE_READ_REQ = 0x4, + IB_OPCODE_READ_RESP = 0x5, + + IB_OPCODE(TID_RDMA, READ_REQ), + IB_OPCODE(TID_RDMA, READ_RESP), +}; + +#define TID_OP(x) IB_OPCODE_TID_RDMA_##x + +/* + * Define TID RDMA specific WR opcodes. The ib_wr_opcode + * enum already provides some reserved values for use by + * low level drivers. Two of those are used but renamed + * to be more descriptive. + */ +#define IB_WR_TID_RDMA_READ IB_WR_RESERVED2 + +#endif /* TID_RDMA_DEFS_H */ -- cgit v1.2.3 From 039cd3daf19b9acbf080054d765cbceac842b6a0 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 19:31:57 -0800 Subject: IB/hfi1: Increment the retry timeout value for TID RDMA READ request The RC retry timeout value is based on the estimated time for the response packet to come back. However, for TID RDMA READ request, due to the use of header suppression, the driver is normally not notified for each incoming response packet until the last TID RDMA READ response packet. Consequently, the retry timeout value should be extended to cover the transaction time for the entire length of a segment (default 256K) instead of that for a single packet. This patch addresses the issue by introducing new retry timer functions to account for multiple packets and wrapper functions for backward compatibility. 
Reviewed-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- include/rdma/rdma_vt.h | 12 +++++++++--- include/rdma/rdmavt_qp.h | 6 +++++- 2 files changed, 14 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index 168e40be183c..87d66c9630d7 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -574,9 +574,10 @@ static inline struct rvt_qp *rvt_lookup_qpn(struct rvt_dev_info *rdi, /** * rvt_mod_retry_timer - mod a retry timer * @qp - the QP + * @shift - timeout shift to wait for multiple packets * Modify a potentially already running retry timer */ -static inline void rvt_mod_retry_timer(struct rvt_qp *qp) +static inline void rvt_mod_retry_timer_ext(struct rvt_qp *qp, u8 shift) { struct ib_qp *ibqp = &qp->ibqp; struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device); @@ -584,8 +585,13 @@ static inline void rvt_mod_retry_timer(struct rvt_qp *qp) lockdep_assert_held(&qp->s_lock); qp->s_flags |= RVT_S_TIMER; /* 4.096 usec. 
* (1 << qp->timeout) */ - mod_timer(&qp->s_timer, jiffies + qp->timeout_jiffies + - rdi->busy_jiffies); + mod_timer(&qp->s_timer, jiffies + rdi->busy_jiffies + + (qp->timeout_jiffies << shift)); +} + +static inline void rvt_mod_retry_timer(struct rvt_qp *qp) +{ + return rvt_mod_retry_timer_ext(qp, 0); } struct rvt_dev_info *rvt_alloc_device(size_t size, int nports); diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 9095a0b71250..d8d88d023092 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -688,7 +688,11 @@ enum hrtimer_restart rvt_rc_rnr_retry(struct hrtimer *t); void rvt_add_rnr_timer(struct rvt_qp *qp, u32 aeth); void rvt_del_timers_sync(struct rvt_qp *qp); void rvt_stop_rc_timers(struct rvt_qp *qp); -void rvt_add_retry_timer(struct rvt_qp *qp); +void rvt_add_retry_timer_ext(struct rvt_qp *qp, u8 shift); +static inline void rvt_add_retry_timer(struct rvt_qp *qp) +{ + rvt_add_retry_timer_ext(qp, 0); +} void rvt_copy_sge(struct rvt_qp *qp, struct rvt_sge_state *ss, void *data, u32 length, -- cgit v1.2.3 From c098bbb00cd1986cbb58ed1712643f80ed00fcc3 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:48:28 -0800 Subject: IB/hfi1: Build TID RDMA WRITE request This patch adds the functions to build TID RDMA WRITE request. The work request opcode, packet opcode, and packet formats for TID RDMA WRITE protocol are also defined in this patch. 
Signed-off-by: Mitko Haralanov Signed-off-by: Mike Marciniszyn Signed-off-by: Ashutosh Dixit Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- include/rdma/ib_hdrs.h | 5 ++++ include/rdma/tid_rdma_defs.h | 56 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+) (limited to 'include') diff --git a/include/rdma/ib_hdrs.h b/include/rdma/ib_hdrs.h index 58a0a0f99e7f..9a90bd031e8c 100644 --- a/include/rdma/ib_hdrs.h +++ b/include/rdma/ib_hdrs.h @@ -123,6 +123,11 @@ union ib_ehdrs { union { struct tid_rdma_read_req r_req; struct tid_rdma_read_resp r_rsp; + struct tid_rdma_write_req w_req; + struct tid_rdma_write_resp w_rsp; + struct tid_rdma_write_data w_data; + struct tid_rdma_resync resync; + struct tid_rdma_ack ack; } tid_rdma; } __packed; diff --git a/include/rdma/tid_rdma_defs.h b/include/rdma/tid_rdma_defs.h index 1c431ea32b52..08fe47c7ad2c 100644 --- a/include/rdma/tid_rdma_defs.h +++ b/include/rdma/tid_rdma_defs.h @@ -27,16 +27,71 @@ struct tid_rdma_read_resp { __be32 verbs_qp; }; +struct tid_rdma_write_req { + __le32 kdeth0; + __le32 kdeth1; + struct ib_reth reth; + __be32 reserved[2]; + __be32 verbs_qp; +}; + +struct tid_rdma_write_resp { + __le32 kdeth0; + __le32 kdeth1; + __be32 aeth; + __be32 reserved[3]; + __be32 tid_flow_psn; + __be32 tid_flow_qp; + __be32 verbs_qp; +}; + +struct tid_rdma_write_data { + __le32 kdeth0; + __le32 kdeth1; + __be32 reserved[6]; + __be32 verbs_qp; +}; + +struct tid_rdma_resync { + __le32 kdeth0; + __le32 kdeth1; + __be32 reserved[6]; + __be32 verbs_qp; +}; + +struct tid_rdma_ack { + __le32 kdeth0; + __le32 kdeth1; + __be32 aeth; + __be32 reserved[2]; + __be32 tid_flow_psn; + __be32 verbs_psn; + __be32 tid_flow_qp; + __be32 verbs_qp; +}; + /* * TID RDMA Opcodes */ #define IB_OPCODE_TID_RDMA 0xe0 enum { + IB_OPCODE_WRITE_REQ = 0x0, + IB_OPCODE_WRITE_RESP = 0x1, + IB_OPCODE_WRITE_DATA = 0x2, + IB_OPCODE_WRITE_DATA_LAST = 0x3, IB_OPCODE_READ_REQ = 0x4, 
IB_OPCODE_READ_RESP = 0x5, + IB_OPCODE_RESYNC = 0x6, + IB_OPCODE_ACK = 0x7, + IB_OPCODE(TID_RDMA, WRITE_REQ), + IB_OPCODE(TID_RDMA, WRITE_RESP), + IB_OPCODE(TID_RDMA, WRITE_DATA), + IB_OPCODE(TID_RDMA, WRITE_DATA_LAST), IB_OPCODE(TID_RDMA, READ_REQ), IB_OPCODE(TID_RDMA, READ_RESP), + IB_OPCODE(TID_RDMA, RESYNC), + IB_OPCODE(TID_RDMA, ACK), }; #define TID_OP(x) IB_OPCODE_TID_RDMA_##x @@ -47,6 +102,7 @@ enum { * low level drivers. Two of those are used but renamed * to be more descriptive. */ +#define IB_WR_TID_RDMA_WRITE IB_WR_RESERVED1 #define IB_WR_TID_RDMA_READ IB_WR_RESERVED2 #endif /* TID_RDMA_DEFS_H */ -- cgit v1.2.3 From 4f9264d156dc6c154a8a6cfae780730bad45c6f8 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:48:48 -0800 Subject: IB/hfi1: Add an s_acked_ack_queue pointer The s_ack_queue is managed by two pointers into the ring: r_head_ack_queue and s_tail_ack_queue. r_head_ack_queue is the index of where the next received request is going to be placed and s_tail_ack_queue is the entry of the request currently being processed. This works perfectly fine for normal Verbs as the requests are processed one at a time and the s_tail_ack_queue is not moved until the request that it points to is fully completed. In this fashion, s_tail_ack_queue constantly chases r_head_ack_queue and the two pointers can easily be used to determine "queue full" and "queue empty" conditions. The detection of these two conditions is important in determining when an old entry can safely be overwritten with a newly received request and the resources associated with the old request be safely released. When pipelined TID RDMA WRITE is introduced into this mix, things look very different. r_head_ack_queue is still the point at which a newly received request will be inserted, s_tail_ack_queue is still the currently processed request.
However, with pipelined TID RDMA WRITE requests, s_tail_ack_queue moves to the next request once all TID RDMA WRITE responses for that request have been sent. The rest of the protocol for a particular request is managed by other pointers specific to TID RDMA - r_tid_tail and r_tid_ack - which point to the entries for which the next TID RDMA DATA packets are going to arrive and the request for which the next TID RDMA ACK packets are to be generated, respectively. What this means is that entries in the ring, which are "behind" s_tail_ack_queue (entries which s_tail_ack_queue has gone past) are no longer considered complete. This is where the problem is - a newly received request could potentially overwrite a still active TID RDMA WRITE request. The reason why the TID RDMA pointers trail s_tail_ack_queue is that the normal Verbs send engine uses s_tail_ack_queue as the pointer for the next response. Since TID RDMA WRITE responses are processed by the normal Verbs send engine, s_tail_ack_queue had to be moved to the next entry once all TID RDMA WRITE response packets were sent to get the desired pipelining between requests. Doing otherwise would mean that the normal Verbs send engine would not be able to send the TID RDMA WRITE responses for the next TID RDMA request until the current one is fully completed. This patch introduces the s_acked_ack_queue index to point to the next request to complete on the responder side. For requests other than TID RDMA WRITE, s_acked_ack_queue should always be kept in sync with s_tail_ack_queue. For TID RDMA WRITE request, it may fall behind s_tail_ack_queue. 
Reviewed-by: Mike Marciniszyn Signed-off-by: Mitko Haralanov Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- include/rdma/rdmavt_qp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index d8d88d023092..4ee612ab6cb4 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -375,6 +375,7 @@ struct rvt_qp { u8 s_rnr_retry; /* requester RNR retry counter */ u8 s_num_rd_atomic; /* number of RDMA read/atomic pending */ u8 s_tail_ack_queue; /* index into s_ack_queue[] */ + u8 s_acked_ack_queue; /* index into s_ack_queue[] */ struct rvt_sge_state s_ack_rdma_sge; struct timer_list s_timer; -- cgit v1.2.3 From 3c6cb20a0d17d7a75778fb0935d6fa427c8177af Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 23 Jan 2019 21:51:39 -0800 Subject: IB/hfi1: Add TID RDMA WRITE functionality into RDMA verbs This patch integrates TID RDMA WRITE protocol into normal RDMA verbs framework. The TID RDMA WRITE protocol is an end-to-end protocol between the hfi1 drivers on two OPA nodes that converts a qualified RDMA WRITE request into a TID RDMA WRITE request to avoid data copying on the responder side. 
Reviewed-by: Mike Marciniszyn Signed-off-by: Mitko Haralanov Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- include/rdma/rdmavt_qp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 4ee612ab6cb4..f0fbd4063fef 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -246,6 +246,7 @@ struct rvt_ack_entry { #define RVT_OPERATION_ATOMIC_SGE 0x00000004 #define RVT_OPERATION_LOCAL 0x00000008 #define RVT_OPERATION_USE_RESERVE 0x00000010 +#define RVT_OPERATION_IGN_RNR_CNT 0x00000020 #define RVT_OPERATION_MAX (IB_WR_RESERVED10 + 1) -- cgit v1.2.3 From 70f8a3ca68d3e1f3344d959981ca55d5f6ec77f7 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 6 Feb 2019 09:59:15 -0800 Subject: mm: make mm->pinned_vm an atomic64 counter Taking a sleeping lock to _only_ increment a variable is quite the overkill, and pretty much all users do this. Furthermore, some drivers (ie: infiniband and scif) that need pinned semantics can go to quite some trouble to actually delay via workqueue (un)accounting for pinned pages when not possible to acquire it. By making the counter atomic we no longer need to hold the mmap_sem and can simplify some code around it for pinned_vm users. The counter is 64-bit such that we need not worry about overflows such as rdma user input controlled from userspace.
Reviewed-by: Ira Weiny Reviewed-by: Christoph Lameter Reviewed-by: Daniel Jordan Reviewed-by: Jan Kara Signed-off-by: Davidlohr Bueso Signed-off-by: Jason Gunthorpe --- include/linux/mm_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 2c471a2c43fa..acea2ea2d6c4 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -405,7 +405,7 @@ struct mm_struct { unsigned long total_vm; /* Total pages mapped */ unsigned long locked_vm; /* Pages that have PG_mlocked set */ - unsigned long pinned_vm; /* Refcount permanently increased */ + atomic64_t pinned_vm; /* Refcount permanently increased */ unsigned long data_vm; /* VM_WRITE & ~VM_SHARED & ~VM_STACK */ unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */ unsigned long stack_vm; /* VM_STACK */ -- cgit v1.2.3 From 95b86d1c91ad3b19f882d9e70aa37c8e99e8dc17 Mon Sep 17 00:00:00 2001 From: Devesh Sharma Date: Thu, 7 Feb 2019 01:31:27 -0500 Subject: RDMA/bnxt_re: Update kernel user abi to pass chip context User space verbs provider library would need chip context. Changing the ABI to add chip version details in structure. Furthermore, changing the kernel driver ucontext allocation code to initialize the abi structure with appropriate values. As suggested by community, appended the new fields at the bottom of the ABI structure and retaining the older fields as they were in the older versions. Keeping the ABI version at 1 and adding a new field in the ucontext response structure to hold the component mask. The user space library should check pre-defined flags to figure out if a certain feature is supported or not.
Signed-off-by: Devesh Sharma Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/bnxt_re-abi.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/uapi/rdma/bnxt_re-abi.h b/include/uapi/rdma/bnxt_re-abi.h index a7a6111e50c7..dc52e3cf574c 100644 --- a/include/uapi/rdma/bnxt_re-abi.h +++ b/include/uapi/rdma/bnxt_re-abi.h @@ -44,6 +44,14 @@ #define BNXT_RE_ABI_VERSION 1 +#define BNXT_RE_CHIP_ID0_CHIP_NUM_SFT 0x00 +#define BNXT_RE_CHIP_ID0_CHIP_REV_SFT 0x10 +#define BNXT_RE_CHIP_ID0_CHIP_MET_SFT 0x18 + +enum { + BNXT_RE_UCNTX_CMASK_HAVE_CCTX = 0x1ULL +}; + struct bnxt_re_uctx_resp { __u32 dev_id; __u32 max_qp; @@ -51,6 +59,9 @@ struct bnxt_re_uctx_resp { __u32 cqe_sz; __u32 max_cqd; __u32 rsvd; + __aligned_u64 comp_mask; + __u32 chip_id0; + __u32 chip_id1; }; /* -- cgit v1.2.3 From 2c1619edef61a03cb516efaa81750784c3071d10 Mon Sep 17 00:00:00 2001 From: Danit Goldberg Date: Thu, 24 Jan 2019 14:18:15 +0200 Subject: IB/cma: Define option to set ack timeout and pack tos_set Define new option in 'rdma_set_option' to override calculated QP timeout when requested to provide QP attributes to modify a QP. At the same time, pack tos_set to be bitfield. Signed-off-by: Danit Goldberg Reviewed-by: Moni Shoua Signed-off-by: Leon Romanovsky Reviewed-by: Parav Pandit Signed-off-by: Jason Gunthorpe --- include/rdma/rdma_cm.h | 1 + include/uapi/rdma/rdma_user_cm.h | 4 ++++ 2 files changed, 5 insertions(+) (limited to 'include') diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index 60987a5903b7..71f48cfdc24c 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -374,6 +374,7 @@ int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse); */ int rdma_set_afonly(struct rdma_cm_id *id, int afonly); +int rdma_set_ack_timeout(struct rdma_cm_id *id, u8 timeout); /** * rdma_get_service_id - Return the IB service ID for a specified address. * @id: Communication identifier associated with the address. 
diff --git a/include/uapi/rdma/rdma_user_cm.h b/include/uapi/rdma/rdma_user_cm.h index 0d1e78ebad05..e42940a215a3 100644 --- a/include/uapi/rdma/rdma_user_cm.h +++ b/include/uapi/rdma/rdma_user_cm.h @@ -300,6 +300,10 @@ enum { RDMA_OPTION_ID_TOS = 0, RDMA_OPTION_ID_REUSEADDR = 1, RDMA_OPTION_ID_AFONLY = 2, + RDMA_OPTION_ID_ACK_TIMEOUT = 3 +}; + +enum { RDMA_OPTION_IB_PATH = 1 }; -- cgit v1.2.3 From 926ba19b3574f6a80823a42484877ed65e91da9c Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Fri, 1 Feb 2019 12:44:32 -0800 Subject: RDMA/iwcm: add tos_set bool to iw_cm struct This allows drivers to know the tos was actively set by the application. Signed-off-by: Steve Wise Signed-off-by: Jason Gunthorpe --- include/rdma/iw_cm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/rdma/iw_cm.h b/include/rdma/iw_cm.h index 48512abd3162..0e1f02815643 100644 --- a/include/rdma/iw_cm.h +++ b/include/rdma/iw_cm.h @@ -94,7 +94,8 @@ struct iw_cm_id { void (*add_ref)(struct iw_cm_id *); void (*rem_ref)(struct iw_cm_id *); u8 tos; - bool mapped; + bool tos_set:1; + bool mapped:1; }; struct iw_cm_conn_param { -- cgit v1.2.3 From 805b754d492f6227e1646001bdf85ad4bb819e55 Mon Sep 17 00:00:00 2001 From: Daniel Jurgens Date: Sat, 2 Feb 2019 11:09:44 +0200 Subject: IB/core: Eliminate a hole in MAD agent struct Move the security related fields above the u8s to eliminate a hole in the struct. pahole before: struct ib_mad_agent { ... u32 hi_tid; /* 48 4 */ u32 flags; /* 52 4 */ u8 port_num; /* 56 1 */ u8 rmpp_version; /* 57 1 */ /* XXX 6 bytes hole, try to pack */ /* --- cacheline 1 boundary (64 bytes) --- */ void * security; /* 64 8 */ bool smp_allowed; /* 72 1 */ bool lsm_nb_reg; /* 73 1 */ /* XXX 6 bytes hole, try to pack */ struct notifier_block lsm_nb; /* 80 24 */ /* XXX last struct has 4 bytes of padding */ /* size: 104, cachelines: 2, members: 14 */ ... }; pahole after: struct ib_mad_agent { ... 
u32 hi_tid; /* 48 4 */ u32 flags; /* 52 4 */ void * security; /* 56 8 */ /* --- cacheline 1 boundary (64 bytes) --- */ struct notifier_block lsm_nb; /* 64 24 */ /* XXX last struct has 4 bytes of padding */ u8 port_num; /* 88 1 */ u8 rmpp_version; /* 89 1 */ bool smp_allowed; /* 90 1 */ bool lsm_nb_reg; /* 91 1 */ /* size: 96, cachelines: 2, members: 14 */ ... }; Signed-off-by: Daniel Jurgens Reviewed-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/rdma/ib_mad.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h index fdef558e3a2d..1c0b914f199d 100644 --- a/include/rdma/ib_mad.h +++ b/include/rdma/ib_mad.h @@ -616,12 +616,12 @@ struct ib_mad_agent { void *context; u32 hi_tid; u32 flags; + void *security; + struct notifier_block lsm_nb; u8 port_num; u8 rmpp_version; - void *security; bool smp_allowed; bool lsm_nb_reg; - struct notifier_block lsm_nb; }; /** -- cgit v1.2.3 From c66f67414c1f88554485bb2a0abf8b5c0d741de7 Mon Sep 17 00:00:00 2001 From: Daniel Jurgens Date: Sat, 2 Feb 2019 11:09:45 +0200 Subject: IB/core: Don't register each MAD agent for LSM notifier When creating many MAD agents in a short period of time, receive packet processing can be delayed long enough to cause timeouts while new agents are being added to the atomic notifier chain with IRQs disabled. Notifier chain registration and unregistration is an O(n) operation. With large numbers of MAD agents being created and destroyed simultaneously the CPUs spend too much time with interrupts disabled. Instead of each MAD agent registering for its own LSM notification, maintain a list of agents internally and register once; this registration already existed for handling the PKeys. This list is write mostly, so a normal spin lock is used vs a read/write lock. All MAD agents must be checked, so a single list is used instead of breaking them down per device.
Notifier calls are done under rcu_read_lock, so there isn't a risk of similar packet timeouts while checking the MAD agents security settings when notified. Signed-off-by: Daniel Jurgens Reviewed-by: Parav Pandit Signed-off-by: Leon Romanovsky Acked-by: Paul Moore Signed-off-by: Jason Gunthorpe --- include/rdma/ib_mad.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h index 1c0b914f199d..79ba8219e7dc 100644 --- a/include/rdma/ib_mad.h +++ b/include/rdma/ib_mad.h @@ -617,11 +617,10 @@ struct ib_mad_agent { u32 hi_tid; u32 flags; void *security; - struct notifier_block lsm_nb; + struct list_head mad_agent_sec_list; u8 port_num; u8 rmpp_version; bool smp_allowed; - bool lsm_nb_reg; }; /** -- cgit v1.2.3 From 30471d4b20335d9bd9ae9b2382a1e1e97d18d86d Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 3 Feb 2019 14:55:50 +0200 Subject: RDMA/core: Share driver structure size with core Add new macros to be used in drivers while registering ops structure and IB/core while calling allocation routines, so drivers won't need to perform kzalloc/kfree in their paths. The change in allocation stage allows us to initialize common fields prior to calling to drivers (e.g. restrack). 
Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 2e1f1e885ee5..e29eae4aec84 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2264,6 +2264,19 @@ struct ib_counters_read_attr { struct uverbs_attr_bundle; +#define INIT_RDMA_OBJ_SIZE(ib_struct, drv_struct, member) \ + .size_##ib_struct = \ + (sizeof(struct drv_struct) + \ + BUILD_BUG_ON_ZERO(offsetof(struct drv_struct, member)) + \ + BUILD_BUG_ON_ZERO( \ + !__same_type(((struct drv_struct *)NULL)->member, \ + struct ib_struct))) + +#define rdma_zalloc_drv_obj(ib_dev, ib_type) \ + ((struct ib_type *)kzalloc(ib_dev->ops.size_##ib_type, GFP_KERNEL)) + +#define DECLARE_RDMA_OBJ_SIZE(ib_struct) size_t size_##ib_struct + /** * struct ib_device_ops - InfiniBand device operations * This structure defines all the InfiniBand device operations, providers will -- cgit v1.2.3 From 21a428a019c9a6d133e745b529b9bf18c1187e70 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 3 Feb 2019 14:55:51 +0200 Subject: RDMA: Handle PD allocations by IB/core The PD allocations in IB/core allow us to simplify drivers and their error flows in their .alloc_pd() paths. The changes in .alloc_pd() go hand in hand with the relevant update in .dealloc_pd(). We will use this opportunity and convert .dealloc_pd() so that it cannot fail, as was suggested a long time ago; failures are not happening, as we have never seen a WARN_ON print.
Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index e29eae4aec84..854d7816787c 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2385,10 +2385,9 @@ struct ib_device_ops { int (*dealloc_ucontext)(struct ib_ucontext *context); int (*mmap)(struct ib_ucontext *context, struct vm_area_struct *vma); void (*disassociate_ucontext)(struct ib_ucontext *ibcontext); - struct ib_pd *(*alloc_pd)(struct ib_device *device, - struct ib_ucontext *context, - struct ib_udata *udata); - int (*dealloc_pd)(struct ib_pd *pd); + int (*alloc_pd)(struct ib_pd *pd, struct ib_ucontext *context, + struct ib_udata *udata); + void (*dealloc_pd)(struct ib_pd *pd); struct ib_ah *(*create_ah)(struct ib_pd *pd, struct rdma_ah_attr *ah_attr, u32 flags, struct ib_udata *udata); @@ -2530,6 +2529,8 @@ struct ib_device_ops { */ int (*fill_res_entry)(struct sk_buff *msg, struct rdma_restrack_entry *entry); + + DECLARE_RDMA_OBJ_SIZE(ib_pd); }; struct ib_device { -- cgit v1.2.3 From 652432f33c01b2edaa5b2550b423cd894b1c7b9a Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 6 Feb 2019 22:41:50 -0700 Subject: RDMA/device: Get rid of reg_state This really has no purpose anymore, refcount can be used to tell if the device is still registered. Keeping it around just invites mis-use. 
Signed-off-by: Jason Gunthorpe Reviewed-by: Parav Pandit --- include/rdma/ib_verbs.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 854d7816787c..d8ba987e8b29 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2572,12 +2572,6 @@ struct ib_device { struct kobject *ports_kobj; struct list_head port_list; - enum { - IB_DEV_UNINITIALIZED, - IB_DEV_REGISTERED, - IB_DEV_UNREGISTERED - } reg_state; - int uverbs_abi_ver; u64 uverbs_cmd_mask; u64 uverbs_ex_cmd_mask; -- cgit v1.2.3 From e59178d895afa29b671323f8265a1e50afe989e5 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 6 Feb 2019 22:41:52 -0700 Subject: RDMA/devices: Use xarray to store the clients This gives each client a unique ID and will let us move client_data to use xarray, and revise the locking scheme. clients have to be add/removed in strict FIFO/LIFO order as they interdepend. To support this the client_ids are assigned to increase in FIFO order. The existing linked list is kept to support reverse iteration until xarray can get a reverse iteration API. 
Signed-off-by: Jason Gunthorpe Reviewed-by: Parav Pandit --- include/rdma/ib_verbs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index d8ba987e8b29..cc15820513cd 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2610,7 +2610,7 @@ struct ib_device { }; struct ib_client { - char *name; + const char *name; void (*add) (struct ib_device *); void (*remove)(struct ib_device *, void *client_data); @@ -2637,6 +2637,7 @@ struct ib_client { const struct sockaddr *addr, void *client_data); struct list_head list; + u32 client_id; /* kverbs are not required by the client */ u8 no_kverbs_req:1; -- cgit v1.2.3 From 0df91bb67334eebaf73d4ba32567e16d55f4f116 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 6 Feb 2019 22:41:53 -0700 Subject: RDMA/devices: Use xarray to store the client_data Now that we have a small ID for each client we can use xarray instead of linearly searching linked lists for client data. This will give much faster and more scalable client data lookup, and will let us revise the locking scheme. Since xarray can store 'going_down' using a mark just entirely eliminate the struct ib_client_data and directly store the client_data value in the xarray. However this does require a special iterator as we must still iterate over any NULL client_data values. Also eliminate the client_data_lock in favour of internal xarray locking.
Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index cc15820513cd..8558f31ca46f 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2542,12 +2542,7 @@ struct ib_device { struct list_head event_handler_list; spinlock_t event_handler_lock; - rwlock_t client_data_lock; - struct list_head core_list; - /* Access to the client_data_list is protected by the client_data_lock - * rwlock and the lists_rwsem read-write semaphore - */ - struct list_head client_data_list; + struct xarray client_data; struct ib_cache cache; /** @@ -2660,7 +2655,21 @@ void ib_unregister_device(struct ib_device *device); int ib_register_client (struct ib_client *client); void ib_unregister_client(struct ib_client *client); -void *ib_get_client_data(struct ib_device *device, struct ib_client *client); +/** + * ib_get_client_data - Get IB client context + * @device:Device to get context for + * @client:Client to get context for + * + * ib_get_client_data() returns the client context data set with + * ib_set_client_data(). This can only be called while the client is + * registered to the device, once the ib_client remove() callback returns this + * cannot be called. + */ +static inline void *ib_get_client_data(struct ib_device *device, + struct ib_client *client) +{ + return xa_load(&device->client_data, client->client_id); +} void ib_set_client_data(struct ib_device *device, struct ib_client *client, void *data); void ib_set_device_ops(struct ib_device *device, -- cgit v1.2.3 From 921eab1143aadf976a42cac4605b4d35159b355d Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 6 Feb 2019 22:41:54 -0700 Subject: RDMA/devices: Re-organize device.c locking The locking here started out with a single lock that covered everything and then has lately veered into crazy town. 
The fundamental problem is that several places need to iterate over a linked list, but also need to drop their locks to avoid deadlock during client callbacks. xarray's restartable iteration offers a simple solution to the problem. Once all the lists are xarrays we can drop locks in the places that need that and rely on xarray to provide consistency and locking for the data structure. The resulting simplification is that each of the three lists has a dedicated rwsem that must be held when working with the list it covers. One data structure is no longer covered by multiple locks. The sleeping semaphore is selected because the read side generally needs to be held over something sleeping, and using RCU reader locking in those cases is overkill. In the process this simplifies the entire registration/unregistration flow to be the expected list of setups and the reversed list of matching teardowns, and the registration lock 'refcount' can now be revised to be released after the ULPs are removed, providing a very sane semantic for this feature. Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 8558f31ca46f..135fab2c016c 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2542,6 +2542,7 @@ struct ib_device { struct list_head event_handler_list; spinlock_t event_handler_lock; + struct rw_semaphore client_data_rwsem; struct xarray client_data; struct ib_cache cache; -- cgit v1.2.3 From d901b2760dc6cd5fbbf2eac31d71d94baa6c4aef Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 4 Jan 2019 11:40:21 -0700 Subject: lib/scatterlist: Provide a DMA page iterator Commit 2db76d7c3c6d ("lib/scatterlist: sg_page_iter: support sg lists w/o backing pages") introduced the sg_page_iter_dma_address() function without providing a way to use it in the general case. 
If the sg_dma_len() is not equal to the sg length callers cannot safely use the for_each_sg_page/sg_page_iter_dma_address combination. Resolve this API mistake by providing a DMA specific iterator, for_each_sg_dma_page(), that uses the right length so sg_page_iter_dma_address() works as expected with all sglists. A new iterator type is introduced to provide compile-time safety against wrongly mixing accessors and iterators. Acked-by: Christoph Hellwig (for scatterlist) Acked-by: Thomas Hellstrom Acked-by: Sakari Ailus (ipu3-cio2) Signed-off-by: Jason Gunthorpe --- include/linux/scatterlist.h | 49 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 40 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index b96f0d0b5b8f..b4be960c7e5d 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -339,12 +339,12 @@ int sg_alloc_table_chained(struct sg_table *table, int nents, /* * sg page iterator * - * Iterates over sg entries page-by-page. On each successful iteration, - * you can call sg_page_iter_page(@piter) and sg_page_iter_dma_address(@piter) - * to get the current page and its dma address. @piter->sg will point to the - * sg holding this page and @piter->sg_pgoffset to the page's page offset - * within the sg. The iteration will stop either when a maximum number of sg - * entries was reached or a terminating sg (sg_last(sg) == true) was reached. + * Iterates over sg entries page-by-page. On each successful iteration, you + * can call sg_page_iter_page(@piter) to get the current page and its dma + * address. @piter->sg will point to the sg holding this page and + * @piter->sg_pgoffset to the page's page offset within the sg. The iteration + * will stop either when a maximum number of sg entries was reached or a + * terminating sg (sg_last(sg) == true) was reached. 
*/ struct sg_page_iter { struct scatterlist *sg; /* sg holding the page */ @@ -356,7 +356,19 @@ struct sg_page_iter { * next step */ }; +/* + * sg page iterator for DMA addresses + * + * This is the same as sg_page_iter however you can call + * sg_page_iter_dma_address(@dma_iter) to get the page's DMA + * address. sg_page_iter_page() cannot be called on this iterator. + */ +struct sg_dma_page_iter { + struct sg_page_iter base; +}; + bool __sg_page_iter_next(struct sg_page_iter *piter); +bool __sg_page_iter_dma_next(struct sg_dma_page_iter *dma_iter); void __sg_page_iter_start(struct sg_page_iter *piter, struct scatterlist *sglist, unsigned int nents, unsigned long pgoffset); @@ -372,11 +384,13 @@ static inline struct page *sg_page_iter_page(struct sg_page_iter *piter) /** * sg_page_iter_dma_address - get the dma address of the current page held by * the page iterator. - * @piter: page iterator holding the page + * @dma_iter: page iterator holding the page */ -static inline dma_addr_t sg_page_iter_dma_address(struct sg_page_iter *piter) +static inline dma_addr_t +sg_page_iter_dma_address(struct sg_dma_page_iter *dma_iter) { - return sg_dma_address(piter->sg) + (piter->sg_pgoffset << PAGE_SHIFT); + return sg_dma_address(dma_iter->base.sg) + + (dma_iter->base.sg_pgoffset << PAGE_SHIFT); } /** @@ -385,11 +399,28 @@ static inline dma_addr_t sg_page_iter_dma_address(struct sg_page_iter *piter) * @piter: page iterator to hold current page, sg, sg_pgoffset * @nents: maximum number of sg entries to iterate over * @pgoffset: starting page offset + * + * Callers may use sg_page_iter_page() to get each page pointer. 
*/ #define for_each_sg_page(sglist, piter, nents, pgoffset) \ for (__sg_page_iter_start((piter), (sglist), (nents), (pgoffset)); \ __sg_page_iter_next(piter);) +/** + * for_each_sg_dma_page - iterate over the pages of the given sg list + * @sglist: sglist to iterate over + * @dma_iter: page iterator to hold current page + * @dma_nents: maximum number of sg entries to iterate over, this is the value + * returned from dma_map_sg + * @pgoffset: starting page offset + * + * Callers may use sg_page_iter_dma_address() to get each page's DMA address. + */ +#define for_each_sg_dma_page(sglist, dma_iter, dma_nents, pgoffset) \ + for (__sg_page_iter_start(&(dma_iter)->base, sglist, dma_nents, \ + pgoffset); \ + __sg_page_iter_dma_next(dma_iter);) + /* * Mapping sg iterator * -- cgit v1.2.3 From 3d9dfd060391928bd615db62ecddea5e1255edfd Mon Sep 17 00:00:00 2001 From: Shamir Rabinovitch Date: Thu, 7 Feb 2019 18:44:47 +0200 Subject: IB/uverbs: Add ib_ucontext to uverbs_attr_bundle sent from ioctl and cmd flows Add ib_ucontext to the uverbs_attr_bundle sent down the ioctl and cmd flows as soon as the flow has ib_uobject. In addition, remove rdma_get_ucontext helper function that is only used by ib_umem_get.
Signed-off-by: Shamir Rabinovitch Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 2 -- include/rdma/uverbs_ioctl.h | 1 + include/rdma/uverbs_std_types.h | 18 +++++++++++++----- 3 files changed, 14 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 135fab2c016c..64ee7c08be22 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -4247,8 +4247,6 @@ void rdma_roce_rescan_device(struct ib_device *ibdev); struct ib_ucontext *ib_uverbs_get_ucontext_file(struct ib_uverbs_file *ufile); -struct ib_ucontext *rdma_get_ucontext(struct ib_udata *udata); - int uverbs_destroy_def_handler(struct uverbs_attr_bundle *attrs); struct net_device *rdma_alloc_netdev(struct ib_device *device, u8 port_num, diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h index 27da906beea7..b14a9ee786e9 100644 --- a/include/rdma/uverbs_ioctl.h +++ b/include/rdma/uverbs_ioctl.h @@ -652,6 +652,7 @@ struct uverbs_attr_bundle { struct ib_udata driver_udata; struct ib_udata ucore; struct ib_uverbs_file *ufile; + struct ib_ucontext *context; DECLARE_BITMAP(attr_present, UVERBS_API_ATTR_BKEY_LEN); struct uverbs_attr attrs[]; }; diff --git a/include/rdma/uverbs_std_types.h b/include/rdma/uverbs_std_types.h index 883abcf6d36e..794c47565971 100644 --- a/include/rdma/uverbs_std_types.h +++ b/include/rdma/uverbs_std_types.h @@ -48,9 +48,12 @@ #define uobj_get_type(_attrs, _object) \ uapi_get_object((_attrs)->ufile->device->uapi, _object) +struct ib_uobject *_uobj_get_read(enum uverbs_default_objects type, + u32 object_id, + struct uverbs_attr_bundle *attrs); + #define uobj_get_read(_type, _id, _attrs) \ - rdma_lookup_get_uobject(uobj_get_type(_attrs, _type), (_attrs)->ufile, \ - _uobj_check_id(_id), UVERBS_LOOKUP_READ) + _uobj_get_read(_type, _uobj_check_id(_id), _attrs) #define ufd_get_read(_type, _fdnum, _attrs) \ rdma_lookup_get_uobject(uobj_get_type(_attrs, _type), (_attrs)->ufile, \ @@ -67,9 
+70,12 @@ static inline void *_uobj_get_obj_read(struct ib_uobject *uobj) ((struct ib_##_object *)_uobj_get_obj_read( \ uobj_get_read(_type, _id, _attrs))) +struct ib_uobject *_uobj_get_write(enum uverbs_default_objects type, + u32 object_id, + struct uverbs_attr_bundle *attrs); + #define uobj_get_write(_type, _id, _attrs) \ - rdma_lookup_get_uobject(uobj_get_type(_attrs, _type), (_attrs)->ufile, \ - _uobj_check_id(_id), UVERBS_LOOKUP_WRITE) + _uobj_get_write(_type, _uobj_check_id(_id), _attrs) int __uobj_perform_destroy(const struct uverbs_api_object *obj, u32 id, const struct uverbs_attr_bundle *attrs); @@ -123,8 +129,10 @@ __uobj_alloc(const struct uverbs_api_object *obj, { struct ib_uobject *uobj = rdma_alloc_begin_uobject(obj, attrs->ufile); - if (!IS_ERR(uobj)) + if (!IS_ERR(uobj)) { *ib_dev = uobj->context->device; + attrs->context = uobj->context; + } return uobj; } -- cgit v1.2.3 From 730623f4a56fa42d4559715ff2f4a5c32b3ae8bf Mon Sep 17 00:00:00 2001 From: Shamir Rabinovitch Date: Thu, 7 Feb 2019 18:44:48 +0200 Subject: IB/verbs: Add helper function rdma_udata_to_drv_context Helper function to get driver's context out of ib_udata wrapped in uverbs_attr_bundle for user objects or NULL for kernel objects. Signed-off-by: Shamir Rabinovitch Signed-off-by: Jason Gunthorpe --- include/rdma/uverbs_ioctl.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include') diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h index b14a9ee786e9..28570ac2b6a0 100644 --- a/include/rdma/uverbs_ioctl.h +++ b/include/rdma/uverbs_ioctl.h @@ -664,6 +664,23 @@ static inline bool uverbs_attr_is_valid(const struct uverbs_attr_bundle *attrs_b attrs_bundle->attr_present); } +/** + * rdma_udata_to_drv_context - Helper macro to get the driver's context out of + * ib_udata which is embedded in uverbs_attr_bundle. + * + * If udata is not NULL this cannot fail. Otherwise a NULL udata will result + * in a NULL ucontext pointer, as a safety precaution. 
Callers should be using + * 'udata' to determine if the driver call is in user or kernel mode, not + * 'ucontext'. + * + */ +#define rdma_udata_to_drv_context(udata, drv_dev_struct, member) \ + (udata ? container_of(container_of(udata, struct uverbs_attr_bundle, \ + driver_udata) \ + ->context, \ + drv_dev_struct, member) : \ + (drv_dev_struct *)NULL) + #define IS_UVERBS_COPY_ERR(_ret) ((_ret) && (_ret) != -ENOENT) static inline const struct uverbs_attr *uverbs_attr_get(const struct uverbs_attr_bundle *attrs_bundle, -- cgit v1.2.3 From 89944450547334aa6655e0cd4aec8df1897a205a Mon Sep 17 00:00:00 2001 From: Shamir Rabinovitch Date: Thu, 7 Feb 2019 18:44:49 +0200 Subject: IB/{hw,sw}: Remove 'uobject->context' dependency in object creation APIs Now that we have the udata passed to all the ib_xxx object creation APIs and the additional macro 'rdma_udata_to_drv_context' to get the ib_ucontext from ib_udata stored in uverbs_attr_bundle, we can finally start to remove the dependency of the drivers on the ib_xxx->uobject->context. Signed-off-by: Shamir Rabinovitch Signed-off-by: Jason Gunthorpe --- include/rdma/rdma_vt.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index 87d66c9630d7..4c257aff7d32 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -186,6 +186,11 @@ struct rvt_driver_params { u8 reserved_operations; }; +/* User context */ +struct rvt_ucontext { + struct ib_ucontext ibucontext; +}; + /* Protection domain */ struct rvt_pd { struct ib_pd ibpd; -- cgit v1.2.3 From fd47c2f99f04249d1ba82c422d1818dcbe193908 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 18 Feb 2019 22:25:43 +0200 Subject: RDMA/restrack: Convert internal DB from hash to XArray The addition of .doit callbacks poses a new access pattern to the resource entries by some user visible index.
Back then, the legacy DB was implemented as hash because per-index access wasn't needed and XArray wasn't accepted yet. Acceptance of XArray together with per-index access requires the refresh of DB implementation. Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/rdma/restrack.h | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h index cc66cc7a11d3..16e11b4c3ec3 100644 --- a/include/rdma/restrack.h +++ b/include/rdma/restrack.h @@ -13,6 +13,7 @@ #include #include #include +#include /** * enum rdma_restrack_type - HW objects to track @@ -48,7 +49,6 @@ enum rdma_restrack_type { RDMA_RESTRACK_MAX }; -#define RDMA_RESTRACK_HASH_BITS 8 struct ib_device; struct rdma_restrack_entry; @@ -62,9 +62,17 @@ struct rdma_restrack_root { */ struct rw_semaphore rwsem; /** - * @hash: global database for all resources per-device + * @xa: Array of XArray structures to hold restrack entries. + * We want to use array of XArrays because insertion is type + * dependent. For types with xisiting unique ID (like QPN), + * we will insert to that unique index. For other types, + * we insert based on pointers and auto-allocate unique index. */ - DECLARE_HASHTABLE(hash, RDMA_RESTRACK_HASH_BITS); + struct xarray xa[RDMA_RESTRACK_MAX]; + /** + * @next_id: Next ID to support cyclic allocation + */ + u32 next_id[RDMA_RESTRACK_MAX]; }; /** @@ -102,10 +110,6 @@ struct rdma_restrack_entry { * @kern_name: name of owner for the kernel created entities. 
*/ const char *kern_name; - /** - * @node: hash table entry - */ - struct hlist_node node; /** * @type: various objects in restrack database */ @@ -114,6 +118,10 @@ struct rdma_restrack_entry { * @user: user resource */ bool user; + /** + * @id: ID to expose to users + */ + u32 id; }; void rdma_restrack_init(struct ib_device *dev); -- cgit v1.2.3 From 18c4c66f76d99df89ad682ba25bafb9227e8ec30 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 18 Feb 2019 22:25:44 +0200 Subject: RDMA/restrack: Translate from ID to restrack object Add new general helper to get restrack entry given by ID and their respective type. Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/rdma/restrack.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h index 16e11b4c3ec3..44ce32cc0b51 100644 --- a/include/rdma/restrack.h +++ b/include/rdma/restrack.h @@ -179,4 +179,7 @@ int rdma_nl_put_driver_u32_hex(struct sk_buff *msg, const char *name, int rdma_nl_put_driver_u64(struct sk_buff *msg, const char *name, u64 value); int rdma_nl_put_driver_u64_hex(struct sk_buff *msg, const char *name, u64 value); +struct rdma_restrack_entry *rdma_restrack_get_byid(struct ib_device *dev, + enum rdma_restrack_type type, + u32 id); #endif /* _RDMA_RESTRACK_H_ */ -- cgit v1.2.3 From 48118527186fb255461ebf3685ab0f1c2680bd9c Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 18 Feb 2019 22:25:46 +0200 Subject: RDMA/restrack: Reduce scope of synchronization lock while updating DB XArray uses internal lock for updates to XArray. This means that our external RW lock is needed to ensure that entry is not deleted while we are performing iteration over list. 
Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/rdma/restrack.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h index 44ce32cc0b51..53e1a7fb7355 100644 --- a/include/rdma/restrack.h +++ b/include/rdma/restrack.h @@ -58,7 +58,8 @@ struct rdma_restrack_entry; */ struct rdma_restrack_root { /* - * @rwsem: Read/write lock to protect lists + * @rwsem: Read/write lock to protect erase of entry. + * Lists and insertions are protected by XArray internal lock. */ struct rw_semaphore rwsem; /** -- cgit v1.2.3 From 41eda65c6100930d95bb854a0114f3544593070c Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 18 Feb 2019 22:25:47 +0200 Subject: RDMA/restrack: Hide restrack DB from IB/core There is no need to expose internals of restrack DB to IB/core. Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 7 +++---- include/rdma/restrack.h | 28 ---------------------------- 2 files changed, 3 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 64ee7c08be22..2a17c2b30073 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2533,6 +2533,8 @@ struct ib_device_ops { DECLARE_RDMA_OBJ_SIZE(ib_pd); }; +struct rdma_restrack_root; + struct ib_device { /* Do not access @dma_device directly from ULP nor from HW drivers. 
*/ struct device *dma_device; @@ -2589,10 +2591,7 @@ struct ib_device { #endif u32 index; - /* - * Implementation details of the RDMA core, don't use in drivers - */ - struct rdma_restrack_root res; + struct rdma_restrack_root *res; const struct uapi_definition *driver_def; enum rdma_driver_id driver_id; diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h index 53e1a7fb7355..ecf3c7702a4f 100644 --- a/include/rdma/restrack.h +++ b/include/rdma/restrack.h @@ -7,7 +7,6 @@ #define _RDMA_RESTRACK_H_ #include -#include #include #include #include @@ -50,31 +49,6 @@ enum rdma_restrack_type { }; struct ib_device; -struct rdma_restrack_entry; - -/** - * struct rdma_restrack_root - main resource tracking management - * entity, per-device - */ -struct rdma_restrack_root { - /* - * @rwsem: Read/write lock to protect erase of entry. - * Lists and insertions are protected by XArray internal lock. - */ - struct rw_semaphore rwsem; - /** - * @xa: Array of XArray structures to hold restrack entries. - * We want to use array of XArrays because insertion is type - * dependent. For types with xisiting unique ID (like QPN), - * we will insert to that unique index. For other types, - * we insert based on pointers and auto-allocate unique index. 
- */ - struct xarray xa[RDMA_RESTRACK_MAX]; - /** - * @next_id: Next ID to support cyclic allocation - */ - u32 next_id[RDMA_RESTRACK_MAX]; -}; /** * struct rdma_restrack_entry - metadata per-entry @@ -125,8 +99,6 @@ struct rdma_restrack_entry { u32 id; }; -void rdma_restrack_init(struct ib_device *dev); -void rdma_restrack_clean(struct ib_device *dev); int rdma_restrack_count(struct ib_device *dev, enum rdma_restrack_type type, struct pid_namespace *ns); -- cgit v1.2.3 From 517b773e0f612d608cbc62a08c55601bd56f73f6 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 18 Feb 2019 22:25:49 +0200 Subject: RDMA/nldev: Share with user-space object IDs Give user-space tools a unique identifier for PD, MR, CQ and CM_ID objects, so that they can query them with .doit callbacks. QP .doit is not supported yet; it will be once all drivers are updated so that their LQPN equals their restrack ID. Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/rdma_netlink.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 3a9e681e4257..43362132e0d7 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -456,6 +456,15 @@ enum rdma_nldev_attr { RDMA_NLDEV_ATTR_DRIVER_S64, /* s64 */ RDMA_NLDEV_ATTR_DRIVER_U64, /* u64 */ + /* + * Indexes to get/set secific entry, + * for QP use RDMA_NLDEV_ATTR_RES_LQPN + */ + RDMA_NLDEV_ATTR_RES_PDN, /* u32 */ + RDMA_NLDEV_ATTR_RES_CQN, /* u32 */ + RDMA_NLDEV_ATTR_RES_MRN, /* u32 */ + RDMA_NLDEV_ATTR_RES_CM_IDN, /* u32 */ + /* * Always the end */ -- cgit v1.2.3 From c3d02788b45ab4a2d8f243b98c04b549c8193af6 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 18 Feb 2019 22:25:50 +0200 Subject: RDMA/nldev: Provide parent IDs for PD, MR and QP objects PD, MR and QP objects have parent objects: contexts and PDs.
The exposed parent IDs allow to correlate various objects and simplify debug investigation. Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/rdma_netlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 43362132e0d7..4ebbcfb2c6ef 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -464,6 +464,7 @@ enum rdma_nldev_attr { RDMA_NLDEV_ATTR_RES_CQN, /* u32 */ RDMA_NLDEV_ATTR_RES_MRN, /* u32 */ RDMA_NLDEV_ATTR_RES_CM_IDN, /* u32 */ + RDMA_NLDEV_ATTR_RES_CTXN, /* u32 */ /* * Always the end -- cgit v1.2.3 From ea1075edcbab7d92f4e4ccf5490043f796bf78be Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 12 Feb 2019 21:12:47 -0700 Subject: RDMA: Add and use rdma_for_each_port We have many loops iterating over all of the end port numbers on a struct ib_device, simplify them with a for_each helper. Reviewed-by: Parav Pandit Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 2a17c2b30073..fa0edd6ae33c 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2827,6 +2827,16 @@ static inline u8 rdma_start_port(const struct ib_device *device) return rdma_cap_ib_switch(device) ? 
0 : 1; } +/** + * rdma_for_each_port - Iterate over all valid port numbers of the IB device + * @device - The struct ib_device * to iterate over + * @iter - The unsigned int to store the port number + */ +#define rdma_for_each_port(device, iter) \ + for (iter = rdma_start_port(device + BUILD_BUG_ON_ZERO(!__same_type( \ + unsigned int, iter))); \ + iter <= rdma_end_port(device); (iter)++) + /** * rdma_end_port - Return the last valid port number for the device * specified -- cgit v1.2.3 From 8ceb1357b33790193e9d55d2d09bcfd6bd59dd6d Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 12 Feb 2019 21:12:48 -0700 Subject: RDMA/device: Consolidate ib_device per_port data into one place There is no reason to have three allocations of per-port data. Combine them together and make the lifetime for all the per-port data match the struct ib_device. Following patches will require more port-specific data, now there is a good place to put it. Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 74 ++++++++++++++++++++++++++++--------------------- 1 file changed, 43 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index fa0edd6ae33c..b42e257814f7 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2198,6 +2198,13 @@ struct ib_port_immutable { u32 max_mad_size; }; +struct ib_port_data { + struct ib_port_immutable immutable; + + spinlock_t pkey_list_lock; + struct list_head pkey_list; +}; + /* rdma netdev type - specifies protocol type */ enum rdma_netdev_t { RDMA_NETDEV_OPA_VNIC, @@ -2243,12 +2250,6 @@ struct rdma_netdev_alloc_params { struct net_device *netdev, void *param); }; -struct ib_port_pkey_list { - /* Lock to hold while modifying the list. 
*/ - spinlock_t list_lock; - struct list_head pkey_list; -}; - struct ib_counters { struct ib_device *device; struct ib_uobject *uobject; @@ -2549,14 +2550,12 @@ struct ib_device { struct ib_cache cache; /** - * port_immutable is indexed by port number + * port_data is indexed by port number */ - struct ib_port_immutable *port_immutable; + struct ib_port_data *port_data; int num_comp_vectors; - struct ib_port_pkey_list *port_pkey_list; - struct iw_cm_verbs *iwcm; struct module *owner; @@ -2860,34 +2859,38 @@ static inline int rdma_is_port_valid(const struct ib_device *device, static inline bool rdma_is_grh_required(const struct ib_device *device, u8 port_num) { - return device->port_immutable[port_num].core_cap_flags & - RDMA_CORE_PORT_IB_GRH_REQUIRED; + return device->port_data[port_num].immutable.core_cap_flags & + RDMA_CORE_PORT_IB_GRH_REQUIRED; } static inline bool rdma_protocol_ib(const struct ib_device *device, u8 port_num) { - return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_IB; + return device->port_data[port_num].immutable.core_cap_flags & + RDMA_CORE_CAP_PROT_IB; } static inline bool rdma_protocol_roce(const struct ib_device *device, u8 port_num) { - return device->port_immutable[port_num].core_cap_flags & - (RDMA_CORE_CAP_PROT_ROCE | RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP); + return device->port_data[port_num].immutable.core_cap_flags & + (RDMA_CORE_CAP_PROT_ROCE | RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP); } static inline bool rdma_protocol_roce_udp_encap(const struct ib_device *device, u8 port_num) { - return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP; + return device->port_data[port_num].immutable.core_cap_flags & + RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP; } static inline bool rdma_protocol_roce_eth_encap(const struct ib_device *device, u8 port_num) { - return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_ROCE; + return device->port_data[port_num].immutable.core_cap_flags & + 
RDMA_CORE_CAP_PROT_ROCE; } static inline bool rdma_protocol_iwarp(const struct ib_device *device, u8 port_num) { - return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_IWARP; + return device->port_data[port_num].immutable.core_cap_flags & + RDMA_CORE_CAP_PROT_IWARP; } static inline bool rdma_ib_or_roce(const struct ib_device *device, u8 port_num) @@ -2898,12 +2901,14 @@ static inline bool rdma_ib_or_roce(const struct ib_device *device, u8 port_num) static inline bool rdma_protocol_raw_packet(const struct ib_device *device, u8 port_num) { - return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_RAW_PACKET; + return device->port_data[port_num].immutable.core_cap_flags & + RDMA_CORE_CAP_PROT_RAW_PACKET; } static inline bool rdma_protocol_usnic(const struct ib_device *device, u8 port_num) { - return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_USNIC; + return device->port_data[port_num].immutable.core_cap_flags & + RDMA_CORE_CAP_PROT_USNIC; } /** @@ -2920,7 +2925,8 @@ static inline bool rdma_protocol_usnic(const struct ib_device *device, u8 port_n */ static inline bool rdma_cap_ib_mad(const struct ib_device *device, u8 port_num) { - return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IB_MAD; + return device->port_data[port_num].immutable.core_cap_flags & + RDMA_CORE_CAP_IB_MAD; } /** @@ -2944,8 +2950,8 @@ static inline bool rdma_cap_ib_mad(const struct ib_device *device, u8 port_num) */ static inline bool rdma_cap_opa_mad(struct ib_device *device, u8 port_num) { - return (device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_OPA_MAD) - == RDMA_CORE_CAP_OPA_MAD; + return (device->port_data[port_num].immutable.core_cap_flags & + RDMA_CORE_CAP_OPA_MAD) == RDMA_CORE_CAP_OPA_MAD; } /** @@ -2970,7 +2976,8 @@ static inline bool rdma_cap_opa_mad(struct ib_device *device, u8 port_num) */ static inline bool rdma_cap_ib_smi(const struct ib_device *device, u8 port_num) { - return 
device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IB_SMI; + return device->port_data[port_num].immutable.core_cap_flags & + RDMA_CORE_CAP_IB_SMI; } /** @@ -2990,7 +2997,8 @@ static inline bool rdma_cap_ib_smi(const struct ib_device *device, u8 port_num) */ static inline bool rdma_cap_ib_cm(const struct ib_device *device, u8 port_num) { - return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IB_CM; + return device->port_data[port_num].immutable.core_cap_flags & + RDMA_CORE_CAP_IB_CM; } /** @@ -3007,7 +3015,8 @@ static inline bool rdma_cap_ib_cm(const struct ib_device *device, u8 port_num) */ static inline bool rdma_cap_iw_cm(const struct ib_device *device, u8 port_num) { - return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IW_CM; + return device->port_data[port_num].immutable.core_cap_flags & + RDMA_CORE_CAP_IW_CM; } /** @@ -3027,7 +3036,8 @@ static inline bool rdma_cap_iw_cm(const struct ib_device *device, u8 port_num) */ static inline bool rdma_cap_ib_sa(const struct ib_device *device, u8 port_num) { - return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_IB_SA; + return device->port_data[port_num].immutable.core_cap_flags & + RDMA_CORE_CAP_IB_SA; } /** @@ -3067,7 +3077,8 @@ static inline bool rdma_cap_ib_mcast(const struct ib_device *device, u8 port_num */ static inline bool rdma_cap_af_ib(const struct ib_device *device, u8 port_num) { - return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_AF_IB; + return device->port_data[port_num].immutable.core_cap_flags & + RDMA_CORE_CAP_AF_IB; } /** @@ -3088,7 +3099,8 @@ static inline bool rdma_cap_af_ib(const struct ib_device *device, u8 port_num) */ static inline bool rdma_cap_eth_ah(const struct ib_device *device, u8 port_num) { - return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_ETH_AH; + return device->port_data[port_num].immutable.core_cap_flags & + RDMA_CORE_CAP_ETH_AH; } /** @@ -3102,7 +3114,7 @@ static inline 
bool rdma_cap_eth_ah(const struct ib_device *device, u8 port_num) */ static inline bool rdma_cap_opa_ah(struct ib_device *device, u8 port_num) { - return (device->port_immutable[port_num].core_cap_flags & + return (device->port_data[port_num].immutable.core_cap_flags & RDMA_CORE_CAP_OPA_AH) == RDMA_CORE_CAP_OPA_AH; } @@ -3120,7 +3132,7 @@ static inline bool rdma_cap_opa_ah(struct ib_device *device, u8 port_num) */ static inline size_t rdma_max_mad_size(const struct ib_device *device, u8 port_num) { - return device->port_immutable[port_num].max_mad_size; + return device->port_data[port_num].immutable.max_mad_size; } /** -- cgit v1.2.3 From 8faea9fd4a3914f12cd343e10810ec5f4215ddd6 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 12 Feb 2019 21:12:49 -0700 Subject: RDMA/cache: Move the cache per-port data into the main ib_port_data Like the other cases, there is no real reason to have another array just for the cache. This larger conversion gets its own patch. Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index b42e257814f7..50b7ebc2885e 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2186,7 +2186,6 @@ struct ib_port_cache { struct ib_cache { rwlock_t lock; struct ib_event_handler event_handler; - struct ib_port_cache *ports; }; struct iw_cm_verbs; @@ -2203,6 +2202,8 @@ struct ib_port_data { spinlock_t pkey_list_lock; struct list_head pkey_list; + + struct ib_port_cache cache; }; /* rdma netdev type - specifies protocol type */ -- cgit v1.2.3 From c2261dd76b549754c14c8ac7cadadd0993b182d6 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 12 Feb 2019 21:12:50 -0700 Subject: RDMA/device: Add ib_device_set_netdev() as an alternative to get_netdev The associated netdev should not actually be very dynamic, so for most drivers there is no reason for a callback like this.
Provide an API to inform the core code about the net dev affiliation and use a core maintained data structure instead. This allows the core code to be more aware of the ndev relationship which will allow some new APIs based around this. This also uses locking that makes some kind of sense, many drivers had a confusing RCU lock, or missing locking which isn't right. Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 50b7ebc2885e..7f81a313c01b 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2204,6 +2204,9 @@ struct ib_port_data { struct list_head pkey_list; struct ib_port_cache cache; + + spinlock_t netdev_lock; + struct net_device *netdev; }; /* rdma netdev type - specifies protocol type */ @@ -3996,6 +3999,10 @@ void ib_device_put(struct ib_device *device); struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u8 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr); +int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, + unsigned int port); +struct net_device *ib_device_netdev(struct ib_device *dev, u8 port); + struct ib_wq *ib_create_wq(struct ib_pd *pd, struct ib_wq_init_attr *init_attr); int ib_destroy_wq(struct ib_wq *wq); -- cgit v1.2.3 From 324e227ea7c952626abafe72db42ae0d70220a6e Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 12 Feb 2019 21:12:51 -0700 Subject: RDMA/device: Add ib_device_get_by_netdev() Several drivers need to find the ib_device from a given netdev. rxe needs this at speed in an unsleepable context, so choose to implement the translation using a RCU safe hash table. The hash table can have a many to one mapping. This is intended to support some future case where multiple IB drivers (ie iWarp and RoCE) connect to the same netdevs. driver_ids will need to be different to support this. 
In the process this makes the struct ib_device and ib_port_data RCU safe by deferring their kfrees. Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 7f81a313c01b..3aa802b65cf3 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2198,6 +2198,8 @@ struct ib_port_immutable { }; struct ib_port_data { + struct ib_device *ib_dev; + struct ib_port_immutable immutable; spinlock_t pkey_list_lock; @@ -2206,7 +2208,8 @@ struct ib_port_data { struct ib_port_cache cache; spinlock_t netdev_lock; - struct net_device *netdev; + struct net_device __rcu *netdev; + struct hlist_node ndev_hash_link; }; /* rdma netdev type - specifies protocol type */ @@ -2545,6 +2548,7 @@ struct ib_device { struct device *dma_device; struct ib_device_ops ops; char name[IB_DEVICE_NAME_MAX]; + struct rcu_head rcu_head; struct list_head event_handler_list; spinlock_t event_handler_lock; @@ -3996,6 +4000,10 @@ static inline bool ib_device_try_get(struct ib_device *dev) } void ib_device_put(struct ib_device *device); +struct ib_device *ib_device_get_by_netdev(struct net_device *ndev, + enum rdma_driver_id driver_id); +struct ib_device *ib_device_get_by_name(const char *name, + enum rdma_driver_id driver_id); struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u8 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr); -- cgit v1.2.3 From d0899892edd089790eb17943ecf28254a909deae Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 12 Feb 2019 21:12:53 -0700 Subject: RDMA/device: Provide APIs from the core code to help unregistration These APIs are intended to support drivers that exist outside the usual driver core probe()/remove() callbacks. 
Normally the driver core will prevent remove() from running concurrently with probe(); once this safety is lost drivers need more support to get the locking and lifetimes right. ib_unregister_driver() is intended to be used during module_exit of a driver using these APIs. It unregisters all the associated ib_devices. ib_unregister_device_and_put() is to be used by a driver-specific removal function (ie removal by name, removal from a netdev notifier, removal from netlink). ib_unregister_device_queued() is to be used from netdev notifier chains where RTNL is held. The locking is tricky here since once things become async it is possible to race unregister with registration. This is largely solved by relying on the registration refcount: unregistration will only ever work on something that has a positive registration refcount - and then an unregistration mutex serializes all competing unregistrations of the same device. Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 3aa802b65cf3..ad83f8c38dc8 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2538,6 +2538,12 @@ struct ib_device_ops { int (*fill_res_entry)(struct sk_buff *msg, struct rdma_restrack_entry *entry); + /* Device lifecycle callbacks */ + /* + * This is called as part of ib_dealloc_device().
+ */ + void (*dealloc_driver)(struct ib_device *dev); + DECLARE_RDMA_OBJ_SIZE(ib_pd); }; @@ -2555,6 +2561,7 @@ struct ib_device { struct rw_semaphore client_data_rwsem; struct xarray client_data; + struct mutex unregistration_lock; struct ib_cache cache; /** @@ -2609,6 +2616,7 @@ struct ib_device { */ refcount_t refcount; struct completion unreg_completion; + struct work_struct unregistration_work; }; struct ib_client { @@ -2658,6 +2666,9 @@ void ib_get_device_fw_str(struct ib_device *device, char *str); int ib_register_device(struct ib_device *device, const char *name); void ib_unregister_device(struct ib_device *device); +void ib_unregister_driver(enum rdma_driver_id driver_id); +void ib_unregister_device_and_put(struct ib_device *device); +void ib_unregister_device_queued(struct ib_device *ib_dev); int ib_register_client (struct ib_client *client); void ib_unregister_client(struct ib_client *client); -- cgit v1.2.3 From ca22354b140853b8155692d5b2bc0110aa54e937 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 12 Feb 2019 21:12:56 -0700 Subject: RDMA/rxe: Close a race after ib_register_device Since rxe allows unregistration from other threads the rxe pointer can become invalid at any moment after ib_register_device returns. This could cause a user triggered use after free. Add another driver callback to be called right after the device becomes registered to complete any device setup required post-registration. This callback has enough core locking to prevent the device from becoming unregistered.
Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index ad83f8c38dc8..640263289ab9 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2539,6 +2539,11 @@ struct ib_device_ops { struct rdma_restrack_entry *entry); /* Device lifecycle callbacks */ + /* + * Called after the device becomes registered, before clients are + * attached + */ + int (*enable_driver)(struct ib_device *dev); /* * This is called as part of ib_dealloc_device(). */ -- cgit v1.2.3 From 3856ec4b93c9463d36ee39098dde1fbbd29ec6dd Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Fri, 15 Feb 2019 11:03:53 -0800 Subject: RDMA/core: Add RDMA_NLDEV_CMD_NEWLINK/DELLINK support Add support for new LINK messages to allow adding and deleting rdma interfaces. This will be used initially for soft rdma drivers which instantiate device instances dynamically by the admin specifying a netdev device to use. The rdma_rxe module will be the first user of these messages. The design is modeled after RTNL_NEWLINK/DELLINK: rdma drivers register with the rdma core if they provide link add/delete functions. Each driver registers with a unique "type" string, that is used to dispatch messages coming from user space. A new RDMA_NLDEV_ATTR is defined for the "type" string. User mode will pass 3 attributes in a NEWLINK message: RDMA_NLDEV_ATTR_DEV_NAME for the desired rdma device name to be created, RDMA_NLDEV_ATTR_LINK_TYPE for the "type" of link being added, and RDMA_NLDEV_ATTR_NDEV_NAME for the net_device interface to use for this link. The DELLINK message will contain the RDMA_NLDEV_ATTR_DEV_INDEX of the device to delete. Signed-off-by: Steve Wise Reviewed-by: Leon Romanovsky Reviewed-by: Michael J. 
Ruhl Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 3 +++ include/rdma/rdma_netlink.h | 11 +++++++++++ include/uapi/rdma/rdma_netlink.h | 10 ++++++++-- 3 files changed, 22 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 640263289ab9..225cb76d469f 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -238,6 +238,7 @@ enum ib_device_cap_flags { IB_DEVICE_RDMA_NETDEV_OPA_VNIC = (1ULL << 35), /* The device supports padding incoming writes to cacheline. */ IB_DEVICE_PCI_WRITE_END_PADDING = (1ULL << 36), + IB_DEVICE_ALLOW_USER_UNREG = (1ULL << 37), }; enum ib_signature_prot_cap { @@ -2622,6 +2623,8 @@ struct ib_device { refcount_t refcount; struct completion unreg_completion; struct work_struct unregistration_work; + + const struct rdma_link_ops *link_ops; }; struct ib_client { diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h index 70218e6b5187..10732ab31ba2 100644 --- a/include/rdma/rdma_netlink.h +++ b/include/rdma/rdma_netlink.h @@ -99,4 +99,15 @@ int rdma_nl_multicast(struct sk_buff *skb, unsigned int group, gfp_t flags); * Returns true on success or false if no listeners. 
*/ bool rdma_nl_chk_listeners(unsigned int group); + +struct rdma_link_ops { + struct list_head list; + const char *type; + int (*newlink)(const char *ibdev_name, struct net_device *ndev); +}; + +void rdma_link_register(struct rdma_link_ops *ops); +void rdma_link_unregister(struct rdma_link_ops *ops); + +#define MODULE_ALIAS_RDMA_LINK(type) MODULE_ALIAS("rdma-link-" type) #endif /* _RDMA_NETLINK_H */ diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 4ebbcfb2c6ef..5cc592728071 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -255,9 +255,11 @@ enum rdma_nldev_command { RDMA_NLDEV_CMD_GET, /* can dump */ RDMA_NLDEV_CMD_SET, - /* 3 - 4 are free to use */ + RDMA_NLDEV_CMD_NEWLINK, - RDMA_NLDEV_CMD_PORT_GET = 5, /* can dump */ + RDMA_NLDEV_CMD_DELLINK, + + RDMA_NLDEV_CMD_PORT_GET, /* can dump */ /* 6 - 8 are free to use */ @@ -465,6 +467,10 @@ enum rdma_nldev_attr { RDMA_NLDEV_ATTR_RES_MRN, /* u32 */ RDMA_NLDEV_ATTR_RES_CM_IDN, /* u32 */ RDMA_NLDEV_ATTR_RES_CTXN, /* u32 */ + /* + * Identifies the rdma driver. eg: "rxe" or "siw" + */ + RDMA_NLDEV_ATTR_LINK_TYPE, /* string */ /* * Always the end -- cgit v1.2.3 From a2a074ef396f8738d9ee08ceefa8811381a4fe4f Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 12 Feb 2019 20:39:16 +0200 Subject: RDMA: Handle ucontext allocations by IB/core Following the PD conversion patch, do the same for ucontext allocations. 
Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 225cb76d469f..9b9e17bcc201 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2389,9 +2389,9 @@ struct ib_device_ops { int (*del_gid)(const struct ib_gid_attr *attr, void **context); int (*query_pkey)(struct ib_device *device, u8 port_num, u16 index, u16 *pkey); - struct ib_ucontext *(*alloc_ucontext)(struct ib_device *device, - struct ib_udata *udata); - int (*dealloc_ucontext)(struct ib_ucontext *context); + int (*alloc_ucontext)(struct ib_ucontext *context, + struct ib_udata *udata); + void (*dealloc_ucontext)(struct ib_ucontext *context); int (*mmap)(struct ib_ucontext *context, struct vm_area_struct *vma); void (*disassociate_ucontext)(struct ib_ucontext *ibcontext); int (*alloc_pd)(struct ib_pd *pd, struct ib_ucontext *context, @@ -2551,6 +2551,7 @@ struct ib_device_ops { void (*dealloc_driver)(struct ib_device *dev); DECLARE_RDMA_OBJ_SIZE(ib_pd); + DECLARE_RDMA_OBJ_SIZE(ib_ucontext); }; struct rdma_restrack_root; -- cgit v1.2.3