From 968e78dd96443e2cc963c493070574778805e76a Mon Sep 17 00:00:00 2001 From: Haggai Eran Date: Thu, 11 Dec 2014 17:04:11 +0200 Subject: IB/mlx5: Enhance UMR support to allow partial page table update The current UMR interface doesn't allow partial updates to a memory region's page tables. This patch changes the interface to allow that. It also changes the way the UMR operation validates the memory region's state. When set, IB_SEND_UMR_FAIL_IF_FREE will cause the UMR operation to fail if the MKEY is in the free state. When it is unchecked the operation will check that it isn't in the free state. Signed-off-by: Haggai Eran Signed-off-by: Shachar Raindel Signed-off-by: Roland Dreier --- include/linux/mlx5/device.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index ea4f1c46f761..fa07bfda0e15 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -180,6 +180,15 @@ enum { MLX5_MKEY_MASK_FREE = 1ull << 29, }; +enum { + MLX5_UMR_TRANSLATION_OFFSET_EN = (1 << 4), + + MLX5_UMR_CHECK_NOT_FREE = (1 << 5), + MLX5_UMR_CHECK_FREE = (2 << 5), + + MLX5_UMR_INLINE = (1 << 7), +}; + enum mlx5_event { MLX5_EVENT_TYPE_COMP = 0x0, @@ -776,6 +785,10 @@ struct mlx5_query_eq_mbox_out { struct mlx5_eq_context ctx; }; +enum { + MLX5_MKEY_STATUS_FREE = 1 << 6, +}; + struct mlx5_mkey_seg { /* This is a two bit field occupying bits 31-30. * bit 31 is always 0, -- cgit v1.2.3 From 406f9e5fa9a7a60b42e676841e39f2d752266814 Mon Sep 17 00:00:00 2001 From: Haggai Eran Date: Thu, 11 Dec 2014 17:04:12 +0200 Subject: IB/core: Replace ib_umem's offset field with a full address In order to allow umems that do not pin memory, we need the umem to keep track of its region's address. This makes the offset field redundant, and so this patch removes it. Signed-off-by: Haggai Eran Signed-off-by: Roland Dreier --- include/rdma/ib_umem.h | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index a2bf41e0bde9..7ed6d4ff58dc 100644 --- a/include/rdma/ib_umem.h +++ b/include/rdma/ib_umem.h @@ -42,7 +42,7 @@ struct ib_ucontext; struct ib_umem { struct ib_ucontext *context; size_t length; - int offset; + unsigned long address; int page_size; int writable; int hugetlb; @@ -55,6 +55,29 @@ struct ib_umem { int npages; }; +/* Returns the offset of the umem start relative to the first page. */ +static inline int ib_umem_offset(struct ib_umem *umem) +{ + return umem->address & ((unsigned long)umem->page_size - 1); +} + +/* Returns the first page of an ODP umem. */ +static inline unsigned long ib_umem_start(struct ib_umem *umem) +{ + return umem->address - ib_umem_offset(umem); +} + +/* Returns the address of the page after the last one of an ODP umem. */ +static inline unsigned long ib_umem_end(struct ib_umem *umem) +{ + return PAGE_ALIGN(umem->address + umem->length); +} + +static inline size_t ib_umem_num_pages(struct ib_umem *umem) +{ + return (ib_umem_end(umem) - ib_umem_start(umem)) >> PAGE_SHIFT; +} + #ifdef CONFIG_INFINIBAND_USER_MEM struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, -- cgit v1.2.3 From c5d76f130b286682b64c659eaf6af701e3d79a7b Mon Sep 17 00:00:00 2001 From: Haggai Eran Date: Thu, 11 Dec 2014 17:04:13 +0200 Subject: IB/core: Add umem function to read data from user-space In some drivers there's a need to read data from a user space area that was pinned using ib_umem when running from a different process context. The ib_umem_copy_from function allows reading data from the physical pages pinned in the ib_umem struct. Signed-off-by: Haggai Eran Signed-off-by: Roland Dreier --- include/rdma/ib_umem.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index 7ed6d4ff58dc..45bb04bc88cd 100644 --- a/include/rdma/ib_umem.h +++ b/include/rdma/ib_umem.h @@ -84,6 +84,8 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, size_t size, int access, int dmasync); void ib_umem_release(struct ib_umem *umem); int ib_umem_page_count(struct ib_umem *umem); +int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset, + size_t length); #else /* CONFIG_INFINIBAND_USER_MEM */ -- cgit v1.2.3 From c1395a2a8c01e8a919e47d64eb3d23d00e824b8b Mon Sep 17 00:00:00 2001 From: Haggai Eran Date: Thu, 11 Dec 2014 17:04:14 +0200 Subject: IB/mlx5: Add function to read WQE from user-space Add a helper function mlx5_ib_read_user_wqe to read information from user-space owned work queues. The function will be used in a later patch by the page-fault handling code in mlx5_ib. Signed-off-by: Haggai Eran [ Add stub for ib_umem_copy_from() for CONFIG_INFINIBAND_USER_MEM=n - Roland ] Signed-off-by: Roland Dreier --- include/linux/mlx5/qp.h | 3 +++ include/rdma/ib_umem.h | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 3fa075daeb1d..67f4b9660b06 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -189,6 +189,9 @@ struct mlx5_wqe_ctrl_seg { __be32 imm; }; +#define MLX5_WQE_CTRL_DS_MASK 0x3f +#define MLX5_WQE_DS_UNITS 16 + struct mlx5_wqe_xrc_seg { __be32 xrc_srqn; u8 rsvd[12]; diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index 45bb04bc88cd..a51f4091489a 100644 --- a/include/rdma/ib_umem.h +++ b/include/rdma/ib_umem.h @@ -98,7 +98,10 @@ static inline struct ib_umem *ib_umem_get(struct ib_ucontext *context, } static inline void ib_umem_release(struct ib_umem *umem) { } static inline int ib_umem_page_count(struct ib_umem *umem) { return 0; } - +static inline int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset, + size_t length) { + return -EINVAL; +} #endif /* CONFIG_INFINIBAND_USER_MEM */ #endif /* IB_UMEM_H */ -- cgit v1.2.3 From 5a77abf9a97a7ecc8fb0f6bf4ad411fb12b02f31 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 11 Dec 2014 17:04:15 +0200 Subject: IB/core: Add support for extended query device caps Add extensible query device capabilities verb to allow adding new features. ib_uverbs_ex_query_device is added and copy_query_dev_fields is used to copy capability fields to be used by both ib_uverbs_query_device and ib_uverbs_ex_query_device. Signed-off-by: Eli Cohen Signed-off-by: Haggai Eran Signed-off-by: Roland Dreier --- include/rdma/ib_verbs.h | 5 ++++- include/uapi/rdma/ib_user_verbs.h | 14 +++++++++++++- 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 470a011d6fa4..97a999f9e4d8 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1662,7 +1662,10 @@ static inline int ib_copy_from_udata(void *dest, struct ib_udata *udata, size_t static inline int ib_copy_to_udata(struct ib_udata *udata, void *src, size_t len) { - return copy_to_user(udata->outbuf, src, len) ? -EFAULT : 0; + size_t copy_sz; + + copy_sz = min_t(size_t, len, udata->outlen); + return copy_to_user(udata->outbuf, src, copy_sz) ? -EFAULT : 0; } /** diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 26daf55ff76e..e8a96071e352 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -90,8 +90,9 @@ enum { }; enum { + IB_USER_VERBS_EX_CMD_QUERY_DEVICE = IB_USER_VERBS_CMD_QUERY_DEVICE, IB_USER_VERBS_EX_CMD_CREATE_FLOW = IB_USER_VERBS_CMD_THRESHOLD, - IB_USER_VERBS_EX_CMD_DESTROY_FLOW + IB_USER_VERBS_EX_CMD_DESTROY_FLOW, }; /* @@ -201,6 +202,17 @@ struct ib_uverbs_query_device_resp { __u8 reserved[4]; }; +struct ib_uverbs_ex_query_device { + __u32 comp_mask; + __u32 reserved; +}; + +struct ib_uverbs_ex_query_device_resp { + struct ib_uverbs_query_device_resp base; + __u32 comp_mask; + __u32 reserved; +}; + struct ib_uverbs_query_port { __u64 response; __u8 port_num; -- cgit v1.2.3 From 860f10a799c83e38a69d5a69d80da5312a4c4106 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 11 Dec 2014 17:04:16 +0200 Subject: IB/core: Add flags for on demand paging support * Add a configuration option for enable on-demand paging support in the infiniband subsystem (CONFIG_INFINIBAND_ON_DEMAND_PAGING). In a later patch, this configuration option will select the MMU_NOTIFIER configuration option to enable mmu notifiers. * Add a flag for on demand paging (ODP) support in the IB device capabilities. * Add a flag to request ODP MR in the access flags to reg_mr. * Fail registrations done with the ODP flag when the low-level driver doesn't support this. * Change the conditions in which an MR will be writable to explicitly specify the access flags. This is to avoid making an MR writable just because it is an ODP MR. * Add a ODP capabilities to the extended query device verb. Signed-off-by: Sagi Grimberg Signed-off-by: Shachar Raindel Signed-off-by: Haggai Eran Signed-off-by: Roland Dreier --- include/rdma/ib_verbs.h | 28 ++++++++++++++++++++++++++-- include/uapi/rdma/ib_user_verbs.h | 15 +++++++++++++++ 2 files changed, 41 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 97a999f9e4d8..a41bc5a39ebf 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -123,7 +123,8 @@ enum ib_device_cap_flags { IB_DEVICE_MEM_WINDOW_TYPE_2A = (1<<23), IB_DEVICE_MEM_WINDOW_TYPE_2B = (1<<24), IB_DEVICE_MANAGED_FLOW_STEERING = (1<<29), - IB_DEVICE_SIGNATURE_HANDOVER = (1<<30) + IB_DEVICE_SIGNATURE_HANDOVER = (1<<30), + IB_DEVICE_ON_DEMAND_PAGING = (1<<31), }; enum ib_signature_prot_cap { @@ -143,6 +144,27 @@ enum ib_atomic_cap { IB_ATOMIC_GLOB }; +enum ib_odp_general_cap_bits { + IB_ODP_SUPPORT = 1 << 0, +}; + +enum ib_odp_transport_cap_bits { + IB_ODP_SUPPORT_SEND = 1 << 0, + IB_ODP_SUPPORT_RECV = 1 << 1, + IB_ODP_SUPPORT_WRITE = 1 << 2, + IB_ODP_SUPPORT_READ = 1 << 3, + IB_ODP_SUPPORT_ATOMIC = 1 << 4, +}; + +struct ib_odp_caps { + uint64_t general_caps; + struct { + uint32_t rc_odp_caps; + uint32_t uc_odp_caps; + uint32_t ud_odp_caps; + } per_transport_caps; +}; + struct ib_device_attr { u64 fw_ver; __be64 sys_image_guid; @@ -186,6 +208,7 @@ struct ib_device_attr { u8 local_ca_ack_delay; int sig_prot_cap; int sig_guard_cap; + struct ib_odp_caps odp_caps; }; enum ib_mtu { @@ -1073,7 +1096,8 @@ enum ib_access_flags { IB_ACCESS_REMOTE_READ = (1<<2), IB_ACCESS_REMOTE_ATOMIC = (1<<3), IB_ACCESS_MW_BIND = (1<<4), - IB_ZERO_BASED = (1<<5) + IB_ZERO_BASED = (1<<5), + IB_ACCESS_ON_DEMAND = (1<<6), }; struct ib_phys_buf { diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index e8a96071e352..4275b961bf60 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -202,15 +202,30 @@ struct ib_uverbs_query_device_resp { __u8 reserved[4]; }; +enum { + IB_USER_VERBS_EX_QUERY_DEVICE_ODP = 1ULL << 0, +}; + struct ib_uverbs_ex_query_device { __u32 comp_mask; __u32 reserved; }; +struct ib_uverbs_odp_caps { + __u64 general_caps; + struct { + __u32 rc_odp_caps; + __u32 uc_odp_caps; + __u32 ud_odp_caps; + } per_transport_caps; + __u32 reserved; +}; + struct ib_uverbs_ex_query_device_resp { struct ib_uverbs_query_device_resp base; __u32 comp_mask; __u32 reserved; + struct ib_uverbs_odp_caps odp_caps; }; struct ib_uverbs_query_port { -- cgit v1.2.3 From 8ada2c1c0c1d75a60723cd2ca7d49c594a146af6 Mon Sep 17 00:00:00 2001 From: Shachar Raindel Date: Thu, 11 Dec 2014 17:04:17 +0200 Subject: IB/core: Add support for on demand paging regions * Extend the umem struct to keep the ODP related data. * Allocate and initialize the ODP related information in the umem (page_list, dma_list) and freeing as needed in the end of the run. * Store a reference to the process PID struct in the ucontext. Used to safely obtain the task_struct and the mm during fault handling, without preventing the task destruction if needed. * Add 2 helper functions: ib_umem_odp_map_dma_pages and ib_umem_odp_unmap_dma_pages. These functions get the DMA addresses of specific pages of the umem (and, currently, pin them). * Support for page faults only - IB core will keep the reference on the pages used and call put_page when freeing an ODP umem area. Invalidations support will be added in a later patch. Signed-off-by: Sagi Grimberg Signed-off-by: Shachar Raindel Signed-off-by: Haggai Eran Signed-off-by: Majd Dibbiny Signed-off-by: Roland Dreier --- include/rdma/ib_umem.h | 2 + include/rdma/ib_umem_odp.h | 97 ++++++++++++++++++++++++++++++++++++++++++++++ include/rdma/ib_verbs.h | 2 + 3 files changed, 101 insertions(+) create mode 100644 include/rdma/ib_umem_odp.h (limited to 'include') diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index a51f4091489a..2d83cfd7e6ce 100644 --- a/include/rdma/ib_umem.h +++ b/include/rdma/ib_umem.h @@ -38,6 +38,7 @@ #include struct ib_ucontext; +struct ib_umem_odp; struct ib_umem { struct ib_ucontext *context; @@ -50,6 +51,7 @@ struct ib_umem { struct pid *pid; struct mm_struct *mm; unsigned long diff; + struct ib_umem_odp *odp_data; struct sg_table sg_head; int nmap; int npages; diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h new file mode 100644 index 000000000000..b5a2df1923b7 --- /dev/null +++ b/include/rdma/ib_umem_odp.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2014 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef IB_UMEM_ODP_H +#define IB_UMEM_ODP_H + +#include + +struct ib_umem_odp { + /* + * An array of the pages included in the on-demand paging umem. + * Indices of pages that are currently not mapped into the device will + * contain NULL. + */ + struct page **page_list; + /* + * An array of the same size as page_list, with DMA addresses mapped + * for pages the pages in page_list. The lower two bits designate + * access permissions. See ODP_READ_ALLOWED_BIT and + * ODP_WRITE_ALLOWED_BIT. + */ + dma_addr_t *dma_list; + /* + * The umem_mutex protects the page_list and dma_list fields of an ODP + * umem, allowing only a single thread to map/unmap pages. + */ + struct mutex umem_mutex; + void *private; /* for the HW driver to use. */ +}; + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + +int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem); + +void ib_umem_odp_release(struct ib_umem *umem); + +/* + * The lower 2 bits of the DMA address signal the R/W permissions for + * the entry. To upgrade the permissions, provide the appropriate + * bitmask to the map_dma_pages function. + * + * Be aware that upgrading a mapped address might result in change of + * the DMA address for the page. + */ +#define ODP_READ_ALLOWED_BIT (1<<0ULL) +#define ODP_WRITE_ALLOWED_BIT (1<<1ULL) + +#define ODP_DMA_ADDR_MASK (~(ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) + +int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 start_offset, u64 bcnt, + u64 access_mask, unsigned long current_seq); + +void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 start_offset, + u64 bound); + +#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ + +static inline int ib_umem_odp_get(struct ib_ucontext *context, + struct ib_umem *umem) +{ + return -EINVAL; +} + +static inline void ib_umem_odp_release(struct ib_umem *umem) {} + +#endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ + +#endif /* IB_UMEM_ODP_H */ diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index a41bc5a39ebf..3af5dcad1b69 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1151,6 +1151,8 @@ struct ib_ucontext { struct list_head xrcd_list; struct list_head rule_list; int closing; + + struct pid *tgid; }; struct ib_uobject { -- cgit v1.2.3 From 882214e2b12860bff1ccff15a3ec2bbb29d58c02 Mon Sep 17 00:00:00 2001 From: Haggai Eran Date: Thu, 11 Dec 2014 17:04:18 +0200 Subject: IB/core: Implement support for MMU notifiers regarding on demand paging regions * Add an interval tree implementation for ODP umems. Create an interval tree for each ucontext (including a count of the number of ODP MRs in this context, semaphore, etc.), and register ODP umems in the interval tree. * Add MMU notifiers handling functions, using the interval tree to notify only the relevant umems and underlying MRs. * Register to receive MMU notifier events from the MM subsystem upon ODP MR registration (and unregister accordingly). * Add a completion object to synchronize the destruction of ODP umems. * Add mechanism to abort page faults when there's a concurrent invalidation. The way we synchronize between concurrent invalidations and page faults is by keeping a counter of currently running invalidations, and a sequence number that is incremented whenever an invalidation is caught. The page fault code checks the counter and also verifies that the sequence number hasn't progressed before it updates the umem's page tables. This is similar to what the kvm module does. In order to prevent the case where we register a umem in the middle of an ongoing notifier, we also keep a per ucontext counter of the total number of active mmu notifiers. We only enable new umems when all the running notifiers complete. Signed-off-by: Sagi Grimberg Signed-off-by: Shachar Raindel Signed-off-by: Haggai Eran Signed-off-by: Yuval Dagan Signed-off-by: Roland Dreier --- include/rdma/ib_umem_odp.h | 65 +++++++++++++++++++++++++++++++++++++++++++++- include/rdma/ib_verbs.h | 19 ++++++++++++++ 2 files changed, 83 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index b5a2df1923b7..3da0b167041b 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -34,6 +34,13 @@ #define IB_UMEM_ODP_H #include +#include +#include + +struct umem_odp_node { + u64 __subtree_last; + struct rb_node rb; +}; struct ib_umem_odp { /* @@ -51,10 +58,27 @@ struct ib_umem_odp { dma_addr_t *dma_list; /* * The umem_mutex protects the page_list and dma_list fields of an ODP - * umem, allowing only a single thread to map/unmap pages. + * umem, allowing only a single thread to map/unmap pages. The mutex + * also protects access to the mmu notifier counters. */ struct mutex umem_mutex; void *private; /* for the HW driver to use. */ + + /* When false, use the notifier counter in the ucontext struct. */ + bool mn_counters_active; + int notifiers_seq; + int notifiers_count; + + /* A linked list of umems that don't have private mmu notifier + * counters yet. */ + struct list_head no_private_counters; + struct ib_umem *umem; + + /* Tree tracking */ + struct umem_odp_node interval_tree; + + struct completion notifier_completion; + int dying; }; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING @@ -82,6 +106,45 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 start_offset, u64 bcnt, void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 start_offset, u64 bound); +void rbt_ib_umem_insert(struct umem_odp_node *node, struct rb_root *root); +void rbt_ib_umem_remove(struct umem_odp_node *node, struct rb_root *root); +typedef int (*umem_call_back)(struct ib_umem *item, u64 start, u64 end, + void *cookie); +/* + * Call the callback on each ib_umem in the range. Returns the logical or of + * the return values of the functions called. + */ +int rbt_ib_umem_for_each_in_range(struct rb_root *root, u64 start, u64 end, + umem_call_back cb, void *cookie); + +struct umem_odp_node *rbt_ib_umem_iter_first(struct rb_root *root, + u64 start, u64 last); +struct umem_odp_node *rbt_ib_umem_iter_next(struct umem_odp_node *node, + u64 start, u64 last); + +static inline int ib_umem_mmu_notifier_retry(struct ib_umem *item, + unsigned long mmu_seq) +{ + /* + * This code is strongly based on the KVM code from + * mmu_notifier_retry. Should be called with + * the relevant locks taken (item->odp_data->umem_mutex + * and the ucontext umem_mutex semaphore locked for read). + */ + + /* Do not allow page faults while the new ib_umem hasn't seen a state + * with zero notifiers yet, and doesn't have its own valid set of + * private counters. */ + if (!item->odp_data->mn_counters_active) + return 1; + + if (unlikely(item->odp_data->notifiers_count)) + return 1; + if (item->odp_data->notifiers_seq != mmu_seq) + return 1; + return 0; +} + #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ static inline int ib_umem_odp_get(struct ib_ucontext *context, diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 3af5dcad1b69..0d74f1de99aa 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -51,6 +51,7 @@ #include #include +#include #include extern struct workqueue_struct *ib_wq; @@ -1139,6 +1140,8 @@ struct ib_fmr_attr { u8 page_shift; }; +struct ib_umem; + struct ib_ucontext { struct ib_device *device; struct list_head pd_list; @@ -1153,6 +1156,22 @@ struct ib_ucontext { int closing; struct pid *tgid; +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + struct rb_root umem_tree; + /* + * Protects .umem_rbroot and tree, as well as odp_mrs_count and + * mmu notifiers registration. + */ + struct rw_semaphore umem_rwsem; + void (*invalidate_range)(struct ib_umem *umem, + unsigned long start, unsigned long end); + + struct mmu_notifier mn; + atomic_t notifier_count; + /* A list of umems that don't have private mmu notifier counters yet. */ + struct list_head no_private_counters; + int odp_mrs_count; +#endif }; struct ib_uobject { -- cgit v1.2.3 From 6cb7ff3dcfe6aad6a36a0fd0e928b5bea4fabdd5 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Mon, 15 Dec 2014 18:17:17 -0800 Subject: mlx5_core: Re-add MLX5_DEV_CAP_FLAG_ON_DMND_PG flag In commit 0c7aac854f52 ("net/mlx5_core: Remove unused dev cap enum fields"), the flag MLX5_DEV_CAP_FLAG_ON_DMND_PG was removed. Unfortunately the on-demand paging changes actually use it, so re-add the missing flag. Signed-off-by: Roland Dreier --- include/linux/mlx5/device.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index fa07bfda0e15..096abe543d2c 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -234,6 +234,7 @@ enum { MLX5_DEV_CAP_FLAG_APM = 1LL << 17, MLX5_DEV_CAP_FLAG_ATOMIC = 1LL << 18, MLX5_DEV_CAP_FLAG_BLOCK_MCAST = 1LL << 23, + MLX5_DEV_CAP_FLAG_ON_DMND_PG = 1LL << 24, MLX5_DEV_CAP_FLAG_CQ_MODER = 1LL << 29, MLX5_DEV_CAP_FLAG_RESIZE_CQ = 1LL << 30, MLX5_DEV_CAP_FLAG_DCT = 1LL << 37, -- cgit v1.2.3 From e420f0c0f3d1022789fcb59b2a0c4b979ce311ba Mon Sep 17 00:00:00 2001 From: Haggai Eran Date: Thu, 11 Dec 2014 17:04:19 +0200 Subject: mlx5_core: Add support for page faults events and low level handling * Add a handler function pointer in the mlx5_core_qp struct for page fault events. Handle page fault events by calling the handler function, if not NULL. * Add on-demand paging capability query command. * Export command for resuming QPs after page faults. * Add various constants related to paging support. Signed-off-by: Sagi Grimberg Signed-off-by: Shachar Raindel Signed-off-by: Haggai Eran Signed-off-by: Roland Dreier --- include/linux/mlx5/device.h | 54 +++++++++++++++++++++++++++++++++++++++++++- include/linux/mlx5/driver.h | 12 ++++++++++ include/linux/mlx5/qp.h | 55 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 120 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 096abe543d2c..70c28239e339 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -119,6 +119,15 @@ enum { MLX5_MAX_LOG_PKEY_TABLE = 5, }; +enum { + MLX5_MKEY_INBOX_PG_ACCESS = 1 << 31 +}; + +enum { + MLX5_PFAULT_SUBTYPE_WQE = 0, + MLX5_PFAULT_SUBTYPE_RDMA = 1, +}; + enum { MLX5_PERM_LOCAL_READ = 1 << 2, MLX5_PERM_LOCAL_WRITE = 1 << 3, @@ -215,6 +224,8 @@ enum mlx5_event { MLX5_EVENT_TYPE_CMD = 0x0a, MLX5_EVENT_TYPE_PAGE_REQUEST = 0xb, + + MLX5_EVENT_TYPE_PAGE_FAULT = 0xc, }; enum { @@ -300,6 +311,8 @@ enum { enum { HCA_CAP_OPMOD_GET_MAX = 0, HCA_CAP_OPMOD_GET_CUR = 1, + HCA_CAP_OPMOD_GET_ODP_MAX = 4, + HCA_CAP_OPMOD_GET_ODP_CUR = 5 }; struct mlx5_inbox_hdr { @@ -329,6 +342,23 @@ struct mlx5_cmd_query_adapter_mbox_out { u8 vsd_psid[16]; }; +enum mlx5_odp_transport_cap_bits { + MLX5_ODP_SUPPORT_SEND = 1 << 31, + MLX5_ODP_SUPPORT_RECV = 1 << 30, + MLX5_ODP_SUPPORT_WRITE = 1 << 29, + MLX5_ODP_SUPPORT_READ = 1 << 28, +}; + +struct mlx5_odp_caps { + char reserved[0x10]; + struct { + __be32 rc_odp_caps; + __be32 uc_odp_caps; + __be32 ud_odp_caps; + } per_transport_caps; + char reserved2[0xe4]; +}; + struct mlx5_cmd_init_hca_mbox_in { struct mlx5_inbox_hdr hdr; u8 rsvd0[2]; @@ -449,6 +479,27 @@ struct mlx5_eqe_page_req { __be32 rsvd1[5]; }; +struct mlx5_eqe_page_fault { + __be32 bytes_committed; + union { + struct { + u16 reserved1; + __be16 wqe_index; + u16 reserved2; + __be16 packet_length; + u8 reserved3[12]; + } __packed wqe; + struct { + __be32 r_key; + u16 reserved1; + __be16 packet_length; + __be32 rdma_op_len; + __be64 rdma_va; + } __packed rdma; + } __packed; + __be32 flags_qpn; +} __packed; + union ev_data { __be32 raw[7]; struct mlx5_eqe_cmd cmd; @@ -460,6 +511,7 @@ union ev_data { struct mlx5_eqe_congestion cong; struct mlx5_eqe_stall_vl stall_vl; struct mlx5_eqe_page_req req_pages; + struct mlx5_eqe_page_fault page_fault; } __packed; struct mlx5_eqe { @@ -826,7 +878,7 @@ struct mlx5_query_special_ctxs_mbox_out { struct mlx5_create_mkey_mbox_in { struct mlx5_inbox_hdr hdr; __be32 input_mkey_index; - u8 rsvd0[4]; + __be32 flags; struct mlx5_mkey_seg seg; u8 rsvd1[16]; __be32 xlat_oct_act_size; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index b1bf41556b32..7088dcd19214 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -113,6 +113,13 @@ enum { MLX5_REG_HOST_ENDIANNESS = 0x7004, }; +enum mlx5_page_fault_resume_flags { + MLX5_PAGE_FAULT_RESUME_REQUESTOR = 1 << 0, + MLX5_PAGE_FAULT_RESUME_WRITE = 1 << 1, + MLX5_PAGE_FAULT_RESUME_RDMA = 1 << 2, + MLX5_PAGE_FAULT_RESUME_ERROR = 1 << 7, +}; + enum dbg_rsc_type { MLX5_DBG_RSC_QP, MLX5_DBG_RSC_EQ, @@ -703,6 +710,9 @@ void mlx5_eq_cleanup(struct mlx5_core_dev *dev); void mlx5_fill_page_array(struct mlx5_buf *buf, __be64 *pas); void mlx5_cq_completion(struct mlx5_core_dev *dev, u32 cqn); void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type); +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +void mlx5_eq_pagefault(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe); +#endif void mlx5_srq_event(struct mlx5_core_dev *dev, u32 srqn, int event_type); struct mlx5_core_srq *mlx5_core_get_srq(struct mlx5_core_dev *dev, u32 srqn); void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, unsigned long vector); @@ -740,6 +750,8 @@ int mlx5_core_create_psv(struct mlx5_core_dev *dev, u32 pdn, int npsvs, u32 *sig_index); int mlx5_core_destroy_psv(struct mlx5_core_dev *dev, int psv_num); void mlx5_core_put_rsc(struct mlx5_core_rsc_common *common); +int mlx5_query_odp_caps(struct mlx5_core_dev *dev, + struct mlx5_odp_caps *odp_caps); static inline u32 mlx5_mkey_to_idx(u32 mkey) { diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 67f4b9660b06..6b1d6f60c7e6 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -50,6 +50,9 @@ #define MLX5_BSF_APPTAG_ESCAPE 0x1 #define MLX5_BSF_APPREF_ESCAPE 0x2 +#define MLX5_QPN_BITS 24 +#define MLX5_QPN_MASK ((1 << MLX5_QPN_BITS) - 1) + enum mlx5_qp_optpar { MLX5_QP_OPTPAR_ALT_ADDR_PATH = 1 << 0, MLX5_QP_OPTPAR_RRE = 1 << 1, @@ -363,9 +366,46 @@ struct mlx5_stride_block_ctrl_seg { __be16 num_entries; }; +enum mlx5_pagefault_flags { + MLX5_PFAULT_REQUESTOR = 1 << 0, + MLX5_PFAULT_WRITE = 1 << 1, + MLX5_PFAULT_RDMA = 1 << 2, +}; + +/* Contains the details of a pagefault. */ +struct mlx5_pagefault { + u32 bytes_committed; + u8 event_subtype; + enum mlx5_pagefault_flags flags; + union { + /* Initiator or send message responder pagefault details. */ + struct { + /* Received packet size, only valid for responders. */ + u32 packet_size; + /* + * WQE index. Refers to either the send queue or + * receive queue, according to event_subtype. + */ + u16 wqe_index; + } wqe; + /* RDMA responder pagefault details */ + struct { + u32 r_key; + /* + * Received packet size, minimal size page fault + * resolution required for forward progress. + */ + u32 packet_size; + u32 rdma_op_len; + u64 rdma_va; + } rdma; + }; +}; + struct mlx5_core_qp { struct mlx5_core_rsc_common common; /* must be first */ void (*event) (struct mlx5_core_qp *, int); + void (*pfault_handler)(struct mlx5_core_qp *, struct mlx5_pagefault *); int qpn; struct mlx5_rsc_debug *dbg; int pid; @@ -533,6 +573,17 @@ static inline struct mlx5_core_mr *__mlx5_mr_lookup(struct mlx5_core_dev *dev, u return radix_tree_lookup(&dev->priv.mr_table.tree, key); } +struct mlx5_page_fault_resume_mbox_in { + struct mlx5_inbox_hdr hdr; + __be32 flags_qpn; + u8 reserved[4]; +}; + +struct mlx5_page_fault_resume_mbox_out { + struct mlx5_outbox_hdr hdr; + u8 rsvd[8]; +}; + int mlx5_core_create_qp(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp, struct mlx5_create_qp_mbox_in *in, @@ -552,6 +603,10 @@ void mlx5_init_qp_table(struct mlx5_core_dev *dev); void mlx5_cleanup_qp_table(struct mlx5_core_dev *dev); int mlx5_debug_qp_add(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp); void mlx5_debug_qp_remove(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp); +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 qpn, + u8 context, int error); +#endif static inline const char *mlx5_qp_type_str(int type) { -- cgit v1.2.3 From cc149f751b75211df8c41fcd60bd0006e6143ed6 Mon Sep 17 00:00:00 2001 From: Haggai Eran Date: Thu, 11 Dec 2014 17:04:21 +0200 Subject: IB/mlx5: Changes in memory region creation to support on-demand paging This patch wraps together several changes needed for on-demand paging support in the mlx5_ib_populate_pas function, and when registering memory regions. * Instead of accepting a UMR bit telling the function to enable all access flags, the function now accepts the access flags themselves. * For on-demand paging memory regions, fill the memory tables from the correct list, and enable/disable the access flags per-page according to whether the page is present. * A new bit is set to enable writing of access flags when using the firmware create_mkey command. * Disable contig pages when on-demand paging is enabled. In addition the patch changes the UMR code to use PTR_ALIGN instead of our own macro. Signed-off-by: Haggai Eran Signed-off-by: Roland Dreier --- include/linux/mlx5/device.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 70c28239e339..64512a7354cb 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -198,6 +198,9 @@ enum { MLX5_UMR_INLINE = (1 << 7), }; +#define MLX5_UMR_MTT_ALIGNMENT 0x40 +#define MLX5_UMR_MTT_MASK (MLX5_UMR_MTT_ALIGNMENT - 1) + enum mlx5_event { MLX5_EVENT_TYPE_COMP = 0x0, -- cgit v1.2.3 From 832a6b06ab5e13c228fc27e333ad360aa03ace6f Mon Sep 17 00:00:00 2001 From: Haggai Eran Date: Thu, 11 Dec 2014 17:04:22 +0200 Subject: IB/mlx5: Add mlx5_ib_update_mtt to update page tables after creation The new function allows updating the page tables of a memory region after it was created. This can be used to handle page faults and page invalidations. Since mlx5_ib_update_mtt will need to work from within page invalidation, so it must not block on memory allocation. It employs an atomic memory allocation mechanism that is used as a fallback when kmalloc(GFP_ATOMIC) fails. In order to reuse code from mlx5_ib_populate_pas, the patch splits this function and add the needed parameters. Signed-off-by: Haggai Eran Signed-off-by: Shachar Raindel Signed-off-by: Roland Dreier --- include/linux/mlx5/device.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 64512a7354cb..4e5bd813bb9a 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -200,6 +200,7 @@ enum { #define MLX5_UMR_MTT_ALIGNMENT 0x40 #define MLX5_UMR_MTT_MASK (MLX5_UMR_MTT_ALIGNMENT - 1) +#define MLX5_UMR_MTT_MIN_CHUNK_SIZE MLX5_UMR_MTT_ALIGNMENT enum mlx5_event { MLX5_EVENT_TYPE_COMP = 0x0, -- cgit v1.2.3 From 6aec21f6a8322fa8d43df3ea7f051dfd8967f1b9 Mon Sep 17 00:00:00 2001 From: Haggai Eran Date: Thu, 11 Dec 2014 17:04:23 +0200 Subject: IB/mlx5: Page faults handling infrastructure * Refactor MR registration and cleanup, and fix reg_pages accounting. * Create a work queue to handle page fault events in a kthread context. * Register a fault handler to get events from the core for each QP. The registered fault handler is empty in this patch, and only a later patch implements it. Signed-off-by: Sagi Grimberg Signed-off-by: Shachar Raindel Signed-off-by: Haggai Eran Signed-off-by: Roland Dreier --- include/linux/mlx5/driver.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 7088dcd19214..166d9315fe4b 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -474,7 +474,7 @@ struct mlx5_priv { struct workqueue_struct *pg_wq; struct rb_root page_root; int fw_pages; - int reg_pages; + atomic_t reg_pages; struct list_head free_list; struct mlx5_core_health health; -- cgit v1.2.3 From 7bdf65d411c1715d695be0d9a555d7f48d0a7220 Mon Sep 17 00:00:00 2001 From: Haggai Eran Date: Thu, 11 Dec 2014 17:04:24 +0200 Subject: IB/mlx5: Handle page faults This patch implement a page fault handler (leaving the pages pinned as of time being). The page fault handler handles initiator and responder page faults for UD/RC transports, for send/receive operations, as well as RDMA read/write initiator support. Signed-off-by: Sagi Grimberg Signed-off-by: Shachar Raindel Signed-off-by: Haggai Eran Signed-off-by: Roland Dreier --- include/linux/mlx5/qp.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 6b1d6f60c7e6..61f7a342d1bf 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -193,7 +193,12 @@ struct mlx5_wqe_ctrl_seg { }; #define MLX5_WQE_CTRL_DS_MASK 0x3f +#define MLX5_WQE_CTRL_QPN_MASK 0xffffff00 +#define MLX5_WQE_CTRL_QPN_SHIFT 8 #define MLX5_WQE_DS_UNITS 16 +#define MLX5_WQE_CTRL_OPCODE_MASK 0xff +#define MLX5_WQE_CTRL_WQE_INDEX_MASK 0x00ffff00 +#define MLX5_WQE_CTRL_WQE_INDEX_SHIFT 8 struct mlx5_wqe_xrc_seg { __be32 xrc_srqn; @@ -298,6 +303,8 @@ struct mlx5_wqe_signature_seg { u8 rsvd1[11]; }; +#define MLX5_WQE_INLINE_SEG_BYTE_COUNT_MASK 0x3ff + struct mlx5_wqe_inline_seg { __be32 byte_count; }; -- cgit v1.2.3