diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2015-04-22 11:50:05 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-04-22 11:50:05 -0700 |
commit | 7c034dfd58bbc056280262887acf5b7a98944d0a (patch) | |
tree | 49dc3b77590f9929069db17922c823bb679c1456 /drivers | |
parent | 1204c464458e9837320a326a9fce550e3c5ef5de (diff) | |
parent | c1c2fef6cfb04cf30a56bb37cff40d3498d7edbf (diff) |
Merge tag 'rdma-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/roland/infiniband
Pull InfiniBand/RDMA updates from Roland Dreier:
- IPoIB fixes from Doug Ledford and Erez Shitrit
- iSER updates from Sagi Grimberg
- mlx4 GUID handling changes from Yishai Hadas
- other misc fixes
* tag 'rdma-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/roland/infiniband: (51 commits)
mlx5: wrong page mask if CONFIG_ARCH_DMA_ADDR_T_64BIT enabled for 32Bit architectures
IB/iser: Rewrite bounce buffer code path
IB/iser: Bump version to 1.6
IB/iser: Remove code duplication for a single DMA entry
IB/iser: Pass struct iser_mem_reg to iser_fast_reg_mr and iser_reg_sig_mr
IB/iser: Modify struct iser_mem_reg members
IB/iser: Make fastreg pool cache friendly
IB/iser: Move PI context alloc/free to routines
IB/iser: Move fastreg descriptor pool get/put to helper functions
IB/iser: Merge build page-vec into register page-vec
IB/iser: Get rid of struct iser_rdma_regd
IB/iser: Remove redundant assignments in iser_reg_page_vec
IB/iser: Move memory reg/dereg routines to iser_memory.c
IB/iser: Don't pass ib_device to fall_to_bounce_buff routine
IB/iser: Remove a redundant struct iser_data_buf
IB/iser: Remove redundant cmd_data_len calculation
IB/iser: Fix wrong calculation of protection buffer length
IB/iser: Handle fastreg/local_inv completion errors
IB/iser: Fix unload during ep_poll wrong dereference
ib_srpt: convert printk's to pr_* functions
...
Diffstat (limited to 'drivers')
25 files changed, 1494 insertions, 1131 deletions
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 8c014b5dab4c..38acb3cfc545 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -99,12 +99,15 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, if (dmasync) dma_set_attr(DMA_ATTR_WRITE_BARRIER, &attrs); + if (!size) + return ERR_PTR(-EINVAL); + /* * If the combination of the addr and size requested for this memory * region causes an integer overflow, return error. */ - if ((PAGE_ALIGN(addr + size) <= size) || - (PAGE_ALIGN(addr + size) <= addr)) + if (((addr + size) < addr) || + PAGE_ALIGN(addr + size) < (addr + size)) return ERR_PTR(-EINVAL); if (!can_do_mlock()) diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 259dcc7779f5..88cce9bb72fe 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -246,6 +246,17 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, kfree(uqp); } + list_for_each_entry_safe(uobj, tmp, &context->srq_list, list) { + struct ib_srq *srq = uobj->object; + struct ib_uevent_object *uevent = + container_of(uobj, struct ib_uevent_object, uobject); + + idr_remove_uobj(&ib_uverbs_srq_idr, uobj); + ib_destroy_srq(srq); + ib_uverbs_release_uevent(file, uevent); + kfree(uevent); + } + list_for_each_entry_safe(uobj, tmp, &context->cq_list, list) { struct ib_cq *cq = uobj->object; struct ib_uverbs_event_file *ev_file = cq->cq_context; @@ -258,17 +269,6 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, kfree(ucq); } - list_for_each_entry_safe(uobj, tmp, &context->srq_list, list) { - struct ib_srq *srq = uobj->object; - struct ib_uevent_object *uevent = - container_of(uobj, struct ib_uevent_object, uobject); - - idr_remove_uobj(&ib_uverbs_srq_idr, uobj); - ib_destroy_srq(srq); - ib_uverbs_release_uevent(file, uevent); - kfree(uevent); - } - list_for_each_entry_safe(uobj, tmp, &context->mr_list, list) { struct ib_mr *mr = uobj->object; diff --git a/drivers/infiniband/hw/mlx4/alias_GUID.c b/drivers/infiniband/hw/mlx4/alias_GUID.c index a31e031afd87..0f00204d2ece 100644 --- a/drivers/infiniband/hw/mlx4/alias_GUID.c +++ b/drivers/infiniband/hw/mlx4/alias_GUID.c @@ -58,14 +58,19 @@ struct mlx4_alias_guid_work_context { int query_id; struct list_head list; int block_num; + ib_sa_comp_mask guid_indexes; + u8 method; }; struct mlx4_next_alias_guid_work { u8 port; u8 block_num; + u8 method; struct mlx4_sriov_alias_guid_info_rec_det rec_det; }; +static int get_low_record_time_index(struct mlx4_ib_dev *dev, u8 port, + int *resched_delay_sec); void mlx4_ib_update_cache_on_guid_change(struct mlx4_ib_dev *dev, int block_num, u8 port_num, u8 *p_data) @@ -118,6 +123,57 @@ ib_sa_comp_mask mlx4_ib_get_aguid_comp_mask_from_ix(int index) return IB_SA_COMP_MASK(4 + index); } +void mlx4_ib_slave_alias_guid_event(struct mlx4_ib_dev *dev, int slave, + int port, int slave_init) +{ + __be64 curr_guid, required_guid; + int record_num = slave / 8; + int index = slave % 8; + int port_index = port - 1; + unsigned long flags; + int do_work = 0; + + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags); + if (dev->sriov.alias_guid.ports_guid[port_index].state_flags & + GUID_STATE_NEED_PORT_INIT) + goto unlock; + if (!slave_init) { + curr_guid = *(__be64 *)&dev->sriov. + alias_guid.ports_guid[port_index]. + all_rec_per_port[record_num]. + all_recs[GUID_REC_SIZE * index]; + if (curr_guid == cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL) || + !curr_guid) + goto unlock; + required_guid = cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL); + } else { + required_guid = mlx4_get_admin_guid(dev->dev, slave, port); + if (required_guid == cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL)) + goto unlock; + } + *(__be64 *)&dev->sriov.alias_guid.ports_guid[port_index]. + all_rec_per_port[record_num]. + all_recs[GUID_REC_SIZE * index] = required_guid; + dev->sriov.alias_guid.ports_guid[port_index]. + all_rec_per_port[record_num].guid_indexes + |= mlx4_ib_get_aguid_comp_mask_from_ix(index); + dev->sriov.alias_guid.ports_guid[port_index]. + all_rec_per_port[record_num].status + = MLX4_GUID_INFO_STATUS_IDLE; + /* set to run immediately */ + dev->sriov.alias_guid.ports_guid[port_index]. + all_rec_per_port[record_num].time_to_run = 0; + dev->sriov.alias_guid.ports_guid[port_index]. + all_rec_per_port[record_num]. + guids_retry_schedule[index] = 0; + do_work = 1; +unlock: + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags); + + if (do_work) + mlx4_ib_init_alias_guid_work(dev, port_index); +} + /* * Whenever new GUID is set/unset (guid table change) create event and * notify the relevant slave (master also should be notified). @@ -138,10 +194,15 @@ void mlx4_ib_notify_slaves_on_guid_change(struct mlx4_ib_dev *dev, enum slave_port_state prev_state; __be64 tmp_cur_ag, form_cache_ag; enum slave_port_gen_event gen_event; + struct mlx4_sriov_alias_guid_info_rec_det *rec; + unsigned long flags; + __be64 required_value; if (!mlx4_is_master(dev->dev)) return; + rec = &dev->sriov.alias_guid.ports_guid[port_num - 1]. + all_rec_per_port[block_num]; guid_indexes = be64_to_cpu((__force __be64) dev->sriov.alias_guid. ports_guid[port_num - 1]. all_rec_per_port[block_num].guid_indexes); @@ -166,8 +227,27 @@ void mlx4_ib_notify_slaves_on_guid_change(struct mlx4_ib_dev *dev, */ if (tmp_cur_ag != form_cache_ag) continue; - mlx4_gen_guid_change_eqe(dev->dev, slave_id, port_num); + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags); + required_value = *(__be64 *)&rec->all_recs[i * GUID_REC_SIZE]; + + if (required_value == cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL)) + required_value = 0; + + if (tmp_cur_ag == required_value) { + rec->guid_indexes = rec->guid_indexes & + ~mlx4_ib_get_aguid_comp_mask_from_ix(i); + } else { + /* may notify port down if value is 0 */ + if (tmp_cur_ag != MLX4_NOT_SET_GUID) { + spin_unlock_irqrestore(&dev->sriov. + alias_guid.ag_work_lock, flags); + continue; + } + } + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, + flags); + mlx4_gen_guid_change_eqe(dev->dev, slave_id, port_num); /*2 cases: Valid GUID, and Invalid Guid*/ if (tmp_cur_ag != MLX4_NOT_SET_GUID) { /*valid GUID*/ @@ -188,10 +268,14 @@ void mlx4_ib_notify_slaves_on_guid_change(struct mlx4_ib_dev *dev, set_and_calc_slave_port_state(dev->dev, slave_id, port_num, MLX4_PORT_STATE_IB_EVENT_GID_INVALID, &gen_event); - pr_debug("sending PORT DOWN event to slave: %d, port: %d\n", - slave_id, port_num); - mlx4_gen_port_state_change_eqe(dev->dev, slave_id, port_num, - MLX4_PORT_CHANGE_SUBTYPE_DOWN); + if (gen_event == SLAVE_PORT_GEN_EVENT_DOWN) { + pr_debug("sending PORT DOWN event to slave: %d, port: %d\n", + slave_id, port_num); + mlx4_gen_port_state_change_eqe(dev->dev, + slave_id, + port_num, + MLX4_PORT_CHANGE_SUBTYPE_DOWN); + } } } } @@ -206,6 +290,9 @@ static void aliasguid_query_handler(int status, int i; struct mlx4_sriov_alias_guid_info_rec_det *rec; unsigned long flags, flags1; + ib_sa_comp_mask declined_guid_indexes = 0; + ib_sa_comp_mask applied_guid_indexes = 0; + unsigned int resched_delay_sec = 0; if (!context) return; @@ -216,9 +303,9 @@ static void aliasguid_query_handler(int status, all_rec_per_port[cb_ctx->block_num]; if (status) { - rec->status = MLX4_GUID_INFO_STATUS_IDLE; pr_debug("(port: %d) failed: status = %d\n", cb_ctx->port, status); + rec->time_to_run = ktime_get_real_ns() + 1 * NSEC_PER_SEC; goto out; } @@ -235,57 +322,101 @@ static void aliasguid_query_handler(int status, rec = &dev->sriov.alias_guid.ports_guid[port_index]. all_rec_per_port[guid_rec->block_num]; - rec->status = MLX4_GUID_INFO_STATUS_SET; - rec->method = MLX4_GUID_INFO_RECORD_SET; - + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags); for (i = 0 ; i < NUM_ALIAS_GUID_IN_REC; i++) { - __be64 tmp_cur_ag; - tmp_cur_ag = *(__be64 *)&guid_rec->guid_info_list[i * GUID_REC_SIZE]; + __be64 sm_response, required_val; + + if (!(cb_ctx->guid_indexes & + mlx4_ib_get_aguid_comp_mask_from_ix(i))) + continue; + sm_response = *(__be64 *)&guid_rec->guid_info_list + [i * GUID_REC_SIZE]; + required_val = *(__be64 *)&rec->all_recs[i * GUID_REC_SIZE]; + if (cb_ctx->method == MLX4_GUID_INFO_RECORD_DELETE) { + if (required_val == + cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL)) + goto next_entry; + + /* A new value was set till we got the response */ + pr_debug("need to set new value %llx, record num %d, block_num:%d\n", + be64_to_cpu(required_val), + i, guid_rec->block_num); + goto entry_declined; + } + /* check if the SM didn't assign one of the records. - * if it didn't, if it was not sysadmin request: - * ask the SM to give a new GUID, (instead of the driver request). + * if it didn't, re-ask for. */ - if (tmp_cur_ag == MLX4_NOT_SET_GUID) { - mlx4_ib_warn(&dev->ib_dev, "%s:Record num %d in " - "block_num: %d was declined by SM, " - "ownership by %d (0 = driver, 1=sysAdmin," - " 2=None)\n", __func__, i, - guid_rec->block_num, rec->ownership); - if (rec->ownership == MLX4_GUID_DRIVER_ASSIGN) { - /* if it is driver assign, asks for new GUID from SM*/ - *(__be64 *)&rec->all_recs[i * GUID_REC_SIZE] = - MLX4_NOT_SET_GUID; - - /* Mark the record as not assigned, and let it - * be sent again in the next work sched.*/ - rec->status = MLX4_GUID_INFO_STATUS_IDLE; - rec->guid_indexes |= mlx4_ib_get_aguid_comp_mask_from_ix(i); - } + if (sm_response == MLX4_NOT_SET_GUID) { + if (rec->guids_retry_schedule[i] == 0) + mlx4_ib_warn(&dev->ib_dev, + "%s:Record num %d in block_num: %d was declined by SM\n", + __func__, i, + guid_rec->block_num); + goto entry_declined; } else { /* properly assigned record. */ /* We save the GUID we just got from the SM in the * admin_guid in order to be persistent, and in the * request from the sm the process will ask for the same GUID */ - if (rec->ownership == MLX4_GUID_SYSADMIN_ASSIGN && - tmp_cur_ag != *(__be64 *)&rec->all_recs[i * GUID_REC_SIZE]) { - /* the sysadmin assignment failed.*/ - mlx4_ib_warn(&dev->ib_dev, "%s: Failed to set" - " admin guid after SysAdmin " - "configuration. " - "Record num %d in block_num:%d " - "was declined by SM, " - "new val(0x%llx) was kept\n", - __func__, i, - guid_rec->block_num, - be64_to_cpu(*(__be64 *) & - rec->all_recs[i * GUID_REC_SIZE])); + if (required_val && + sm_response != required_val) { + /* Warn only on first retry */ + if (rec->guids_retry_schedule[i] == 0) + mlx4_ib_warn(&dev->ib_dev, "%s: Failed to set" + " admin guid after SysAdmin " + "configuration. " + "Record num %d in block_num:%d " + "was declined by SM, " + "new val(0x%llx) was kept, SM returned (0x%llx)\n", + __func__, i, + guid_rec->block_num, + be64_to_cpu(required_val), + be64_to_cpu(sm_response)); + goto entry_declined; } else { - memcpy(&rec->all_recs[i * GUID_REC_SIZE], - &guid_rec->guid_info_list[i * GUID_REC_SIZE], - GUID_REC_SIZE); + *(__be64 *)&rec->all_recs[i * GUID_REC_SIZE] = + sm_response; + if (required_val == 0) + mlx4_set_admin_guid(dev->dev, + sm_response, + (guid_rec->block_num + * NUM_ALIAS_GUID_IN_REC) + i, + cb_ctx->port); + goto next_entry; } } +entry_declined: + declined_guid_indexes |= mlx4_ib_get_aguid_comp_mask_from_ix(i); + rec->guids_retry_schedule[i] = + (rec->guids_retry_schedule[i] == 0) ? 1 : + min((unsigned int)60, + rec->guids_retry_schedule[i] * 2); + /* using the minimum value among all entries in that record */ + resched_delay_sec = (resched_delay_sec == 0) ? + rec->guids_retry_schedule[i] : + min(resched_delay_sec, + rec->guids_retry_schedule[i]); + continue; + +next_entry: + rec->guids_retry_schedule[i] = 0; } + + applied_guid_indexes = cb_ctx->guid_indexes & ~declined_guid_indexes; + if (declined_guid_indexes || + rec->guid_indexes & ~(applied_guid_indexes)) { + pr_debug("record=%d wasn't fully set, guid_indexes=0x%llx applied_indexes=0x%llx, declined_indexes=0x%llx\n", + guid_rec->block_num, + be64_to_cpu((__force __be64)rec->guid_indexes), + be64_to_cpu((__force __be64)applied_guid_indexes), + be64_to_cpu((__force __be64)declined_guid_indexes)); + rec->time_to_run = ktime_get_real_ns() + + resched_delay_sec * NSEC_PER_SEC; + } else { + rec->status = MLX4_GUID_INFO_STATUS_SET; + } + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags); /* The func is call here to close the cases when the sm doesn't send smp, so in the sa response the driver @@ -297,10 +428,13 @@ static void aliasguid_query_handler(int status, out: spin_lock_irqsave(&dev->sriov.going_down_lock, flags); spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1); - if (!dev->sriov.is_going_down) + if (!dev->sriov.is_going_down) { + get_low_record_time_index(dev, port_index, &resched_delay_sec); queue_delayed_work(dev->sriov.alias_guid.ports_guid[port_index].wq, &dev->sriov.alias_guid.ports_guid[port_index]. - alias_guid_work, 0); + alias_guid_work, + msecs_to_jiffies(resched_delay_sec * 1000)); + } if (cb_ctx->sa_query) { list_del(&cb_ctx->list); kfree(cb_ctx); @@ -317,9 +451,7 @@ static void invalidate_guid_record(struct mlx4_ib_dev *dev, u8 port, int index) ib_sa_comp_mask comp_mask = 0; dev->sriov.alias_guid.ports_guid[port - 1].all_rec_per_port[index].status - = MLX4_GUID_INFO_STATUS_IDLE; - dev->sriov.alias_guid.ports_guid[port - 1].all_rec_per_port[index].method - = MLX4_GUID_INFO_RECORD_SET; + = MLX4_GUID_INFO_STATUS_SET; /* calculate the comp_mask for that record.*/ for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) { @@ -333,19 +465,21 @@ static void invalidate_guid_record(struct mlx4_ib_dev *dev, u8 port, int index) need to assign GUIDs, then don't put it up for assignment. */ if (MLX4_GUID_FOR_DELETE_VAL == cur_admin_val || - (!index && !i) || - MLX4_GUID_NONE_ASSIGN == dev->sriov.alias_guid. - ports_guid[port - 1].all_rec_per_port[index].ownership) + (!index && !i)) continue; comp_mask |= mlx4_ib_get_aguid_comp_mask_from_ix(i); } dev->sriov.alias_guid.ports_guid[port - 1]. - all_rec_per_port[index].guid_indexes = comp_mask; + all_rec_per_port[index].guid_indexes |= comp_mask; + if (dev->sriov.alias_guid.ports_guid[port - 1]. + all_rec_per_port[index].guid_indexes) + dev->sriov.alias_guid.ports_guid[port - 1]. + all_rec_per_port[index].status = MLX4_GUID_INFO_STATUS_IDLE; + } static int set_guid_rec(struct ib_device *ibdev, - u8 port, int index, - struct mlx4_sriov_alias_guid_info_rec_det *rec_det) + struct mlx4_next_alias_guid_work *rec) { int err; struct mlx4_ib_dev *dev = to_mdev(ibdev); @@ -354,6 +488,9 @@ static int set_guid_rec(struct ib_device *ibdev, struct ib_port_attr attr; struct mlx4_alias_guid_work_context *callback_context; unsigned long resched_delay, flags, flags1; + u8 port = rec->port + 1; + int index = rec->block_num; + struct mlx4_sriov_alias_guid_info_rec_det *rec_det = &rec->rec_det; struct list_head *head = &dev->sriov.alias_guid.ports_guid[port - 1].cb_list; @@ -380,6 +517,8 @@ static int set_guid_rec(struct ib_device *ibdev, callback_context->port = port; callback_context->dev = dev; callback_context->block_num = index; + callback_context->guid_indexes = rec_det->guid_indexes; + callback_context->method = rec->method; memset(&guid_info_rec, 0, sizeof (struct ib_sa_guidinfo_rec)); @@ -399,7 +538,7 @@ static int set_guid_rec(struct ib_device *ibdev, callback_context->query_id = ib_sa_guid_info_rec_query(dev->sriov.alias_guid.sa_client, ibdev, port, &guid_info_rec, - comp_mask, rec_det->method, 1000, + comp_mask, rec->method, 1000, GFP_KERNEL, aliasguid_query_handler, callback_context, &callback_context->sa_query); @@ -434,6 +573,30 @@ out: return err; } +static void mlx4_ib_guid_port_init(struct mlx4_ib_dev *dev, int port) +{ + int j, k, entry; + __be64 guid; + + /*Check if the SM doesn't need to assign the GUIDs*/ + for (j = 0; j < NUM_ALIAS_GUID_REC_IN_PORT; j++) { + for (k = 0; k < NUM_ALIAS_GUID_IN_REC; k++) { + entry = j * NUM_ALIAS_GUID_IN_REC + k; + /* no request for the 0 entry (hw guid) */ + if (!entry || entry > dev->dev->persist->num_vfs || + !mlx4_is_slave_active(dev->dev, entry)) + continue; + guid = mlx4_get_admin_guid(dev->dev, entry, port); + *(__be64 *)&dev->sriov.alias_guid.ports_guid[port - 1]. + all_rec_per_port[j].all_recs + [GUID_REC_SIZE * k] = guid; + pr_debug("guid was set, entry=%d, val=0x%llx, port=%d\n", + entry, + be64_to_cpu(guid), + port); + } + } +} void mlx4_ib_invalidate_all_guid_record(struct mlx4_ib_dev *dev, int port) { int i; @@ -443,6 +606,13 @@ void mlx4_ib_invalidate_all_guid_record(struct mlx4_ib_dev *dev, int port) spin_lock_irqsave(&dev->sriov.going_down_lock, flags); spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1); + + if (dev->sriov.alias_guid.ports_guid[port - 1].state_flags & + GUID_STATE_NEED_PORT_INIT) { + mlx4_ib_guid_port_init(dev, port); + dev->sriov.alias_guid.ports_guid[port - 1].state_flags &= + (~GUID_STATE_NEED_PORT_INIT); + } for (i = 0; i < NUM_ALIAS_GUID_REC_IN_PORT; i++) invalidate_guid_record(dev, port, i); @@ -462,60 +632,107 @@ void mlx4_ib_invalidate_all_guid_record(struct mlx4_ib_dev *dev, int port) spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags); } -/* The function returns the next record that was - * not configured (or failed to be configured) */ -static int get_next_record_to_update(struct mlx4_ib_dev *dev, u8 port, - struct mlx4_next_alias_guid_work *rec) +static void set_required_record(struct mlx4_ib_dev *dev, u8 port, + struct mlx4_next_alias_guid_work *next_rec, + int record_index) { - int j; - unsigned long flags; + int i; + int lowset_time_entry = -1; + int lowest_time = 0; + ib_sa_comp_mask delete_guid_indexes = 0; + ib_sa_comp_mask set_guid_indexes = 0; + struct mlx4_sriov_alias_guid_info_rec_det *rec = + &dev->sriov.alias_guid.ports_guid[port]. + all_rec_per_port[record_index]; - for (j = 0; j < NUM_ALIAS_GUID_REC_IN_PORT; j++) { - spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags); - if (dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[j].status == - MLX4_GUID_INFO_STATUS_IDLE) { - memcpy(&rec->rec_det, - &dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[j], - sizeof (struct mlx4_sriov_alias_guid_info_rec_det)); - rec->port = port; - rec->block_num = j; - dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[j].status = - MLX4_GUID_INFO_STATUS_PENDING; - spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags); - return 0; + for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) { + if (!(rec->guid_indexes & + mlx4_ib_get_aguid_comp_mask_from_ix(i))) + continue; + + if (*(__be64 *)&rec->all_recs[i * GUID_REC_SIZE] == + cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL)) + delete_guid_indexes |= + mlx4_ib_get_aguid_comp_mask_from_ix(i); + else + set_guid_indexes |= + mlx4_ib_get_aguid_comp_mask_from_ix(i); + + if (lowset_time_entry == -1 || rec->guids_retry_schedule[i] <= + lowest_time) { + lowset_time_entry = i; + lowest_time = rec->guids_retry_schedule[i]; } - spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags); } - return -ENOENT; + + memcpy(&next_rec->rec_det, rec, sizeof(*rec)); + next_rec->port = port; + next_rec->block_num = record_index; + + if (*(__be64 *)&rec->all_recs[lowset_time_entry * GUID_REC_SIZE] == + cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL)) { + next_rec->rec_det.guid_indexes = delete_guid_indexes; + next_rec->method = MLX4_GUID_INFO_RECORD_DELETE; + } else { + next_rec->rec_det.guid_indexes = set_guid_indexes; + next_rec->method = MLX4_GUID_INFO_RECORD_SET; + } } -static void set_administratively_guid_record(struct mlx4_ib_dev *dev, int port, - int rec_index, - struct mlx4_sriov_alias_guid_info_rec_det *rec_det) +/* return index of record that should be updated based on lowest + * rescheduled time + */ +static int get_low_record_time_index(struct mlx4_ib_dev *dev, u8 port, + int *resched_delay_sec) { - dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[rec_index].guid_indexes = - rec_det->guid_indexes; - memcpy(dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[rec_index].all_recs, - rec_det->all_recs, NUM_ALIAS_GUID_IN_REC * GUID_REC_SIZE); - dev->sriov.alias_guid.ports_guid[port].all_rec_per_port[rec_index].status = - rec_det->status; + int record_index = -1; + u64 low_record_time = 0; + struct mlx4_sriov_alias_guid_info_rec_det rec; + int j; + + for (j = 0; j < NUM_ALIAS_GUID_REC_IN_PORT; j++) { + rec = dev->sriov.alias_guid.ports_guid[port]. + all_rec_per_port[j]; + if (rec.status == MLX4_GUID_INFO_STATUS_IDLE && + rec.guid_indexes) { + if (record_index == -1 || + rec.time_to_run < low_record_time) { + record_index = j; + low_record_time = rec.time_to_run; + } + } + } + if (resched_delay_sec) { + u64 curr_time = ktime_get_real_ns(); + + *resched_delay_sec = (low_record_time < curr_time) ? 0 : + div_u64((low_record_time - curr_time), NSEC_PER_SEC); + } + + return record_index; } -static void set_all_slaves_guids(struct mlx4_ib_dev *dev, int port) +/* The function returns the next record that was + * not configured (or failed to be configured) */ +static int get_next_record_to_update(struct mlx4_ib_dev *dev, u8 port, + struct mlx4_next_alias_guid_work *rec) { - int j; - struct mlx4_sriov_alias_guid_info_rec_det rec_det ; - - for (j = 0 ; j < NUM_ALIAS_GUID_REC_IN_PORT ; j++) { - memset(rec_det.all_recs, 0, NUM_ALIAS_GUID_IN_REC * GUID_REC_SIZE); - rec_det.guid_indexes = (!j ? 0 : IB_SA_GUIDINFO_REC_GID0) | - IB_SA_GUIDINFO_REC_GID1 | IB_SA_GUIDINFO_REC_GID2 | - IB_SA_GUIDINFO_REC_GID3 | IB_SA_GUIDINFO_REC_GID4 | - IB_SA_GUIDINFO_REC_GID5 | IB_SA_GUIDINFO_REC_GID6 | - IB_SA_GUIDINFO_REC_GID7; - rec_det.status = MLX4_GUID_INFO_STATUS_IDLE; - set_administratively_guid_record(dev, port, j, &rec_det); + unsigned long flags; + int record_index; + int ret = 0; + + spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags); + record_index = get_low_record_time_index(dev, port, NULL); + + if (record_index < 0) { + ret = -ENOENT; + goto out; } + + set_required_record(dev, port, rec, record_index); +out: + spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags); + return ret; } static void alias_guid_work(struct work_struct *work) @@ -545,9 +762,7 @@ static void alias_guid_work(struct work_struct *work) goto out; } - set_guid_rec(&dev->ib_dev, rec->port + 1, rec->block_num, - &rec->rec_det); - + set_guid_rec(&dev->ib_dev, rec); out: kfree(rec); } @@ -562,6 +777,12 @@ void mlx4_ib_init_alias_guid_work(struct mlx4_ib_dev *dev, int port) spin_lock_irqsave(&dev->sriov.going_down_lock, flags); spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1); if (!dev->sriov.is_going_down) { + /* If there is pending one should cancell then run, otherwise + * won't run till previous one is ended as same work + * struct is used. + */ + cancel_delayed_work(&dev->sriov.alias_guid.ports_guid[port]. + alias_guid_work); queue_delayed_work(dev->sriov.alias_guid.ports_guid[port].wq, &dev->sriov.alias_guid.ports_guid[port].alias_guid_work, 0); } @@ -609,7 +830,7 @@ int mlx4_ib_init_alias_guid_service(struct mlx4_ib_dev *dev) { char alias_wq_name[15]; int ret = 0; - int i, j, k; + int i, j; union ib_gid gid; if (!mlx4_is_master(dev->dev)) @@ -633,33 +854,25 @@ int mlx4_ib_init_alias_guid_service(struct mlx4_ib_dev *dev) for (i = 0 ; i < dev->num_ports; i++) { memset(&dev->sriov.alias_guid.ports_guid[i], 0, sizeof (struct mlx4_sriov_alias_guid_port_rec_det)); - /*Check if the SM doesn't need to assign the GUIDs*/ + dev->sriov.alias_guid.ports_guid[i].state_flags |= + GUID_STATE_NEED_PORT_INIT; for (j = 0; j < NUM_ALIAS_GUID_REC_IN_PORT; j++) { - if (mlx4_ib_sm_guid_assign) { - dev->sriov.alias_guid.ports_guid[i]. - all_rec_per_port[j]. - ownership = MLX4_GUID_DRIVER_ASSIGN; - continue; - } - dev->sriov.alias_guid.ports_guid[i].all_rec_per_port[j]. - ownership = MLX4_GUID_NONE_ASSIGN; - /*mark each val as it was deleted, - till the sysAdmin will give it valid val*/ - for (k = 0; k < NUM_ALIAS_GUID_IN_REC; k++) { - *(__be64 *)&dev->sriov.alias_guid.ports_guid[i]. - all_rec_per_port[j].all_recs[GUID_REC_SIZE * k] = - cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL); - } + /* mark each val as it was deleted */ + memset(dev->sriov.alias_guid.ports_guid[i]. + all_rec_per_port[j].all_recs, 0xFF, + sizeof(dev->sriov.alias_guid.ports_guid[i]. + all_rec_per_port[j].all_recs)); } INIT_LIST_HEAD(&dev->sriov.alias_guid.ports_guid[i].cb_list); /*prepare the records, set them to be allocated by sm*/ + if (mlx4_ib_sm_guid_assign) + for (j = 1; j < NUM_ALIAS_GUID_PER_PORT; j++) + mlx4_set_admin_guid(dev->dev, 0, j, i + 1); for (j = 0 ; j < NUM_ALIAS_GUID_REC_IN_PORT; j++) invalidate_guid_record(dev, i + 1, j); dev->sriov.alias_guid.ports_guid[i].parent = &dev->sriov.alias_guid; dev->sriov.alias_guid.ports_guid[i].port = i; - if (mlx4_ib_sm_guid_assign) - set_all_slaves_guids(dev, i); snprintf(alias_wq_name, sizeof alias_wq_name, "alias_guid%d", i); dev->sriov.alias_guid.ports_guid[i].wq = diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index 59040265e361..9cd2b002d7ae 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -1430,6 +1430,10 @@ static int mlx4_ib_alloc_pv_bufs(struct mlx4_ib_demux_pv_ctx *ctx, tun_qp->ring[i].addr, rx_buf_size, DMA_FROM_DEVICE); + if (ib_dma_mapping_error(ctx->ib_dev, tun_qp->ring[i].map)) { + kfree(tun_qp->ring[i].addr); + goto err; + } } for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) { @@ -1442,6 +1446,11 @@ static int mlx4_ib_alloc_pv_bufs(struct mlx4_ib_demux_pv_ctx *ctx, tun_qp->tx_ring[i].buf.addr, tx_buf_size, DMA_TO_DEVICE); + if (ib_dma_mapping_error(ctx->ib_dev, + tun_qp->tx_ring[i].buf.map)) { + kfree(tun_qp->tx_ring[i].buf.addr); + goto tx_err; + } tun_qp->tx_ring[i].ah = NULL; } spin_lock_init(&tun_qp->tx_lock); diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 976bea794b5f..57070c529dfb 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -66,9 +66,9 @@ MODULE_DESCRIPTION("Mellanox ConnectX HCA InfiniBand driver"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_VERSION(DRV_VERSION); -int mlx4_ib_sm_guid_assign = 1; +int mlx4_ib_sm_guid_assign = 0; module_param_named(sm_guid_assign, mlx4_ib_sm_guid_assign, int, 0444); -MODULE_PARM_DESC(sm_guid_assign, "Enable SM alias_GUID assignment if sm_guid_assign > 0 (Default: 1)"); +MODULE_PARM_DESC(sm_guid_assign, "Enable SM alias_GUID assignment if sm_guid_assign > 0 (Default: 0)"); static const char mlx4_ib_version[] = DRV_NAME ": Mellanox ConnectX InfiniBand driver v" @@ -2791,9 +2791,31 @@ static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr, case MLX4_DEV_EVENT_SLAVE_INIT: /* here, p is the slave id */ do_slave_init(ibdev, p, 1); + if (mlx4_is_master(dev)) { + int i; + + for (i = 1; i <= ibdev->num_ports; i++) { + if (rdma_port_get_link_layer(&ibdev->ib_dev, i) + == IB_LINK_LAYER_INFINIBAND) + mlx4_ib_slave_alias_guid_event(ibdev, + p, i, + 1); + } + } return; case MLX4_DEV_EVENT_SLAVE_SHUTDOWN: + if (mlx4_is_master(dev)) { + int i; + + for (i = 1; i <= ibdev->num_ports; i++) { + if (rdma_port_get_link_layer(&ibdev->ib_dev, i) + == IB_LINK_LAYER_INFINIBAND) + mlx4_ib_slave_alias_guid_event(ibdev, + p, i, + 0); + } + } /* here, p is the slave id */ do_slave_init(ibdev, p, 0); return; diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index f829fd935b79..fce3934372a1 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -342,14 +342,9 @@ struct mlx4_ib_ah { enum mlx4_guid_alias_rec_status { MLX4_GUID_INFO_STATUS_IDLE, MLX4_GUID_INFO_STATUS_SET, - MLX4_GUID_INFO_STATUS_PENDING, }; -enum mlx4_guid_alias_rec_ownership { - MLX4_GUID_DRIVER_ASSIGN, - MLX4_GUID_SYSADMIN_ASSIGN, - MLX4_GUID_NONE_ASSIGN, /*init state of each record*/ -}; +#define GUID_STATE_NEED_PORT_INIT 0x01 enum mlx4_guid_alias_rec_method { MLX4_GUID_INFO_RECORD_SET = IB_MGMT_METHOD_SET, @@ -360,8 +355,8 @@ struct mlx4_sriov_alias_guid_info_rec_det { u8 all_recs[GUID_REC_SIZE * NUM_ALIAS_GUID_IN_REC]; ib_sa_comp_mask guid_indexes; /*indicates what from the 8 records are valid*/ enum mlx4_guid_alias_rec_status status; /*indicates the administraively status of the record.*/ - u8 method; /*set or delete*/ - enum mlx4_guid_alias_rec_ownership ownership; /*indicates who assign that alias_guid record*/ + unsigned int guids_retry_schedule[NUM_ALIAS_GUID_IN_REC]; + u64 time_to_run; }; struct mlx4_sriov_alias_guid_port_rec_det { @@ -369,6 +364,7 @@ struct mlx4_sriov_alias_guid_port_rec_det { struct workqueue_struct *wq; struct delayed_work alias_guid_work; u8 port; + u32 state_flags; struct mlx4_sriov_alias_guid *parent; struct list_head cb_list; }; @@ -802,6 +798,8 @@ int add_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num, void del_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num, struct attribute *attr); ib_sa_comp_mask mlx4_ib_get_aguid_comp_mask_from_ix(int index); +void mlx4_ib_slave_alias_guid_event(struct mlx4_ib_dev *dev, int slave, + int port, int slave_init); int mlx4_ib_device_register_sysfs(struct mlx4_ib_dev *device) ; diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index ed2bd6701f9b..02fc91c68027 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -566,6 +566,10 @@ static int alloc_proxy_bufs(struct ib_device *dev, struct mlx4_ib_qp *qp) ib_dma_map_single(dev, qp->sqp_proxy_rcv[i].addr, sizeof (struct mlx4_ib_proxy_sqp_hdr), DMA_FROM_DEVICE); + if (ib_dma_mapping_error(dev, qp->sqp_proxy_rcv[i].map)) { + kfree(qp->sqp_proxy_rcv[i].addr); + goto err; + } } return 0; @@ -2605,8 +2609,7 @@ static int build_lso_seg(struct mlx4_wqe_lso_seg *wqe, struct ib_send_wr *wr, memcpy(wqe->header, wr->wr.ud.header, wr->wr.ud.hlen); - *lso_hdr_sz = cpu_to_be32((wr->wr.ud.mss - wr->wr.ud.hlen) << 16 | - wr->wr.ud.hlen); + *lso_hdr_sz = cpu_to_be32(wr->wr.ud.mss << 16 | wr->wr.ud.hlen); *lso_seg_len = halign; return 0; } diff --git a/drivers/infiniband/hw/mlx4/sysfs.c b/drivers/infiniband/hw/mlx4/sysfs.c index d10c2b8a5dad..6797108ce873 100644 --- a/drivers/infiniband/hw/mlx4/sysfs.c +++ b/drivers/infiniband/hw/mlx4/sysfs.c @@ -46,21 +46,17 @@ static ssize_t show_admin_alias_guid(struct device *dev, struct device_attribute *attr, char *buf) { - int record_num;/*0-15*/ - int guid_index_in_rec; /*0 - 7*/ struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry = container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry); struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx; struct mlx4_ib_dev *mdev = port->dev; + __be64 sysadmin_ag_val; - record_num = mlx4_ib_iov_dentry->entry_num / 8 ; - guid_index_in_rec = mlx4_ib_iov_dentry->entry_num % 8 ; + sysadmin_ag_val = mlx4_get_admin_guid(mdev->dev, + mlx4_ib_iov_dentry->entry_num, + port->num); - return sprintf(buf, "%llx\n", - be64_to_cpu(*(__be64 *)&mdev->sriov.alias_guid. - ports_guid[port->num - 1]. - all_rec_per_port[record_num]. - all_recs[8 * guid_index_in_rec])); + return sprintf(buf, "%llx\n", be64_to_cpu(sysadmin_ag_val)); } /* store_admin_alias_guid stores the (new) administratively assigned value of that GUID. @@ -80,6 +76,7 @@ static ssize_t store_admin_alias_guid(struct device *dev, struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx; struct mlx4_ib_dev *mdev = port->dev; u64 sysadmin_ag_val; + unsigned long flags; record_num = mlx4_ib_iov_dentry->entry_num / 8; guid_index_in_rec = mlx4_ib_iov_dentry->entry_num % 8; @@ -87,6 +84,7 @@ static ssize_t store_admin_alias_guid(struct device *dev, pr_err("GUID 0 block 0 is RO\n"); return count; } + spin_lock_irqsave(&mdev->sriov.alias_guid.ag_work_lock, flags); sscanf(buf, "%llx", &sysadmin_ag_val); *(__be64 *)&mdev->sriov.alias_guid.ports_guid[port->num - 1]. all_rec_per_port[record_num]. @@ -96,33 +94,15 @@ static ssize_t store_admin_alias_guid(struct device *dev, /* Change the state to be pending for update */ mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].status = MLX4_GUID_INFO_STATUS_IDLE ; - - mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].method - = MLX4_GUID_INFO_RECORD_SET; - - switch (sysadmin_ag_val) { - case MLX4_GUID_FOR_DELETE_VAL: - mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].method - = MLX4_GUID_INFO_RECORD_DELETE; - mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].ownership - = MLX4_GUID_SYSADMIN_ASSIGN; - break; - /* The sysadmin requests the SM to re-assign */ - case MLX4_NOT_SET_GUID: - mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].ownership - = MLX4_GUID_DRIVER_ASSIGN; - break; - /* The sysadmin requests a specific value.*/ - default: - mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].ownership - = MLX4_GUID_SYSADMIN_ASSIGN; - break; - } + mlx4_set_admin_guid(mdev->dev, cpu_to_be64(sysadmin_ag_val), + mlx4_ib_iov_dentry->entry_num, + port->num); /* set the record index */ mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].guid_indexes - = mlx4_ib_get_aguid_comp_mask_from_ix(guid_index_in_rec); + |= mlx4_ib_get_aguid_comp_mask_from_ix(guid_index_in_rec); + spin_unlock_irqrestore(&mdev->sriov.alias_guid.ag_work_lock, flags); mlx4_ib_init_alias_guid_work(mdev, port->num - 1); return count; diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index d7562beb5423..bd94b0a6e9e5 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -87,7 +87,6 @@ enum { IPOIB_FLAG_ADMIN_UP = 2, IPOIB_PKEY_ASSIGNED = 3, IPOIB_FLAG_SUBINTERFACE = 5, - IPOIB_MCAST_RUN = 6, IPOIB_STOP_REAPER = 7, IPOIB_FLAG_ADMIN_CM = 9, IPOIB_FLAG_UMCAST = 10, @@ -98,9 +97,15 @@ enum { IPOIB_MCAST_FLAG_FOUND = 0, /* used in set_multicast_list */ IPOIB_MCAST_FLAG_SENDONLY = 1, - IPOIB_MCAST_FLAG_BUSY = 2, /* joining or already joined */ + /* + * For IPOIB_MCAST_FLAG_BUSY + * When set, in flight join and mcast->mc is unreliable + * When clear and mcast->mc IS_ERR_OR_NULL, need to restart or + * haven't started yet + * When clear and mcast->mc is valid pointer, join was successful + */ + IPOIB_MCAST_FLAG_BUSY = 2, IPOIB_MCAST_FLAG_ATTACHED = 3, - IPOIB_MCAST_JOIN_STARTED = 4, MAX_SEND_CQE = 16, IPOIB_CM_COPYBREAK = 256, @@ -148,6 +153,7 @@ struct ipoib_mcast { unsigned long created; unsigned long backoff; + unsigned long delay_until; unsigned long flags; unsigned char logcount; @@ -292,6 +298,11 @@ struct ipoib_neigh_table { struct completion deleted; }; +struct ipoib_qp_state_validate { + struct work_struct work; + struct ipoib_dev_priv *priv; +}; + /* * Device private locking: network stack tx_lock protects members used * in TX fast path, lock protects everything else. lock nests inside @@ -317,6 +328,7 @@ struct ipoib_dev_priv { struct list_head multicast_list; struct rb_root multicast_tree; + struct workqueue_struct *wq; struct delayed_work mcast_task; struct work_struct carrier_on_task; struct work_struct flush_light; @@ -426,11 +438,6 @@ struct ipoib_neigh { #define IPOIB_UD_MTU(ib_mtu) (ib_mtu - IPOIB_ENCAP_LEN) #define IPOIB_UD_BUF_SIZE(ib_mtu) (ib_mtu + IB_GRH_BYTES) -static inline int ipoib_ud_need_sg(unsigned int ib_mtu) -{ - return IPOIB_UD_BUF_SIZE(ib_mtu) > PAGE_SIZE; -} - void ipoib_neigh_dtor(struct ipoib_neigh *neigh); static inline void ipoib_neigh_put(struct ipoib_neigh *neigh) { @@ -477,10 +484,10 @@ void ipoib_ib_dev_flush_heavy(struct work_struct *work); void ipoib_pkey_event(struct work_struct *work); void ipoib_ib_dev_cleanup(struct net_device *dev); -int ipoib_ib_dev_open(struct net_device *dev, int flush); +int ipoib_ib_dev_open(struct net_device *dev); int ipoib_ib_dev_up(struct net_device *dev); -int ipoib_ib_dev_down(struct net_device *dev, int flush); -int ipoib_ib_dev_stop(struct net_device *dev, int flush); +int ipoib_ib_dev_down(struct net_device *dev); +int ipoib_ib_dev_stop(struct net_device *dev); void ipoib_pkey_dev_check_presence(struct net_device *dev); int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port); @@ -492,7 +499,7 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb); void ipoib_mcast_restart_task(struct work_struct *work); int ipoib_mcast_start_thread(struct net_device *dev); -int ipoib_mcast_stop_thread(struct net_device *dev, int flush); +int ipoib_mcast_stop_thread(struct net_device *dev); void ipoib_mcast_dev_down(struct net_device *dev); void ipoib_mcast_dev_flush(struct net_device *dev); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 933efcea0d03..56959adb6c7d 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -474,7 +474,7 @@ static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *even } spin_lock_irq(&priv->lock); - queue_delayed_work(ipoib_workqueue, + queue_delayed_work(priv->wq, &priv->cm.stale_task, IPOIB_CM_RX_DELAY); /* Add this entry to passive ids list head, but do not re-add it * if IB_EVENT_QP_LAST_WQE_REACHED has moved it to flush list. */ @@ -576,7 +576,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) spin_lock_irqsave(&priv->lock, flags); list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list); ipoib_cm_start_rx_drain(priv); - queue_work(ipoib_workqueue, &priv->cm.rx_reap_task); + queue_work(priv->wq, &priv->cm.rx_reap_task); spin_unlock_irqrestore(&priv->lock, flags); } else ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n", @@ -603,7 +603,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) spin_lock_irqsave(&priv->lock, flags); list_move(&p->list, &priv->cm.rx_reap_list); spin_unlock_irqrestore(&priv->lock, flags); - queue_work(ipoib_workqueue, &priv->cm.rx_reap_task); + queue_work(priv->wq, &priv->cm.rx_reap_task); } return; } @@ -827,7 +827,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { list_move(&tx->list, &priv->cm.reap_list); - queue_work(ipoib_workqueue, &priv->cm.reap_task); + queue_work(priv->wq, &priv->cm.reap_task); } clear_bit(IPOIB_FLAG_OPER_UP, &tx->flags); @@ -1255,7 +1255,7 @@ static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id, if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { list_move(&tx->list, &priv->cm.reap_list); - queue_work(ipoib_workqueue, &priv->cm.reap_task); + queue_work(priv->wq, &priv->cm.reap_task); } spin_unlock_irqrestore(&priv->lock, flags); @@ -1284,7 +1284,7 @@ struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path tx->dev = dev; list_add(&tx->list, &priv->cm.start_list); set_bit(IPOIB_FLAG_INITIALIZED, &tx->flags); - queue_work(ipoib_workqueue, &priv->cm.start_task); + queue_work(priv->wq, &priv->cm.start_task); return tx; } @@ -1295,7 +1295,7 @@ void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx) if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) { spin_lock_irqsave(&priv->lock, flags); list_move(&tx->list, &priv->cm.reap_list); - queue_work(ipoib_workqueue, &priv->cm.reap_task); + queue_work(priv->wq, &priv->cm.reap_task); ipoib_dbg(priv, "Reap connection for gid %pI6\n", tx->neigh->daddr + 4); tx->neigh = NULL; @@ -1417,7 +1417,7 @@ void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb, skb_queue_tail(&priv->cm.skb_queue, skb); if (e) - queue_work(ipoib_workqueue, &priv->cm.skb_task); + queue_work(priv->wq, &priv->cm.skb_task); } static void ipoib_cm_rx_reap(struct work_struct *work) @@ -1450,7 +1450,7 @@ static void ipoib_cm_stale_task(struct work_struct *work) } if (!list_empty(&priv->cm.passive_ids)) - queue_delayed_work(ipoib_workqueue, + queue_delayed_work(priv->wq, &priv->cm.stale_task, IPOIB_CM_RX_DELAY); spin_unlock_irq(&priv->lock); } diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 72626c348174..63b92cbb29ad 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -94,39 +94,9 @@ void ipoib_free_ah(struct kref *kref) static void ipoib_ud_dma_unmap_rx(struct ipoib_dev_priv *priv, u64 mapping[IPOIB_UD_RX_SG]) { - if (ipoib_ud_need_sg(priv->max_ib_mtu)) { - ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_UD_HEAD_SIZE, - DMA_FROM_DEVICE); - ib_dma_unmap_page(priv->ca, mapping[1], PAGE_SIZE, - DMA_FROM_DEVICE); - } else - ib_dma_unmap_single(priv->ca, mapping[0], - IPOIB_UD_BUF_SIZE(priv->max_ib_mtu), - DMA_FROM_DEVICE); -} - -static void ipoib_ud_skb_put_frags(struct ipoib_dev_priv *priv, - struct sk_buff *skb, - unsigned int length) -{ - if (ipoib_ud_need_sg(priv->max_ib_mtu)) { - skb_frag_t *frag = &skb_shinfo(skb)->frags[0]; - unsigned int size; - /* - * There is only two buffers needed for max_payload = 4K, - * first buf size is IPOIB_UD_HEAD_SIZE - */ - skb->tail += IPOIB_UD_HEAD_SIZE; - skb->len += length; - - size = length - IPOIB_UD_HEAD_SIZE; - - skb_frag_size_set(frag, size); - skb->data_len += size; - skb->truesize += PAGE_SIZE; - } else - skb_put(skb, length); - + ib_dma_unmap_single(priv->ca, mapping[0], + IPOIB_UD_BUF_SIZE(priv->max_ib_mtu), + DMA_FROM_DEVICE); } static int ipoib_ib_post_receive(struct net_device *dev, int id) @@ -156,18 +126,11 @@ static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id) struct ipoib_dev_priv *priv = netdev_priv(dev); struct sk_buff *skb; int buf_size; - int tailroom; u64 *mapping; - if (ipoib_ud_need_sg(priv->max_ib_mtu)) { - buf_size = IPOIB_UD_HEAD_SIZE; - tailroom = 128; /* reserve some tailroom for IP/TCP headers */ - } else { - buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu); - tailroom = 0; - } + buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu); - skb = dev_alloc_skb(buf_size + tailroom + 4); + skb = dev_alloc_skb(buf_size + IPOIB_ENCAP_LEN); if (unlikely(!skb)) return NULL; @@ -184,23 +147,8 @@ static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id) if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0]))) goto error; - if (ipoib_ud_need_sg(priv->max_ib_mtu)) { - struct page *page = alloc_page(GFP_ATOMIC); - if (!page) - goto partial_error; - skb_fill_page_desc(skb, 0, page, 0, PAGE_SIZE); - mapping[1] = - ib_dma_map_page(priv->ca, page, - 0, PAGE_SIZE, DMA_FROM_DEVICE); - if (unlikely(ib_dma_mapping_error(priv->ca, mapping[1]))) - goto partial_error; - } - priv->rx_ring[id].skb = skb; return skb; - -partial_error: - ib_dma_unmap_single(priv->ca, mapping[0], buf_size, DMA_FROM_DEVICE); error: dev_kfree_skb_any(skb); return NULL; @@ -278,7 +226,8 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) wc->byte_len, wc->slid); ipoib_ud_dma_unmap_rx(priv, mapping); - ipoib_ud_skb_put_frags(priv, skb, wc->byte_len); + + skb_put(skb, wc->byte_len); /* First byte of dgid signals multicast when 0xff */ dgid = &((struct ib_grh *)skb->data)->dgid; @@ -296,6 +245,8 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) skb_reset_mac_header(skb); skb_pull(skb, IPOIB_ENCAP_LEN); + skb->truesize = SKB_TRUESIZE(skb->len); + ++dev->stats.rx_packets; dev->stats.rx_bytes += skb->len; @@ -376,6 +327,51 @@ static void ipoib_dma_unmap_tx(struct ib_device *ca, } } +/* + * As the result of a completion error the QP Can be transferred to SQE states. + * The function checks if the (send)QP is in SQE state and + * moves it back to RTS state, that in order to have it functional again. + */ +static void ipoib_qp_state_validate_work(struct work_struct *work) +{ + struct ipoib_qp_state_validate *qp_work = + container_of(work, struct ipoib_qp_state_validate, work); + + struct ipoib_dev_priv *priv = qp_work->priv; + struct ib_qp_attr qp_attr; + struct ib_qp_init_attr query_init_attr; + int ret; + + ret = ib_query_qp(priv->qp, &qp_attr, IB_QP_STATE, &query_init_attr); + if (ret) { + ipoib_warn(priv, "%s: Failed to query QP ret: %d\n", + __func__, ret); + goto free_res; + } + pr_info("%s: QP: 0x%x is in state: %d\n", + __func__, priv->qp->qp_num, qp_attr.qp_state); + + /* currently support only in SQE->RTS transition*/ + if (qp_attr.qp_state == IB_QPS_SQE) { + qp_attr.qp_state = IB_QPS_RTS; + + ret = ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE); + if (ret) { + pr_warn("failed(%d) modify QP:0x%x SQE->RTS\n", + ret, priv->qp->qp_num); + goto free_res; + } + pr_info("%s: QP: 0x%x moved from IB_QPS_SQE to IB_QPS_RTS\n", + __func__, priv->qp->qp_num); + } else { + pr_warn("QP (%d) will stay in state: %d\n", + priv->qp->qp_num, qp_attr.qp_state); + } + +free_res: + kfree(qp_work); +} + static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) { struct ipoib_dev_priv *priv = netdev_priv(dev); @@ -407,10 +403,22 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) netif_wake_queue(dev); if (wc->status != IB_WC_SUCCESS && - wc->status != IB_WC_WR_FLUSH_ERR) + wc->status != IB_WC_WR_FLUSH_ERR) { + struct ipoib_qp_state_validate *qp_work; ipoib_warn(priv, "failed send event " "(status=%d, wrid=%d vend_err %x)\n", wc->status, wr_id, wc->vendor_err); + qp_work = kzalloc(sizeof(*qp_work), GFP_ATOMIC); + if (!qp_work) { + ipoib_warn(priv, "%s Failed alloc ipoib_qp_state_validate for qp: 0x%x\n", + __func__, priv->qp->qp_num); + return; + } + + INIT_WORK(&qp_work->work, ipoib_qp_state_validate_work); + qp_work->priv = priv; + queue_work(priv->wq, &qp_work->work); + } } static int poll_tx(struct ipoib_dev_priv *priv) @@ -655,16 +663,33 @@ void ipoib_reap_ah(struct work_struct *work) __ipoib_reap_ah(dev); if (!test_bit(IPOIB_STOP_REAPER, &priv->flags)) - queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, + queue_delayed_work(priv->wq, &priv->ah_reap_task, round_jiffies_relative(HZ)); } +static void ipoib_flush_ah(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + cancel_delayed_work(&priv->ah_reap_task); + flush_workqueue(priv->wq); + ipoib_reap_ah(&priv->ah_reap_task.work); +} + +static void ipoib_stop_ah(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + + set_bit(IPOIB_STOP_REAPER, &priv->flags); + ipoib_flush_ah(dev); +} + static void ipoib_ib_tx_timer_func(unsigned long ctx) { drain_tx_cq((struct net_device *)ctx); } -int ipoib_ib_dev_open(struct net_device *dev, int flush) +int ipoib_ib_dev_open(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); int ret; @@ -696,7 +721,7 @@ int ipoib_ib_dev_open(struct net_device *dev, int flush) } clear_bit(IPOIB_STOP_REAPER, &priv->flags); - queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, + queue_delayed_work(priv->wq, &priv->ah_reap_task, round_jiffies_relative(HZ)); if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) @@ -706,7 +731,7 @@ int ipoib_ib_dev_open(struct net_device *dev, int flush) dev_stop: if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) napi_enable(&priv->napi); - ipoib_ib_dev_stop(dev, flush); + ipoib_ib_dev_stop(dev); return -1; } @@ -738,7 +763,7 @@ int ipoib_ib_dev_up(struct net_device *dev) return ipoib_mcast_start_thread(dev); } -int ipoib_ib_dev_down(struct net_device *dev, int flush) +int ipoib_ib_dev_down(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); @@ -747,7 +772,7 @@ int ipoib_ib_dev_down(struct net_device *dev, int flush) clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags); netif_carrier_off(dev); - ipoib_mcast_stop_thread(dev, flush); + ipoib_mcast_stop_thread(dev); ipoib_mcast_dev_flush(dev); ipoib_flush_paths(dev); @@ -807,7 +832,7 @@ void ipoib_drain_cq(struct net_device *dev) local_bh_enable(); } -int ipoib_ib_dev_stop(struct net_device *dev, int flush) +int ipoib_ib_dev_stop(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); struct ib_qp_attr qp_attr; @@ -877,24 +902,7 @@ timeout: if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE)) ipoib_warn(priv, "Failed to modify QP to RESET state\n"); - /* Wait for all AHs to be reaped */ - set_bit(IPOIB_STOP_REAPER, &priv->flags); - cancel_delayed_work(&priv->ah_reap_task); - if (flush) - flush_workqueue(ipoib_workqueue); - - begin = jiffies; - - while (!list_empty(&priv->dead_ahs)) { - __ipoib_reap_ah(dev); - - if (time_after(jiffies, begin + HZ)) { - ipoib_warn(priv, "timing out; will leak address handles\n"); - break; - } - - msleep(1); - } + ipoib_flush_ah(dev); ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP); @@ -918,7 +926,7 @@ int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port) (unsigned long) dev); if (dev->flags & IFF_UP) { - if (ipoib_ib_dev_open(dev, 1)) { + if (ipoib_ib_dev_open(dev)) { ipoib_transport_dev_cleanup(dev); return -ENODEV; } @@ -1037,15 +1045,16 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, if (level == IPOIB_FLUSH_LIGHT) { ipoib_mark_paths_invalid(dev); ipoib_mcast_dev_flush(dev); + ipoib_flush_ah(dev); } if (level >= IPOIB_FLUSH_NORMAL) - ipoib_ib_dev_down(dev, 0); + ipoib_ib_dev_down(dev); if (level == IPOIB_FLUSH_HEAVY) { if (test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) - ipoib_ib_dev_stop(dev, 0); - if (ipoib_ib_dev_open(dev, 0) != 0) + ipoib_ib_dev_stop(dev); + if (ipoib_ib_dev_open(dev) != 0) return; if (netif_queue_stopped(dev)) netif_start_queue(dev); @@ -1097,9 +1106,17 @@ void ipoib_ib_dev_cleanup(struct net_device *dev) */ ipoib_flush_paths(dev); - ipoib_mcast_stop_thread(dev, 1); + ipoib_mcast_stop_thread(dev); ipoib_mcast_dev_flush(dev); + /* + * All of our ah references aren't free until after + * ipoib_mcast_dev_flush(), ipoib_flush_paths, and + * the neighbor garbage collection is stopped and reaped. + * That should all be done now, so make a final ah flush. + */ + ipoib_stop_ah(dev); + ipoib_transport_dev_cleanup(dev); } diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 915ad04a827e..9e1b203d756d 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -108,7 +108,7 @@ int ipoib_open(struct net_device *dev) set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); - if (ipoib_ib_dev_open(dev, 1)) { + if (ipoib_ib_dev_open(dev)) { if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) return 0; goto err_disable; @@ -139,7 +139,7 @@ int ipoib_open(struct net_device *dev) return 0; err_stop: - ipoib_ib_dev_stop(dev, 1); + ipoib_ib_dev_stop(dev); err_disable: clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); @@ -157,8 +157,8 @@ static int ipoib_stop(struct net_device *dev) netif_stop_queue(dev); - ipoib_ib_dev_down(dev, 1); - ipoib_ib_dev_stop(dev, 0); + ipoib_ib_dev_down(dev); + ipoib_ib_dev_stop(dev); if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { struct ipoib_dev_priv *cpriv; @@ -640,8 +640,10 @@ static void neigh_add_path(struct sk_buff *skb, u8 *daddr, if (!path->query && path_rec_start(dev, path)) goto err_path; - - __skb_queue_tail(&neigh->queue, skb); + if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) + __skb_queue_tail(&neigh->queue, skb); + else + goto err_drop; } spin_unlock_irqrestore(&priv->lock, flags); @@ -676,7 +678,12 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, new_path = 1; } if (path) { - __skb_queue_tail(&path->queue, skb); + if (skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) { + __skb_queue_tail(&path->queue, skb); + } else { + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + } if (!path->query && path_rec_start(dev, path)) { spin_unlock_irqrestore(&priv->lock, flags); @@ -839,7 +846,7 @@ static void ipoib_set_mcast_list(struct net_device *dev) return; } - queue_work(ipoib_workqueue, &priv->restart_task); + queue_work(priv->wq, &priv->restart_task); } static int ipoib_get_iflink(const struct net_device *dev) @@ -966,7 +973,7 @@ static void ipoib_reap_neigh(struct work_struct *work) __ipoib_reap_neigh(priv); if (!test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags)) - queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task, + queue_delayed_work(priv->wq, &priv->neigh_reap_task, arp_tbl.gc_interval); } @@ -1145,7 +1152,7 @@ static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv) /* start garbage collection */ clear_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); - queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task, + queue_delayed_work(priv->wq, &priv->neigh_reap_task, arp_tbl.gc_interval); return 0; @@ -1274,15 +1281,13 @@ int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port) { struct ipoib_dev_priv *priv = netdev_priv(dev); - if (ipoib_neigh_hash_init(priv) < 0) - goto out; /* Allocate RX/TX "rings" to hold queued skbs */ priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring, GFP_KERNEL); if (!priv->rx_ring) { printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n", ca->name, ipoib_recvq_size); - goto out_neigh_hash_cleanup; + goto out; } priv->tx_ring = vzalloc(ipoib_sendq_size * sizeof *priv->tx_ring); @@ -1297,16 +1302,24 @@ int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port) if (ipoib_ib_dev_init(dev, ca, port)) goto out_tx_ring_cleanup; + /* + * Must be after ipoib_ib_dev_init so we can allocate a per + * device wq there and use it here + */ + if (ipoib_neigh_hash_init(priv) < 0) + goto out_dev_uninit; + return 0; +out_dev_uninit: + ipoib_ib_dev_cleanup(dev); + out_tx_ring_cleanup: vfree(priv->tx_ring); out_rx_ring_cleanup: kfree(priv->rx_ring); -out_neigh_hash_cleanup: - ipoib_neigh_hash_uninit(dev); out: return -ENOMEM; } @@ -1329,6 +1342,12 @@ void ipoib_dev_cleanup(struct net_device *dev) } unregister_netdevice_many(&head); + /* + * Must be before ipoib_ib_dev_cleanup or we delete an in use + * work queue + */ + ipoib_neigh_hash_uninit(dev); + ipoib_ib_dev_cleanup(dev); kfree(priv->rx_ring); @@ -1336,8 +1355,6 @@ void ipoib_dev_cleanup(struct net_device *dev) priv->rx_ring = NULL; priv->tx_ring = NULL; - - ipoib_neigh_hash_uninit(dev); } static const struct header_ops ipoib_header_ops = { @@ -1646,10 +1663,11 @@ sysfs_failed: register_failed: ib_unregister_event_handler(&priv->event_handler); + flush_workqueue(ipoib_workqueue); /* Stop GC if started before flush */ set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); cancel_delayed_work(&priv->neigh_reap_task); - flush_workqueue(ipoib_workqueue); + flush_workqueue(priv->wq); event_failed: ipoib_dev_cleanup(priv->dev); @@ -1712,6 +1730,7 @@ static void ipoib_remove_one(struct ib_device *device) list_for_each_entry_safe(priv, tmp, dev_list, list) { ib_unregister_event_handler(&priv->event_handler); + flush_workqueue(ipoib_workqueue); rtnl_lock(); dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP); @@ -1720,7 +1739,7 @@ static void ipoib_remove_one(struct ib_device *device) /* Stop GC */ set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); cancel_delayed_work(&priv->neigh_reap_task); - flush_workqueue(ipoib_workqueue); + flush_workqueue(priv->wq); unregister_netdev(priv->dev); free_netdev(priv->dev); @@ -1755,14 +1774,16 @@ static int __init ipoib_init_module(void) return ret; /* - * We create our own workqueue mainly because we want to be - * able to flush it when devices are being removed. We can't - * use schedule_work()/flush_scheduled_work() because both - * unregister_netdev() and linkwatch_event take the rtnl lock, - * so flush_scheduled_work() can deadlock during device - * removal. + * We create a global workqueue here that is used for all flush + * operations. However, if you attempt to flush a workqueue + * from a task on that same workqueue, it deadlocks the system. + * We want to be able to flush the tasks associated with a + * specific net device, so we also create a workqueue for each + * netdevice. We queue up the tasks for that device only on + * its private workqueue, and we only queue up flush events + * on our global flush workqueue. This avoids the deadlocks. */ - ipoib_workqueue = create_singlethread_workqueue("ipoib"); + ipoib_workqueue = create_singlethread_workqueue("ipoib_flush"); if (!ipoib_workqueue) { ret = -ENOMEM; goto err_fs; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index ffb83b5f7e80..0d23e0568deb 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -55,8 +55,6 @@ MODULE_PARM_DESC(mcast_debug_level, "Enable multicast debug tracing if > 0"); #endif -static DEFINE_MUTEX(mcast_mutex); - struct ipoib_mcast_iter { struct net_device *dev; union ib_gid mgid; @@ -66,6 +64,48 @@ struct ipoib_mcast_iter { unsigned int send_only; }; +/* + * This should be called with the priv->lock held + */ +static void __ipoib_mcast_schedule_join_thread(struct ipoib_dev_priv *priv, + struct ipoib_mcast *mcast, + bool delay) +{ + if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) + return; + + /* + * We will be scheduling *something*, so cancel whatever is + * currently scheduled first + */ + cancel_delayed_work(&priv->mcast_task); + if (mcast && delay) { + /* + * We had a failure and want to schedule a retry later + */ + mcast->backoff *= 2; + if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS) + mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS; + mcast->delay_until = jiffies + (mcast->backoff * HZ); + /* + * Mark this mcast for its delay, but restart the + * task immediately. The join task will make sure to + * clear out all entries without delays, and then + * schedule itself to run again when the earliest + * delay expires + */ + queue_delayed_work(priv->wq, &priv->mcast_task, 0); + } else if (delay) { + /* + * Special case of retrying after a failure to + * allocate the broadcast multicast group, wait + * 1 second and try again + */ + queue_delayed_work(priv->wq, &priv->mcast_task, HZ); + } else + queue_delayed_work(priv->wq, &priv->mcast_task, 0); +} + static void ipoib_mcast_free(struct ipoib_mcast *mcast) { struct net_device *dev = mcast->dev; @@ -103,6 +143,7 @@ static struct ipoib_mcast *ipoib_mcast_alloc(struct net_device *dev, mcast->dev = dev; mcast->created = jiffies; + mcast->delay_until = jiffies; mcast->backoff = 1; INIT_LIST_HEAD(&mcast->list); @@ -185,17 +226,27 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, spin_unlock_irq(&priv->lock); return -EAGAIN; } - priv->mcast_mtu = IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu)); + /*update priv member according to the new mcast*/ + priv->broadcast->mcmember.qkey = mcmember->qkey; + priv->broadcast->mcmember.mtu = mcmember->mtu; + priv->broadcast->mcmember.traffic_class = mcmember->traffic_class; + priv->broadcast->mcmember.rate = mcmember->rate; + priv->broadcast->mcmember.sl = mcmember->sl; + priv->broadcast->mcmember.flow_label = mcmember->flow_label; + priv->broadcast->mcmember.hop_limit = mcmember->hop_limit; + /* assume if the admin and the mcast are the same both can be changed */ + if (priv->mcast_mtu == priv->admin_mtu) + priv->admin_mtu = + priv->mcast_mtu = + IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu)); + else + priv->mcast_mtu = + IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu)); + priv->qkey = be32_to_cpu(priv->broadcast->mcmember.qkey); spin_unlock_irq(&priv->lock); priv->tx_wr.wr.ud.remote_qkey = priv->qkey; set_qkey = 1; - - if (!ipoib_cm_admin_enabled(dev)) { - rtnl_lock(); - dev_set_mtu(dev, min(priv->mcast_mtu, priv->admin_mtu)); - rtnl_unlock(); - } } if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { @@ -270,107 +321,35 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, return 0; } -static int -ipoib_mcast_sendonly_join_complete(int status, - struct ib_sa_multicast *multicast) -{ - struct ipoib_mcast *mcast = multicast->context; - struct net_device *dev = mcast->dev; - - /* We trap for port events ourselves. */ - if (status == -ENETRESET) - return 0; - - if (!status) - status = ipoib_mcast_join_finish(mcast, &multicast->rec); - - if (status) { - if (mcast->logcount++ < 20) - ipoib_dbg_mcast(netdev_priv(dev), "multicast join failed for %pI6, status %d\n", - mcast->mcmember.mgid.raw, status); - - /* Flush out any queued packets */ - netif_tx_lock_bh(dev); - while (!skb_queue_empty(&mcast->pkt_queue)) { - ++dev->stats.tx_dropped; - dev_kfree_skb_any(skb_dequeue(&mcast->pkt_queue)); - } - netif_tx_unlock_bh(dev); - - /* Clear the busy flag so we try again */ - status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, - &mcast->flags); - } - return status; -} - -static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast) -{ - struct net_device *dev = mcast->dev; - struct ipoib_dev_priv *priv = netdev_priv(dev); - struct ib_sa_mcmember_rec rec = { -#if 0 /* Some SMs don't support send-only yet */ - .join_state = 4 -#else - .join_state = 1 -#endif - }; - int ret = 0; - - if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) { - ipoib_dbg_mcast(priv, "device shutting down, no multicast joins\n"); - return -ENODEV; - } - - if (test_and_set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) { - ipoib_dbg_mcast(priv, "multicast entry busy, skipping\n"); - return -EBUSY; - } - - rec.mgid = mcast->mcmember.mgid; - rec.port_gid = priv->local_gid; - rec.pkey = cpu_to_be16(priv->pkey); - - mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, - priv->port, &rec, - IB_SA_MCMEMBER_REC_MGID | - IB_SA_MCMEMBER_REC_PORT_GID | - IB_SA_MCMEMBER_REC_PKEY | - IB_SA_MCMEMBER_REC_JOIN_STATE, - GFP_ATOMIC, - ipoib_mcast_sendonly_join_complete, - mcast); - if (IS_ERR(mcast->mc)) { - ret = PTR_ERR(mcast->mc); - clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); - ipoib_warn(priv, "ib_sa_join_multicast failed (ret = %d)\n", - ret); - } else { - ipoib_dbg_mcast(priv, "no multicast record for %pI6, starting join\n", - mcast->mcmember.mgid.raw); - } - - return ret; -} - void ipoib_mcast_carrier_on_task(struct work_struct *work) { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, carrier_on_task); struct ib_port_attr attr; - /* - * Take rtnl_lock to avoid racing with ipoib_stop() and - * turning the carrier back on while a device is being - * removed. - */ if (ib_query_port(priv->ca, priv->port, &attr) || attr.state != IB_PORT_ACTIVE) { ipoib_dbg(priv, "Keeping carrier off until IB port is active\n"); return; } - rtnl_lock(); + /* + * Take rtnl_lock to avoid racing with ipoib_stop() and + * turning the carrier back on while a device is being + * removed. However, ipoib_stop() will attempt to flush + * the workqueue while holding the rtnl lock, so loop + * on trylock until either we get the lock or we see + * FLAG_OPER_UP go away as that signals that we are bailing + * and can safely ignore the carrier on work. + */ + while (!rtnl_trylock()) { + if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) + return; + else + msleep(20); + } + if (!ipoib_cm_admin_enabled(priv->dev)) + dev_set_mtu(priv->dev, min(priv->mcast_mtu, priv->admin_mtu)); netif_carrier_on(priv->dev); rtnl_unlock(); } @@ -382,7 +361,9 @@ static int ipoib_mcast_join_complete(int status, struct net_device *dev = mcast->dev; struct ipoib_dev_priv *priv = netdev_priv(dev); - ipoib_dbg_mcast(priv, "join completion for %pI6 (status %d)\n", + ipoib_dbg_mcast(priv, "%sjoin completion for %pI6 (status %d)\n", + test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) ? + "sendonly " : "", mcast->mcmember.mgid.raw, status); /* We trap for port events ourselves. */ @@ -396,49 +377,74 @@ static int ipoib_mcast_join_complete(int status, if (!status) { mcast->backoff = 1; - mutex_lock(&mcast_mutex); - if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) - queue_delayed_work(ipoib_workqueue, - &priv->mcast_task, 0); - mutex_unlock(&mcast_mutex); + mcast->delay_until = jiffies; /* - * Defer carrier on work to ipoib_workqueue to avoid a - * deadlock on rtnl_lock here. + * Defer carrier on work to priv->wq to avoid a + * deadlock on rtnl_lock here. Requeue our multicast + * work too, which will end up happening right after + * our carrier on task work and will allow us to + * send out all of the non-broadcast joins */ - if (mcast == priv->broadcast) - queue_work(ipoib_workqueue, &priv->carrier_on_task); - - status = 0; - goto out; - } + if (mcast == priv->broadcast) { + spin_lock_irq(&priv->lock); + queue_work(priv->wq, &priv->carrier_on_task); + __ipoib_mcast_schedule_join_thread(priv, NULL, 0); + goto out_locked; + } + } else { + if (mcast->logcount++ < 20) { + if (status == -ETIMEDOUT || status == -EAGAIN) { + ipoib_dbg_mcast(priv, "%smulticast join failed for %pI6, status %d\n", + test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) ? "sendonly " : "", + mcast->mcmember.mgid.raw, status); + } else { + ipoib_warn(priv, "%smulticast join failed for %pI6, status %d\n", + test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) ? "sendonly " : "", + mcast->mcmember.mgid.raw, status); + } + } - if (mcast->logcount++ < 20) { - if (status == -ETIMEDOUT || status == -EAGAIN) { - ipoib_dbg_mcast(priv, "multicast join failed for %pI6, status %d\n", - mcast->mcmember.mgid.raw, status); + if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) && + mcast->backoff >= 2) { + /* + * We only retry sendonly joins once before we drop + * the packet and quit trying to deal with the + * group. However, we leave the group in the + * mcast list as an unjoined group. If we want to + * try joining again, we simply queue up a packet + * and restart the join thread. The empty queue + * is why the join thread ignores this group. + */ + mcast->backoff = 1; + netif_tx_lock_bh(dev); + while (!skb_queue_empty(&mcast->pkt_queue)) { + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb_dequeue(&mcast->pkt_queue)); + } + netif_tx_unlock_bh(dev); } else { - ipoib_warn(priv, "multicast join failed for %pI6, status %d\n", - mcast->mcmember.mgid.raw, status); + spin_lock_irq(&priv->lock); + /* Requeue this join task with a backoff delay */ + __ipoib_mcast_schedule_join_thread(priv, mcast, 1); + goto out_locked; } } - - mcast->backoff *= 2; - if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS) - mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS; - - /* Clear the busy flag so we try again */ - status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); - - mutex_lock(&mcast_mutex); +out: spin_lock_irq(&priv->lock); - if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) - queue_delayed_work(ipoib_workqueue, &priv->mcast_task, - mcast->backoff * HZ); +out_locked: + /* + * Make sure to set mcast->mc before we clear the busy flag to avoid + * racing with code that checks for BUSY before checking mcast->mc + */ + if (status) + mcast->mc = NULL; + else + mcast->mc = multicast; + clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); spin_unlock_irq(&priv->lock); - mutex_unlock(&mcast_mutex); -out: complete(&mcast->done); + return status; } @@ -446,6 +452,7 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast, int create) { struct ipoib_dev_priv *priv = netdev_priv(dev); + struct ib_sa_multicast *multicast; struct ib_sa_mcmember_rec rec = { .join_state = 1 }; @@ -487,29 +494,18 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast, rec.hop_limit = priv->broadcast->mcmember.hop_limit; } - set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); - init_completion(&mcast->done); - set_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags); - - mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port, + multicast = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port, &rec, comp_mask, GFP_KERNEL, ipoib_mcast_join_complete, mcast); - if (IS_ERR(mcast->mc)) { + if (IS_ERR(multicast)) { + ret = PTR_ERR(multicast); + ipoib_warn(priv, "ib_sa_join_multicast failed, status %d\n", ret); + spin_lock_irq(&priv->lock); + /* Requeue this join task with a backoff delay */ + __ipoib_mcast_schedule_join_thread(priv, mcast, 1); clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + spin_unlock_irq(&priv->lock); complete(&mcast->done); - ret = PTR_ERR(mcast->mc); - ipoib_warn(priv, "ib_sa_join_multicast failed, status %d\n", ret); - - mcast->backoff *= 2; - if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS) - mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS; - - mutex_lock(&mcast_mutex); - if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) - queue_delayed_work(ipoib_workqueue, - &priv->mcast_task, - mcast->backoff * HZ); - mutex_unlock(&mcast_mutex); } } @@ -519,8 +515,11 @@ void ipoib_mcast_join_task(struct work_struct *work) container_of(work, struct ipoib_dev_priv, mcast_task.work); struct net_device *dev = priv->dev; struct ib_port_attr port_attr; + unsigned long delay_until = 0; + struct ipoib_mcast *mcast = NULL; + int create = 1; - if (!test_bit(IPOIB_MCAST_RUN, &priv->flags)) + if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) return; if (ib_query_port(priv->ca, priv->port, &port_attr) || @@ -536,93 +535,118 @@ void ipoib_mcast_join_task(struct work_struct *work) else memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid)); + spin_lock_irq(&priv->lock); + if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) + goto out; + if (!priv->broadcast) { struct ipoib_mcast *broadcast; - if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) - return; - - broadcast = ipoib_mcast_alloc(dev, 1); + broadcast = ipoib_mcast_alloc(dev, 0); if (!broadcast) { ipoib_warn(priv, "failed to allocate broadcast group\n"); - mutex_lock(&mcast_mutex); - if (test_bit(IPOIB_MCAST_RUN, &priv->flags)) - queue_delayed_work(ipoib_workqueue, - &priv->mcast_task, HZ); - mutex_unlock(&mcast_mutex); - return; + /* + * Restart us after a 1 second delay to retry + * creating our broadcast group and attaching to + * it. Until this succeeds, this ipoib dev is + * completely stalled (multicast wise). + */ + __ipoib_mcast_schedule_join_thread(priv, NULL, 1); + goto out; } - spin_lock_irq(&priv->lock); memcpy(broadcast->mcmember.mgid.raw, priv->dev->broadcast + 4, sizeof (union ib_gid)); priv->broadcast = broadcast; __ipoib_mcast_add(dev, priv->broadcast); - spin_unlock_irq(&priv->lock); } if (!test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) { - if (!test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags)) - ipoib_mcast_join(dev, priv->broadcast, 0); - return; - } - - while (1) { - struct ipoib_mcast *mcast = NULL; - - spin_lock_irq(&priv->lock); - list_for_each_entry(mcast, &priv->multicast_list, list) { - if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) - && !test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags) - && !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) { - /* Found the next unjoined group */ - break; + if (IS_ERR_OR_NULL(priv->broadcast->mc) && + !test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags)) { + mcast = priv->broadcast; + create = 0; + if (mcast->backoff > 1 && + time_before(jiffies, mcast->delay_until)) { + delay_until = mcast->delay_until; + mcast = NULL; } } - spin_unlock_irq(&priv->lock); + goto out; + } - if (&mcast->list == &priv->multicast_list) { - /* All done */ - break; + /* + * We'll never get here until the broadcast group is both allocated + * and attached + */ + list_for_each_entry(mcast, &priv->multicast_list, list) { + if (IS_ERR_OR_NULL(mcast->mc) && + !test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags) && + (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) || + !skb_queue_empty(&mcast->pkt_queue))) { + if (mcast->backoff == 1 || + time_after_eq(jiffies, mcast->delay_until)) { + /* Found the next unjoined group */ + init_completion(&mcast->done); + set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) + create = 0; + else + create = 1; + spin_unlock_irq(&priv->lock); + ipoib_mcast_join(dev, mcast, create); + spin_lock_irq(&priv->lock); + } else if (!delay_until || + time_before(mcast->delay_until, delay_until)) + delay_until = mcast->delay_until; } - - ipoib_mcast_join(dev, mcast, 1); - return; } - ipoib_dbg_mcast(priv, "successfully joined all multicast groups\n"); + mcast = NULL; + ipoib_dbg_mcast(priv, "successfully started all multicast joins\n"); - clear_bit(IPOIB_MCAST_RUN, &priv->flags); +out: + if (delay_until) { + cancel_delayed_work(&priv->mcast_task); + queue_delayed_work(priv->wq, &priv->mcast_task, + delay_until - jiffies); + } + if (mcast) { + init_completion(&mcast->done); + set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + } + spin_unlock_irq(&priv->lock); + if (mcast) + ipoib_mcast_join(dev, mcast, create); } int ipoib_mcast_start_thread(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); + unsigned long flags; ipoib_dbg_mcast(priv, "starting multicast thread\n"); - mutex_lock(&mcast_mutex); - if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags)) - queue_delayed_work(ipoib_workqueue, &priv->mcast_task, 0); - mutex_unlock(&mcast_mutex); + spin_lock_irqsave(&priv->lock, flags); + __ipoib_mcast_schedule_join_thread(priv, NULL, 0); + spin_unlock_irqrestore(&priv->lock, flags); return 0; } -int ipoib_mcast_stop_thread(struct net_device *dev, int flush) +int ipoib_mcast_stop_thread(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); + unsigned long flags; ipoib_dbg_mcast(priv, "stopping multicast thread\n"); - mutex_lock(&mcast_mutex); - clear_bit(IPOIB_MCAST_RUN, &priv->flags); + spin_lock_irqsave(&priv->lock, flags); cancel_delayed_work(&priv->mcast_task); - mutex_unlock(&mcast_mutex); + spin_unlock_irqrestore(&priv->lock, flags); - if (flush) - flush_workqueue(ipoib_workqueue); + flush_workqueue(priv->wq); return 0; } @@ -633,6 +657,9 @@ static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast) int ret = 0; if (test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) + ipoib_warn(priv, "ipoib_mcast_leave on an in-flight join\n"); + + if (!IS_ERR_OR_NULL(mcast->mc)) ib_sa_free_multicast(mcast->mc); if (test_and_clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) { @@ -644,7 +671,9 @@ static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast) be16_to_cpu(mcast->mcmember.mlid)); if (ret) ipoib_warn(priv, "ib_detach_mcast failed (result = %d)\n", ret); - } + } else if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) + ipoib_dbg(priv, "leaving with no mcmember but not a " + "SENDONLY join\n"); return 0; } @@ -667,49 +696,37 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb) } mcast = __ipoib_mcast_find(dev, mgid); - if (!mcast) { - /* Let's create a new send only group now */ - ipoib_dbg_mcast(priv, "setting up send only multicast group for %pI6\n", - mgid); - - mcast = ipoib_mcast_alloc(dev, 0); + if (!mcast || !mcast->ah) { if (!mcast) { - ipoib_warn(priv, "unable to allocate memory for " - "multicast structure\n"); - ++dev->stats.tx_dropped; - dev_kfree_skb_any(skb); - goto out; - } - - set_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags); - memcpy(mcast->mcmember.mgid.raw, mgid, sizeof (union ib_gid)); - __ipoib_mcast_add(dev, mcast); - list_add_tail(&mcast->list, &priv->multicast_list); - } + /* Let's create a new send only group now */ + ipoib_dbg_mcast(priv, "setting up send only multicast group for %pI6\n", + mgid); + + mcast = ipoib_mcast_alloc(dev, 0); + if (!mcast) { + ipoib_warn(priv, "unable to allocate memory " + "for multicast structure\n"); + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + goto unlock; + } - if (!mcast->ah) { + set_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags); + memcpy(mcast->mcmember.mgid.raw, mgid, + sizeof (union ib_gid)); + __ipoib_mcast_add(dev, mcast); + list_add_tail(&mcast->list, &priv->multicast_list); + } if (skb_queue_len(&mcast->pkt_queue) < IPOIB_MAX_MCAST_QUEUE) skb_queue_tail(&mcast->pkt_queue, skb); else { ++dev->stats.tx_dropped; dev_kfree_skb_any(skb); } - - if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) - ipoib_dbg_mcast(priv, "no address vector, " - "but multicast join already started\n"); - else if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) - ipoib_mcast_sendonly_join(mcast); - - /* - * If lookup completes between here and out:, don't - * want to send packet twice. - */ - mcast = NULL; - } - -out: - if (mcast && mcast->ah) { + if (!test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) { + __ipoib_mcast_schedule_join_thread(priv, NULL, 0); + } + } else { struct ipoib_neigh *neigh; spin_unlock_irqrestore(&priv->lock, flags); @@ -759,9 +776,12 @@ void ipoib_mcast_dev_flush(struct net_device *dev) spin_unlock_irqrestore(&priv->lock, flags); - /* seperate between the wait to the leave*/ + /* + * make sure the in-flight joins have finished before we attempt + * to leave + */ list_for_each_entry_safe(mcast, tmcast, &remove_list, list) - if (test_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags)) + if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) wait_for_completion(&mcast->done); list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { @@ -792,9 +812,14 @@ void ipoib_mcast_restart_task(struct work_struct *work) unsigned long flags; struct ib_sa_mcmember_rec rec; - ipoib_dbg_mcast(priv, "restarting multicast task\n"); + if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) + /* + * shortcut...on shutdown flush is called next, just + * let it do all the work + */ + return; - ipoib_mcast_stop_thread(dev, 0); + ipoib_dbg_mcast(priv, "restarting multicast task\n"); local_irq_save(flags); netif_addr_lock(dev); @@ -880,14 +905,27 @@ void ipoib_mcast_restart_task(struct work_struct *work) netif_addr_unlock(dev); local_irq_restore(flags); - /* We have to cancel outside of the spinlock */ + /* + * make sure the in-flight joins have finished before we attempt + * to leave + */ + list_for_each_entry_safe(mcast, tmcast, &remove_list, list) + if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) + wait_for_completion(&mcast->done); + list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { ipoib_mcast_leave(mcast->dev, mcast); ipoib_mcast_free(mcast); } - if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) - ipoib_mcast_start_thread(dev); + /* + * Double check that we are still up + */ + if (test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) { + spin_lock_irqsave(&priv->lock, flags); + __ipoib_mcast_schedule_join_thread(priv, NULL, 0); + spin_unlock_irqrestore(&priv->lock, flags); + } } #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c index c56d5d44c53b..e5cc43074196 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c @@ -157,6 +157,16 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) goto out_free_pd; } + /* + * the various IPoIB tasks assume they will never race against + * themselves, so always use a single thread workqueue + */ + priv->wq = create_singlethread_workqueue("ipoib_wq"); + if (!priv->wq) { + printk(KERN_WARNING "ipoib: failed to allocate device WQ\n"); + goto out_free_mr; + } + size = ipoib_recvq_size + 1; ret = ipoib_cm_dev_init(dev); if (!ret) { @@ -165,12 +175,13 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) size += ipoib_recvq_size + 1; /* 1 extra for rx_drain_qp */ else size += ipoib_recvq_size * ipoib_max_conn_qp; - } + } else + goto out_free_wq; priv->recv_cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev, size, 0); if (IS_ERR(priv->recv_cq)) { printk(KERN_WARNING "%s: failed to create receive CQ\n", ca->name); - goto out_free_mr; + goto out_cm_dev_cleanup; } priv->send_cq = ib_create_cq(priv->ca, ipoib_send_comp_handler, NULL, @@ -216,15 +227,10 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) priv->tx_wr.send_flags = IB_SEND_SIGNALED; priv->rx_sge[0].lkey = priv->mr->lkey; - if (ipoib_ud_need_sg(priv->max_ib_mtu)) { - priv->rx_sge[0].length = IPOIB_UD_HEAD_SIZE; - priv->rx_sge[1].length = PAGE_SIZE; - priv->rx_sge[1].lkey = priv->mr->lkey; - priv->rx_wr.num_sge = IPOIB_UD_RX_SG; - } else { - priv->rx_sge[0].length = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu); - priv->rx_wr.num_sge = 1; - } + + priv->rx_sge[0].length = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu); + priv->rx_wr.num_sge = 1; + priv->rx_wr.next = NULL; priv->rx_wr.sg_list = priv->rx_sge; @@ -236,12 +242,19 @@ out_free_send_cq: out_free_recv_cq: ib_destroy_cq(priv->recv_cq); +out_cm_dev_cleanup: + ipoib_cm_dev_cleanup(dev); + +out_free_wq: + destroy_workqueue(priv->wq); + priv->wq = NULL; + out_free_mr: ib_dereg_mr(priv->mr); - ipoib_cm_dev_cleanup(dev); out_free_pd: ib_dealloc_pd(priv->pd); + return -ENODEV; } @@ -265,11 +278,18 @@ void ipoib_transport_dev_cleanup(struct net_device *dev) ipoib_cm_dev_cleanup(dev); + if (priv->wq) { + flush_workqueue(priv->wq); + destroy_workqueue(priv->wq); + priv->wq = NULL; + } + if (ib_dereg_mr(priv->mr)) ipoib_warn(priv, "ib_dereg_mr failed\n"); if (ib_dealloc_pd(priv->pd)) ipoib_warn(priv, "ib_dealloc_pd failed\n"); + } void ipoib_event(struct ib_event_handler *handler, diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h index b47aea1094b2..262ba1f8ee50 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.h +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h @@ -69,7 +69,7 @@ #define DRV_NAME "iser" #define PFX DRV_NAME ": " -#define DRV_VER "1.5" +#define DRV_VER "1.6" #define iser_dbg(fmt, arg...) \ do { \ @@ -218,22 +218,21 @@ enum iser_data_dir { /** * struct iser_data_buf - iSER data buffer * - * @buf: pointer to the sg list + * @sg: pointer to the sg list * @size: num entries of this sg * @data_len: total beffer byte len * @dma_nents: returned by dma_map_sg - * @copy_buf: allocated copy buf for SGs unaligned - * for rdma which are copied - * @sg_single: SG-ified clone of a non SG SC or - * unaligned SG + * @orig_sg: pointer to the original sg list (in case + * we used a copy) + * @orig_size: num entris of orig sg list */ struct iser_data_buf { - void *buf; + struct scatterlist *sg; unsigned int size; unsigned long data_len; unsigned int dma_nents; - char *copy_buf; - struct scatterlist sg_single; + struct scatterlist *orig_sg; + unsigned int orig_size; }; /* fwd declarations */ @@ -244,35 +243,14 @@ struct iscsi_endpoint; /** * struct iser_mem_reg - iSER memory registration info * - * @lkey: MR local key - * @rkey: MR remote key - * @va: MR start address (buffer va) - * @len: MR length + * @sge: memory region sg element + * @rkey: memory region remote key * @mem_h: pointer to registration context (FMR/Fastreg) */ struct iser_mem_reg { - u32 lkey; - u32 rkey; - u64 va; - u64 len; - void *mem_h; -}; - -/** - * struct iser_regd_buf - iSER buffer registration desc - * - * @reg: memory registration info - * @virt_addr: virtual address of buffer - * @device: reference to iser device - * @direction: dma direction (for dma_unmap) - * @data_size: data buffer size in bytes - */ -struct iser_regd_buf { - struct iser_mem_reg reg; - void *virt_addr; - struct iser_device *device; - enum dma_data_direction direction; - unsigned int data_size; + struct ib_sge sge; + u32 rkey; + void *mem_h; }; enum iser_desc_type { @@ -534,11 +512,9 @@ struct iser_conn { * @sc: link to scsi command * @command_sent: indicate if command was sent * @dir: iser data direction - * @rdma_regd: task rdma registration desc + * @rdma_reg: task rdma registration desc * @data: iser data buffer desc - * @data_copy: iser data copy buffer desc (bounce buffer) * @prot: iser protection buffer desc - * @prot_copy: iser protection copy buffer desc (bounce buffer) */ struct iscsi_iser_task { struct iser_tx_desc desc; @@ -547,11 +523,9 @@ struct iscsi_iser_task { struct scsi_cmnd *sc; int command_sent; int dir[ISER_DIRS_NUM]; - struct iser_regd_buf rdma_regd[ISER_DIRS_NUM]; + struct iser_mem_reg rdma_reg[ISER_DIRS_NUM]; struct iser_data_buf data[ISER_DIRS_NUM]; - struct iser_data_buf data_copy[ISER_DIRS_NUM]; struct iser_data_buf prot[ISER_DIRS_NUM]; - struct iser_data_buf prot_copy[ISER_DIRS_NUM]; }; struct iser_page_vec { @@ -621,7 +595,6 @@ void iser_free_rx_descriptors(struct iser_conn *iser_conn); void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, struct iser_data_buf *mem, - struct iser_data_buf *mem_copy, enum iser_data_dir cmd_dir); int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *task, @@ -634,10 +607,6 @@ int iser_connect(struct iser_conn *iser_conn, struct sockaddr *dst_addr, int non_blocking); -int iser_reg_page_vec(struct ib_conn *ib_conn, - struct iser_page_vec *page_vec, - struct iser_mem_reg *mem_reg); - void iser_unreg_mem_fmr(struct iscsi_iser_task *iser_task, enum iser_data_dir cmd_dir); void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task, @@ -667,4 +636,9 @@ int iser_create_fastreg_pool(struct ib_conn *ib_conn, unsigned cmds_max); void iser_free_fastreg_pool(struct ib_conn *ib_conn); u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task, enum iser_data_dir cmd_dir, sector_t *sector); +struct fast_reg_descriptor * +iser_reg_desc_get(struct ib_conn *ib_conn); +void +iser_reg_desc_put(struct ib_conn *ib_conn, + struct fast_reg_descriptor *desc); #endif diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c index 20e859a6f1a6..3e2118e8ed87 100644 --- a/drivers/infiniband/ulp/iser/iser_initiator.c +++ b/drivers/infiniband/ulp/iser/iser_initiator.c @@ -50,7 +50,7 @@ static int iser_prepare_read_cmd(struct iscsi_task *task) { struct iscsi_iser_task *iser_task = task->dd_data; struct iser_device *device = iser_task->iser_conn->ib_conn.device; - struct iser_regd_buf *regd_buf; + struct iser_mem_reg *mem_reg; int err; struct iser_hdr *hdr = &iser_task->desc.iser_header; struct iser_data_buf *buf_in = &iser_task->data[ISER_DIR_IN]; @@ -78,15 +78,15 @@ static int iser_prepare_read_cmd(struct iscsi_task *task) iser_err("Failed to set up Data-IN RDMA\n"); return err; } - regd_buf = &iser_task->rdma_regd[ISER_DIR_IN]; + mem_reg = &iser_task->rdma_reg[ISER_DIR_IN]; hdr->flags |= ISER_RSV; - hdr->read_stag = cpu_to_be32(regd_buf->reg.rkey); - hdr->read_va = cpu_to_be64(regd_buf->reg.va); + hdr->read_stag = cpu_to_be32(mem_reg->rkey); + hdr->read_va = cpu_to_be64(mem_reg->sge.addr); iser_dbg("Cmd itt:%d READ tags RKEY:%#.4X VA:%#llX\n", - task->itt, regd_buf->reg.rkey, - (unsigned long long)regd_buf->reg.va); + task->itt, mem_reg->rkey, + (unsigned long long)mem_reg->sge.addr); return 0; } @@ -104,7 +104,7 @@ iser_prepare_write_cmd(struct iscsi_task *task, { struct iscsi_iser_task *iser_task = task->dd_data; struct iser_device *device = iser_task->iser_conn->ib_conn.device; - struct iser_regd_buf *regd_buf; + struct iser_mem_reg *mem_reg; int err; struct iser_hdr *hdr = &iser_task->desc.iser_header; struct iser_data_buf *buf_out = &iser_task->data[ISER_DIR_OUT]; @@ -134,25 +134,25 @@ iser_prepare_write_cmd(struct iscsi_task *task, return err; } - regd_buf = &iser_task->rdma_regd[ISER_DIR_OUT]; + mem_reg = &iser_task->rdma_reg[ISER_DIR_OUT]; if (unsol_sz < edtl) { hdr->flags |= ISER_WSV; - hdr->write_stag = cpu_to_be32(regd_buf->reg.rkey); - hdr->write_va = cpu_to_be64(regd_buf->reg.va + unsol_sz); + hdr->write_stag = cpu_to_be32(mem_reg->rkey); + hdr->write_va = cpu_to_be64(mem_reg->sge.addr + unsol_sz); iser_dbg("Cmd itt:%d, WRITE tags, RKEY:%#.4X " "VA:%#llX + unsol:%d\n", - task->itt, regd_buf->reg.rkey, - (unsigned long long)regd_buf->reg.va, unsol_sz); + task->itt, mem_reg->rkey, + (unsigned long long)mem_reg->sge.addr, unsol_sz); } if (imm_sz > 0) { iser_dbg("Cmd itt:%d, WRITE, adding imm.data sz: %d\n", task->itt, imm_sz); - tx_dsg->addr = regd_buf->reg.va; + tx_dsg->addr = mem_reg->sge.addr; tx_dsg->length = imm_sz; - tx_dsg->lkey = regd_buf->reg.lkey; + tx_dsg->lkey = mem_reg->sge.lkey; iser_task->desc.num_sge = 2; } @@ -401,16 +401,16 @@ int iser_send_command(struct iscsi_conn *conn, } if (scsi_sg_count(sc)) { /* using a scatter list */ - data_buf->buf = scsi_sglist(sc); + data_buf->sg = scsi_sglist(sc); data_buf->size = scsi_sg_count(sc); } data_buf->data_len = scsi_bufflen(sc); if (scsi_prot_sg_count(sc)) { - prot_buf->buf = scsi_prot_sglist(sc); + prot_buf->sg = scsi_prot_sglist(sc); prot_buf->size = scsi_prot_sg_count(sc); - prot_buf->data_len = data_buf->data_len >> - ilog2(sc->device->sector_size) * 8; + prot_buf->data_len = (data_buf->data_len >> + ilog2(sc->device->sector_size)) * 8; } if (hdr->flags & ISCSI_FLAG_CMD_READ) { @@ -450,7 +450,7 @@ int iser_send_data_out(struct iscsi_conn *conn, struct iser_conn *iser_conn = conn->dd_data; struct iscsi_iser_task *iser_task = task->dd_data; struct iser_tx_desc *tx_desc = NULL; - struct iser_regd_buf *regd_buf; + struct iser_mem_reg *mem_reg; unsigned long buf_offset; unsigned long data_seg_len; uint32_t itt; @@ -477,11 +477,11 @@ int iser_send_data_out(struct iscsi_conn *conn, /* build the tx desc */ iser_initialize_task_headers(task, tx_desc); - regd_buf = &iser_task->rdma_regd[ISER_DIR_OUT]; + mem_reg = &iser_task->rdma_reg[ISER_DIR_OUT]; tx_dsg = &tx_desc->tx_sg[1]; - tx_dsg->addr = regd_buf->reg.va + buf_offset; - tx_dsg->length = data_seg_len; - tx_dsg->lkey = regd_buf->reg.lkey; + tx_dsg->addr = mem_reg->sge.addr + buf_offset; + tx_dsg->length = data_seg_len; + tx_dsg->lkey = mem_reg->sge.lkey; tx_desc->num_sge = 2; if (buf_offset + data_seg_len > iser_task->data[ISER_DIR_OUT].data_len) { @@ -658,10 +658,10 @@ void iser_task_rdma_init(struct iscsi_iser_task *iser_task) iser_task->prot[ISER_DIR_IN].data_len = 0; iser_task->prot[ISER_DIR_OUT].data_len = 0; - memset(&iser_task->rdma_regd[ISER_DIR_IN], 0, - sizeof(struct iser_regd_buf)); - memset(&iser_task->rdma_regd[ISER_DIR_OUT], 0, - sizeof(struct iser_regd_buf)); + memset(&iser_task->rdma_reg[ISER_DIR_IN], 0, + sizeof(struct iser_mem_reg)); + memset(&iser_task->rdma_reg[ISER_DIR_OUT], 0, + sizeof(struct iser_mem_reg)); } void iser_task_rdma_finalize(struct iscsi_iser_task *iser_task) @@ -674,35 +674,31 @@ void iser_task_rdma_finalize(struct iscsi_iser_task *iser_task) /* if we were reading, copy back to unaligned sglist, * anyway dma_unmap and free the copy */ - if (iser_task->data_copy[ISER_DIR_IN].copy_buf != NULL) { + if (iser_task->data[ISER_DIR_IN].orig_sg) { is_rdma_data_aligned = 0; iser_finalize_rdma_unaligned_sg(iser_task, &iser_task->data[ISER_DIR_IN], - &iser_task->data_copy[ISER_DIR_IN], ISER_DIR_IN); } - if (iser_task->data_copy[ISER_DIR_OUT].copy_buf != NULL) { + if (iser_task->data[ISER_DIR_OUT].orig_sg) { is_rdma_data_aligned = 0; iser_finalize_rdma_unaligned_sg(iser_task, &iser_task->data[ISER_DIR_OUT], - &iser_task->data_copy[ISER_DIR_OUT], ISER_DIR_OUT); } - if (iser_task->prot_copy[ISER_DIR_IN].copy_buf != NULL) { + if (iser_task->prot[ISER_DIR_IN].orig_sg) { is_rdma_prot_aligned = 0; iser_finalize_rdma_unaligned_sg(iser_task, &iser_task->prot[ISER_DIR_IN], - &iser_task->prot_copy[ISER_DIR_IN], ISER_DIR_IN); } - if (iser_task->prot_copy[ISER_DIR_OUT].copy_buf != NULL) { + if (iser_task->prot[ISER_DIR_OUT].orig_sg) { is_rdma_prot_aligned = 0; iser_finalize_rdma_unaligned_sg(iser_task, &iser_task->prot[ISER_DIR_OUT], - &iser_task->prot_copy[ISER_DIR_OUT], ISER_DIR_OUT); } diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c index 341040bf0984..f0cdc961eb11 100644 --- a/drivers/infiniband/ulp/iser/iser_memory.c +++ b/drivers/infiniband/ulp/iser/iser_memory.c @@ -39,68 +39,173 @@ #include "iscsi_iser.h" -#define ISER_KMALLOC_THRESHOLD 0x20000 /* 128K - kmalloc limit */ +static void +iser_free_bounce_sg(struct iser_data_buf *data) +{ + struct scatterlist *sg; + int count; -/** - * iser_start_rdma_unaligned_sg - */ -static int iser_start_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, - struct iser_data_buf *data, - struct iser_data_buf *data_copy, - enum iser_data_dir cmd_dir) + for_each_sg(data->sg, sg, data->size, count) + __free_page(sg_page(sg)); + + kfree(data->sg); + + data->sg = data->orig_sg; + data->size = data->orig_size; + data->orig_sg = NULL; + data->orig_size = 0; +} + +static int +iser_alloc_bounce_sg(struct iser_data_buf *data) { - struct ib_device *dev = iser_task->iser_conn->ib_conn.device->ib_device; - struct scatterlist *sgl = (struct scatterlist *)data->buf; struct scatterlist *sg; - char *mem = NULL; - unsigned long cmd_data_len = 0; - int dma_nents, i; + struct page *page; + unsigned long length = data->data_len; + int i = 0, nents = DIV_ROUND_UP(length, PAGE_SIZE); - for_each_sg(sgl, sg, data->size, i) - cmd_data_len += ib_sg_dma_len(dev, sg); + sg = kcalloc(nents, sizeof(*sg), GFP_ATOMIC); + if (!sg) + goto err; - if (cmd_data_len > ISER_KMALLOC_THRESHOLD) - mem = (void *)__get_free_pages(GFP_ATOMIC, - ilog2(roundup_pow_of_two(cmd_data_len)) - PAGE_SHIFT); - else - mem = kmalloc(cmd_data_len, GFP_ATOMIC); + sg_init_table(sg, nents); + while (length) { + u32 page_len = min_t(u32, length, PAGE_SIZE); - if (mem == NULL) { - iser_err("Failed to allocate mem size %d %d for copying sglist\n", - data->size, (int)cmd_data_len); - return -ENOMEM; + page = alloc_page(GFP_ATOMIC); + if (!page) + goto err; + + sg_set_page(&sg[i], page, page_len, 0); + length -= page_len; + i++; } - if (cmd_dir == ISER_DIR_OUT) { - /* copy the unaligned sg the buffer which is used for RDMA */ - char *p, *from; - - sgl = (struct scatterlist *)data->buf; - p = mem; - for_each_sg(sgl, sg, data->size, i) { - from = kmap_atomic(sg_page(sg)); - memcpy(p, - from + sg->offset, - sg->length); - kunmap_atomic(from); - p += sg->length; + data->orig_sg = data->sg; + data->orig_size = data->size; + data->sg = sg; + data->size = nents; + + return 0; + +err: + for (; i > 0; i--) + __free_page(sg_page(&sg[i - 1])); + kfree(sg); + + return -ENOMEM; +} + +static void +iser_copy_bounce(struct iser_data_buf *data, bool to_buffer) +{ + struct scatterlist *osg, *bsg = data->sg; + void *oaddr, *baddr; + unsigned int left = data->data_len; + unsigned int bsg_off = 0; + int i; + + for_each_sg(data->orig_sg, osg, data->orig_size, i) { + unsigned int copy_len, osg_off = 0; + + oaddr = kmap_atomic(sg_page(osg)) + osg->offset; + copy_len = min(left, osg->length); + while (copy_len) { + unsigned int len = min(copy_len, bsg->length - bsg_off); + + baddr = kmap_atomic(sg_page(bsg)) + bsg->offset; + if (to_buffer) + memcpy(baddr + bsg_off, oaddr + osg_off, len); + else + memcpy(oaddr + osg_off, baddr + bsg_off, len); + + kunmap_atomic(baddr - bsg->offset); + osg_off += len; + bsg_off += len; + copy_len -= len; + + if (bsg_off >= bsg->length) { + bsg = sg_next(bsg); + bsg_off = 0; + } } + kunmap_atomic(oaddr - osg->offset); + left -= osg_off; } +} + +static inline void +iser_copy_from_bounce(struct iser_data_buf *data) +{ + iser_copy_bounce(data, false); +} + +static inline void +iser_copy_to_bounce(struct iser_data_buf *data) +{ + iser_copy_bounce(data, true); +} + +struct fast_reg_descriptor * +iser_reg_desc_get(struct ib_conn *ib_conn) +{ + struct fast_reg_descriptor *desc; + unsigned long flags; + + spin_lock_irqsave(&ib_conn->lock, flags); + desc = list_first_entry(&ib_conn->fastreg.pool, + struct fast_reg_descriptor, list); + list_del(&desc->list); + spin_unlock_irqrestore(&ib_conn->lock, flags); + + return desc; +} + +void +iser_reg_desc_put(struct ib_conn *ib_conn, + struct fast_reg_descriptor *desc) +{ + unsigned long flags; - sg_init_one(&data_copy->sg_single, mem, cmd_data_len); - data_copy->buf = &data_copy->sg_single; - data_copy->size = 1; - data_copy->copy_buf = mem; + spin_lock_irqsave(&ib_conn->lock, flags); + list_add(&desc->list, &ib_conn->fastreg.pool); + spin_unlock_irqrestore(&ib_conn->lock, flags); +} - dma_nents = ib_dma_map_sg(dev, &data_copy->sg_single, 1, - (cmd_dir == ISER_DIR_OUT) ? - DMA_TO_DEVICE : DMA_FROM_DEVICE); - BUG_ON(dma_nents == 0); +/** + * iser_start_rdma_unaligned_sg + */ +static int iser_start_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, + struct iser_data_buf *data, + enum iser_data_dir cmd_dir) +{ + struct ib_device *dev = iser_task->iser_conn->ib_conn.device->ib_device; + int rc; + + rc = iser_alloc_bounce_sg(data); + if (rc) { + iser_err("Failed to allocate bounce for data len %lu\n", + data->data_len); + return rc; + } + + if (cmd_dir == ISER_DIR_OUT) + iser_copy_to_bounce(data); - data_copy->dma_nents = dma_nents; - data_copy->data_len = cmd_data_len; + data->dma_nents = ib_dma_map_sg(dev, data->sg, data->size, + (cmd_dir == ISER_DIR_OUT) ? + DMA_TO_DEVICE : DMA_FROM_DEVICE); + if (!data->dma_nents) { + iser_err("Got dma_nents %d, something went wrong...\n", + data->dma_nents); + rc = -ENOMEM; + goto err; + } return 0; +err: + iser_free_bounce_sg(data); + return rc; } /** @@ -109,51 +214,18 @@ static int iser_start_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, struct iser_data_buf *data, - struct iser_data_buf *data_copy, enum iser_data_dir cmd_dir) { - struct ib_device *dev; - unsigned long cmd_data_len; - - dev = iser_task->iser_conn->ib_conn.device->ib_device; + struct ib_device *dev = iser_task->iser_conn->ib_conn.device->ib_device; - ib_dma_unmap_sg(dev, &data_copy->sg_single, 1, + ib_dma_unmap_sg(dev, data->sg, data->size, (cmd_dir == ISER_DIR_OUT) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - if (cmd_dir == ISER_DIR_IN) { - char *mem; - struct scatterlist *sgl, *sg; - unsigned char *p, *to; - unsigned int sg_size; - int i; - - /* copy back read RDMA to unaligned sg */ - mem = data_copy->copy_buf; - - sgl = (struct scatterlist *)data->buf; - sg_size = data->size; - - p = mem; - for_each_sg(sgl, sg, sg_size, i) { - to = kmap_atomic(sg_page(sg)); - memcpy(to + sg->offset, - p, - sg->length); - kunmap_atomic(to); - p += sg->length; - } - } + if (cmd_dir == ISER_DIR_IN) + iser_copy_from_bounce(data); - cmd_data_len = data->data_len; - - if (cmd_data_len > ISER_KMALLOC_THRESHOLD) - free_pages((unsigned long)data_copy->copy_buf, - ilog2(roundup_pow_of_two(cmd_data_len)) - PAGE_SHIFT); - else - kfree(data_copy->copy_buf); - - data_copy->copy_buf = NULL; + iser_free_bounce_sg(data); } #define IS_4K_ALIGNED(addr) ((((unsigned long)addr) & ~MASK_4K) == 0) @@ -175,7 +247,7 @@ static int iser_sg_to_page_vec(struct iser_data_buf *data, struct ib_device *ibdev, u64 *pages, int *offset, int *data_size) { - struct scatterlist *sg, *sgl = (struct scatterlist *)data->buf; + struct scatterlist *sg, *sgl = data->sg; u64 start_addr, end_addr, page, chunk_start = 0; unsigned long total_sz = 0; unsigned int dma_len; @@ -227,14 +299,14 @@ static int iser_sg_to_page_vec(struct iser_data_buf *data, static int iser_data_buf_aligned_len(struct iser_data_buf *data, struct ib_device *ibdev) { - struct scatterlist *sgl, *sg, *next_sg = NULL; + struct scatterlist *sg, *sgl, *next_sg = NULL; u64 start_addr, end_addr; int i, ret_len, start_check = 0; if (data->dma_nents == 1) return 1; - sgl = (struct scatterlist *)data->buf; + sgl = data->sg; start_addr = ib_sg_dma_address(ibdev, sgl); for_each_sg(sgl, sg, data->dma_nents, i) { @@ -266,11 +338,10 @@ static int iser_data_buf_aligned_len(struct iser_data_buf *data, static void iser_data_buf_dump(struct iser_data_buf *data, struct ib_device *ibdev) { - struct scatterlist *sgl = (struct scatterlist *)data->buf; struct scatterlist *sg; int i; - for_each_sg(sgl, sg, data->dma_nents, i) + for_each_sg(data->sg, sg, data->dma_nents, i) iser_dbg("sg[%d] dma_addr:0x%lX page:0x%p " "off:0x%x sz:0x%x dma_len:0x%x\n", i, (unsigned long)ib_sg_dma_address(ibdev, sg), @@ -288,31 +359,6 @@ static void iser_dump_page_vec(struct iser_page_vec *page_vec) iser_err("%d %lx\n",i,(unsigned long)page_vec->pages[i]); } -static void iser_page_vec_build(struct iser_data_buf *data, - struct iser_page_vec *page_vec, - struct ib_device *ibdev) -{ - int page_vec_len = 0; - - page_vec->length = 0; - page_vec->offset = 0; - - iser_dbg("Translating sg sz: %d\n", data->dma_nents); - page_vec_len = iser_sg_to_page_vec(data, ibdev, page_vec->pages, - &page_vec->offset, - &page_vec->data_size); - iser_dbg("sg len %d page_vec_len %d\n", data->dma_nents, page_vec_len); - - page_vec->length = page_vec_len; - - if (page_vec_len * SIZE_4K < page_vec->data_size) { - iser_err("page_vec too short to hold this SG\n"); - iser_data_buf_dump(data, ibdev); - iser_dump_page_vec(page_vec); - BUG(); - } -} - int iser_dma_map_task_data(struct iscsi_iser_task *iser_task, struct iser_data_buf *data, enum iser_data_dir iser_dir, @@ -323,7 +369,7 @@ int iser_dma_map_task_data(struct iscsi_iser_task *iser_task, iser_task->dir[iser_dir] = 1; dev = iser_task->iser_conn->ib_conn.device->ib_device; - data->dma_nents = ib_dma_map_sg(dev, data->buf, data->size, dma_dir); + data->dma_nents = ib_dma_map_sg(dev, data->sg, data->size, dma_dir); if (data->dma_nents == 0) { iser_err("dma_map_sg failed!!!\n"); return -EINVAL; @@ -338,24 +384,41 @@ void iser_dma_unmap_task_data(struct iscsi_iser_task *iser_task, struct ib_device *dev; dev = iser_task->iser_conn->ib_conn.device->ib_device; - ib_dma_unmap_sg(dev, data->buf, data->size, dir); + ib_dma_unmap_sg(dev, data->sg, data->size, dir); +} + +static int +iser_reg_dma(struct iser_device *device, struct iser_data_buf *mem, + struct iser_mem_reg *reg) +{ + struct scatterlist *sg = mem->sg; + + reg->sge.lkey = device->mr->lkey; + reg->rkey = device->mr->rkey; + reg->sge.addr = ib_sg_dma_address(device->ib_device, &sg[0]); + reg->sge.length = ib_sg_dma_len(device->ib_device, &sg[0]); + + iser_dbg("Single DMA entry: lkey=0x%x, rkey=0x%x, addr=0x%llx," + " length=0x%x\n", reg->sge.lkey, reg->rkey, + reg->sge.addr, reg->sge.length); + + return 0; } static int fall_to_bounce_buf(struct iscsi_iser_task *iser_task, - struct ib_device *ibdev, struct iser_data_buf *mem, - struct iser_data_buf *mem_copy, enum iser_data_dir cmd_dir, int aligned_len) { - struct iscsi_conn *iscsi_conn = iser_task->iser_conn->iscsi_conn; + struct iscsi_conn *iscsi_conn = iser_task->iser_conn->iscsi_conn; + struct iser_device *device = iser_task->iser_conn->ib_conn.device; iscsi_conn->fmr_unalign_cnt++; iser_warn("rdma alignment violation (%d/%d aligned) or FMR not supported\n", aligned_len, mem->size); if (iser_debug_level > 0) - iser_data_buf_dump(mem, ibdev); + iser_data_buf_dump(mem, device->ib_device); /* unmap the command data before accessing it */ iser_dma_unmap_task_data(iser_task, mem, @@ -364,13 +427,95 @@ static int fall_to_bounce_buf(struct iscsi_iser_task *iser_task, /* allocate copy buf, if we are writing, copy the */ /* unaligned scatterlist, dma map the copy */ - if (iser_start_rdma_unaligned_sg(iser_task, mem, mem_copy, cmd_dir) != 0) + if (iser_start_rdma_unaligned_sg(iser_task, mem, cmd_dir) != 0) return -ENOMEM; return 0; } /** + * iser_reg_page_vec - Register physical memory + * + * returns: 0 on success, errno code on failure + */ +static +int iser_reg_page_vec(struct iscsi_iser_task *iser_task, + struct iser_data_buf *mem, + struct iser_page_vec *page_vec, + struct iser_mem_reg *mem_reg) +{ + struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn; + struct iser_device *device = ib_conn->device; + struct ib_pool_fmr *fmr; + int ret, plen; + + plen = iser_sg_to_page_vec(mem, device->ib_device, + page_vec->pages, + &page_vec->offset, + &page_vec->data_size); + page_vec->length = plen; + if (plen * SIZE_4K < page_vec->data_size) { + iser_err("page vec too short to hold this SG\n"); + iser_data_buf_dump(mem, device->ib_device); + iser_dump_page_vec(page_vec); + return -EINVAL; + } + + fmr = ib_fmr_pool_map_phys(ib_conn->fmr.pool, + page_vec->pages, + page_vec->length, + page_vec->pages[0]); + if (IS_ERR(fmr)) { + ret = PTR_ERR(fmr); + iser_err("ib_fmr_pool_map_phys failed: %d\n", ret); + return ret; + } + + mem_reg->sge.lkey = fmr->fmr->lkey; + mem_reg->rkey = fmr->fmr->rkey; + mem_reg->sge.addr = page_vec->pages[0] + page_vec->offset; + mem_reg->sge.length = page_vec->data_size; + mem_reg->mem_h = fmr; + + return 0; +} + +/** + * Unregister (previosuly registered using FMR) memory. + * If memory is non-FMR does nothing. + */ +void iser_unreg_mem_fmr(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir) +{ + struct iser_mem_reg *reg = &iser_task->rdma_reg[cmd_dir]; + int ret; + + if (!reg->mem_h) + return; + + iser_dbg("PHYSICAL Mem.Unregister mem_h %p\n", reg->mem_h); + + ret = ib_fmr_pool_unmap((struct ib_pool_fmr *)reg->mem_h); + if (ret) + iser_err("ib_fmr_pool_unmap failed %d\n", ret); + + reg->mem_h = NULL; +} + +void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task, + enum iser_data_dir cmd_dir) +{ + struct iser_mem_reg *reg = &iser_task->rdma_reg[cmd_dir]; + + if (!reg->mem_h) + return; + + iser_reg_desc_put(&iser_task->iser_conn->ib_conn, + reg->mem_h); + reg->mem_h = NULL; +} + +/** * iser_reg_rdma_mem_fmr - Registers memory intended for RDMA, * using FMR (if possible) obtaining rkey and va * @@ -383,45 +528,29 @@ int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *iser_task, struct iser_device *device = ib_conn->device; struct ib_device *ibdev = device->ib_device; struct iser_data_buf *mem = &iser_task->data[cmd_dir]; - struct iser_regd_buf *regd_buf; + struct iser_mem_reg *mem_reg; int aligned_len; int err; int i; - struct scatterlist *sg; - regd_buf = &iser_task->rdma_regd[cmd_dir]; + mem_reg = &iser_task->rdma_reg[cmd_dir]; aligned_len = iser_data_buf_aligned_len(mem, ibdev); if (aligned_len != mem->dma_nents) { - err = fall_to_bounce_buf(iser_task, ibdev, mem, - &iser_task->data_copy[cmd_dir], + err = fall_to_bounce_buf(iser_task, mem, cmd_dir, aligned_len); if (err) { iser_err("failed to allocate bounce buffer\n"); return err; } - mem = &iser_task->data_copy[cmd_dir]; } /* if there a single dma entry, FMR is not needed */ if (mem->dma_nents == 1) { - sg = (struct scatterlist *)mem->buf; - - regd_buf->reg.lkey = device->mr->lkey; - regd_buf->reg.rkey = device->mr->rkey; - regd_buf->reg.len = ib_sg_dma_len(ibdev, &sg[0]); - regd_buf->reg.va = ib_sg_dma_address(ibdev, &sg[0]); - - iser_dbg("PHYSICAL Mem.register: lkey: 0x%08X rkey: 0x%08X " - "va: 0x%08lX sz: %ld]\n", - (unsigned int)regd_buf->reg.lkey, - (unsigned int)regd_buf->reg.rkey, - (unsigned long)regd_buf->reg.va, - (unsigned long)regd_buf->reg.len); + return iser_reg_dma(device, mem, mem_reg); } else { /* use FMR for multiple dma entries */ - iser_page_vec_build(mem, ib_conn->fmr.page_vec, ibdev); - err = iser_reg_page_vec(ib_conn, ib_conn->fmr.page_vec, - ®d_buf->reg); + err = iser_reg_page_vec(iser_task, mem, ib_conn->fmr.page_vec, + mem_reg); if (err && err != -EAGAIN) { iser_data_buf_dump(mem, ibdev); iser_err("mem->dma_nents = %d (dlength = 0x%x)\n", @@ -519,8 +648,10 @@ iser_inv_rkey(struct ib_send_wr *inv_wr, struct ib_mr *mr) static int iser_reg_sig_mr(struct iscsi_iser_task *iser_task, - struct fast_reg_descriptor *desc, struct ib_sge *data_sge, - struct ib_sge *prot_sge, struct ib_sge *sig_sge) + struct fast_reg_descriptor *desc, + struct iser_mem_reg *data_reg, + struct iser_mem_reg *prot_reg, + struct iser_mem_reg *sig_reg) { struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn; struct iser_pi_context *pi_ctx = desc->pi_ctx; @@ -544,12 +675,12 @@ iser_reg_sig_mr(struct iscsi_iser_task *iser_task, memset(&sig_wr, 0, sizeof(sig_wr)); sig_wr.opcode = IB_WR_REG_SIG_MR; sig_wr.wr_id = ISER_FASTREG_LI_WRID; - sig_wr.sg_list = data_sge; + sig_wr.sg_list = &data_reg->sge; sig_wr.num_sge = 1; sig_wr.wr.sig_handover.sig_attrs = &sig_attrs; sig_wr.wr.sig_handover.sig_mr = pi_ctx->sig_mr; if (scsi_prot_sg_count(iser_task->sc)) - sig_wr.wr.sig_handover.prot = prot_sge; + sig_wr.wr.sig_handover.prot = &prot_reg->sge; sig_wr.wr.sig_handover.access_flags = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE; @@ -566,27 +697,26 @@ iser_reg_sig_mr(struct iscsi_iser_task *iser_task, } desc->reg_indicators &= ~ISER_SIG_KEY_VALID; - sig_sge->lkey = pi_ctx->sig_mr->lkey; - sig_sge->addr = 0; - sig_sge->length = scsi_transfer_length(iser_task->sc); + sig_reg->sge.lkey = pi_ctx->sig_mr->lkey; + sig_reg->rkey = pi_ctx->sig_mr->rkey; + sig_reg->sge.addr = 0; + sig_reg->sge.length = scsi_transfer_length(iser_task->sc); - iser_dbg("sig_sge: addr: 0x%llx length: %u lkey: 0x%x\n", - sig_sge->addr, sig_sge->length, - sig_sge->lkey); + iser_dbg("sig_sge: lkey: 0x%x, rkey: 0x%x, addr: 0x%llx, length: %u\n", + sig_reg->sge.lkey, sig_reg->rkey, sig_reg->sge.addr, + sig_reg->sge.length); err: return ret; } static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task, - struct iser_regd_buf *regd_buf, struct iser_data_buf *mem, + struct fast_reg_descriptor *desc, enum iser_reg_indicator ind, - struct ib_sge *sge) + struct iser_mem_reg *reg) { - struct fast_reg_descriptor *desc = regd_buf->reg.mem_h; struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn; struct iser_device *device = ib_conn->device; - struct ib_device *ibdev = device->ib_device; struct ib_mr *mr; struct ib_fast_reg_page_list *frpl; struct ib_send_wr fastreg_wr, inv_wr; @@ -594,17 +724,8 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task, int ret, offset, size, plen; /* if there a single dma entry, dma mr suffices */ - if (mem->dma_nents == 1) { - struct scatterlist *sg = (struct scatterlist *)mem->buf; - - sge->lkey = device->mr->lkey; - sge->addr = ib_sg_dma_address(ibdev, &sg[0]); - sge->length = ib_sg_dma_len(ibdev, &sg[0]); - - iser_dbg("Single DMA entry: lkey=0x%x, addr=0x%llx, length=0x%x\n", - sge->lkey, sge->addr, sge->length); - return 0; - } + if (mem->dma_nents == 1) + return iser_reg_dma(device, mem, reg); if (ind == ISER_DATA_KEY_VALID) { mr = desc->data_mr; @@ -652,9 +773,10 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task, } desc->reg_indicators &= ~ind; - sge->lkey = mr->lkey; - sge->addr = frpl->page_list[0] + offset; - sge->length = size; + reg->sge.lkey = mr->lkey; + reg->rkey = mr->rkey; + reg->sge.addr = frpl->page_list[0] + offset; + reg->sge.length = size; return ret; } @@ -672,93 +794,66 @@ int iser_reg_rdma_mem_fastreg(struct iscsi_iser_task *iser_task, struct iser_device *device = ib_conn->device; struct ib_device *ibdev = device->ib_device; struct iser_data_buf *mem = &iser_task->data[cmd_dir]; - struct iser_regd_buf *regd_buf = &iser_task->rdma_regd[cmd_dir]; + struct iser_mem_reg *mem_reg = &iser_task->rdma_reg[cmd_dir]; struct fast_reg_descriptor *desc = NULL; - struct ib_sge data_sge; int err, aligned_len; - unsigned long flags; aligned_len = iser_data_buf_aligned_len(mem, ibdev); if (aligned_len != mem->dma_nents) { - err = fall_to_bounce_buf(iser_task, ibdev, mem, - &iser_task->data_copy[cmd_dir], + err = fall_to_bounce_buf(iser_task, mem, cmd_dir, aligned_len); if (err) { iser_err("failed to allocate bounce buffer\n"); return err; } - mem = &iser_task->data_copy[cmd_dir]; } if (mem->dma_nents != 1 || scsi_get_prot_op(iser_task->sc) != SCSI_PROT_NORMAL) { - spin_lock_irqsave(&ib_conn->lock, flags); - desc = list_first_entry(&ib_conn->fastreg.pool, - struct fast_reg_descriptor, list); - list_del(&desc->list); - spin_unlock_irqrestore(&ib_conn->lock, flags); - regd_buf->reg.mem_h = desc; + desc = iser_reg_desc_get(ib_conn); + mem_reg->mem_h = desc; } - err = iser_fast_reg_mr(iser_task, regd_buf, mem, - ISER_DATA_KEY_VALID, &data_sge); + err = iser_fast_reg_mr(iser_task, mem, desc, + ISER_DATA_KEY_VALID, mem_reg); if (err) goto err_reg; if (scsi_get_prot_op(iser_task->sc) != SCSI_PROT_NORMAL) { - struct ib_sge prot_sge, sig_sge; + struct iser_mem_reg prot_reg; - memset(&prot_sge, 0, sizeof(prot_sge)); + memset(&prot_reg, 0, sizeof(prot_reg)); if (scsi_prot_sg_count(iser_task->sc)) { mem = &iser_task->prot[cmd_dir]; aligned_len = iser_data_buf_aligned_len(mem, ibdev); if (aligned_len != mem->dma_nents) { - err = fall_to_bounce_buf(iser_task, ibdev, mem, - &iser_task->prot_copy[cmd_dir], + err = fall_to_bounce_buf(iser_task, mem, cmd_dir, aligned_len); if (err) { iser_err("failed to allocate bounce buffer\n"); return err; } - mem = &iser_task->prot_copy[cmd_dir]; } - err = iser_fast_reg_mr(iser_task, regd_buf, mem, - ISER_PROT_KEY_VALID, &prot_sge); + err = iser_fast_reg_mr(iser_task, mem, desc, + ISER_PROT_KEY_VALID, &prot_reg); if (err) goto err_reg; } - err = iser_reg_sig_mr(iser_task, desc, &data_sge, - &prot_sge, &sig_sge); + err = iser_reg_sig_mr(iser_task, desc, mem_reg, + &prot_reg, mem_reg); if (err) { iser_err("Failed to register signature mr\n"); return err; } desc->reg_indicators |= ISER_FASTREG_PROTECTED; - - regd_buf->reg.lkey = sig_sge.lkey; - regd_buf->reg.rkey = desc->pi_ctx->sig_mr->rkey; - regd_buf->reg.va = sig_sge.addr; - regd_buf->reg.len = sig_sge.length; - } else { - if (desc) - regd_buf->reg.rkey = desc->data_mr->rkey; - else - regd_buf->reg.rkey = device->mr->rkey; - - regd_buf->reg.lkey = data_sge.lkey; - regd_buf->reg.va = data_sge.addr; - regd_buf->reg.len = data_sge.length; } return 0; err_reg: - if (desc) { - spin_lock_irqsave(&ib_conn->lock, flags); - list_add_tail(&desc->list, &ib_conn->fastreg.pool); - spin_unlock_irqrestore(&ib_conn->lock, flags); - } + if (desc) + iser_reg_desc_put(ib_conn, desc); return err; } diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index 4065abe28829..cc2dd35ffbc0 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -274,6 +274,65 @@ void iser_free_fmr_pool(struct ib_conn *ib_conn) } static int +iser_alloc_pi_ctx(struct ib_device *ib_device, struct ib_pd *pd, + struct fast_reg_descriptor *desc) +{ + struct iser_pi_context *pi_ctx = NULL; + struct ib_mr_init_attr mr_init_attr = {.max_reg_descriptors = 2, + .flags = IB_MR_SIGNATURE_EN}; + int ret = 0; + + desc->pi_ctx = kzalloc(sizeof(*desc->pi_ctx), GFP_KERNEL); + if (!desc->pi_ctx) + return -ENOMEM; + + pi_ctx = desc->pi_ctx; + + pi_ctx->prot_frpl = ib_alloc_fast_reg_page_list(ib_device, + ISCSI_ISER_SG_TABLESIZE); + if (IS_ERR(pi_ctx->prot_frpl)) { + ret = PTR_ERR(pi_ctx->prot_frpl); + goto prot_frpl_failure; + } + + pi_ctx->prot_mr = ib_alloc_fast_reg_mr(pd, + ISCSI_ISER_SG_TABLESIZE + 1); + if (IS_ERR(pi_ctx->prot_mr)) { + ret = PTR_ERR(pi_ctx->prot_mr); + goto prot_mr_failure; + } + desc->reg_indicators |= ISER_PROT_KEY_VALID; + + pi_ctx->sig_mr = ib_create_mr(pd, &mr_init_attr); + if (IS_ERR(pi_ctx->sig_mr)) { + ret = PTR_ERR(pi_ctx->sig_mr); + goto sig_mr_failure; + } + desc->reg_indicators |= ISER_SIG_KEY_VALID; + desc->reg_indicators &= ~ISER_FASTREG_PROTECTED; + + return 0; + +sig_mr_failure: + ib_dereg_mr(desc->pi_ctx->prot_mr); +prot_mr_failure: + ib_free_fast_reg_page_list(desc->pi_ctx->prot_frpl); +prot_frpl_failure: + kfree(desc->pi_ctx); + + return ret; +} + +static void +iser_free_pi_ctx(struct iser_pi_context *pi_ctx) +{ + ib_free_fast_reg_page_list(pi_ctx->prot_frpl); + ib_dereg_mr(pi_ctx->prot_mr); + ib_destroy_mr(pi_ctx->sig_mr); + kfree(pi_ctx); +} + +static int iser_create_fastreg_desc(struct ib_device *ib_device, struct ib_pd *pd, bool pi_enable, struct fast_reg_descriptor *desc) { @@ -297,59 +356,12 @@ iser_create_fastreg_desc(struct ib_device *ib_device, struct ib_pd *pd, desc->reg_indicators |= ISER_DATA_KEY_VALID; if (pi_enable) { - struct ib_mr_init_attr mr_init_attr = {0}; - struct iser_pi_context *pi_ctx = NULL; - - desc->pi_ctx = kzalloc(sizeof(*desc->pi_ctx), GFP_KERNEL); - if (!desc->pi_ctx) { - iser_err("Failed to allocate pi context\n"); - ret = -ENOMEM; + ret = iser_alloc_pi_ctx(ib_device, pd, desc); + if (ret) goto pi_ctx_alloc_failure; - } - pi_ctx = desc->pi_ctx; - - pi_ctx->prot_frpl = ib_alloc_fast_reg_page_list(ib_device, - ISCSI_ISER_SG_TABLESIZE); - if (IS_ERR(pi_ctx->prot_frpl)) { - ret = PTR_ERR(pi_ctx->prot_frpl); - iser_err("Failed to allocate prot frpl ret=%d\n", - ret); - goto prot_frpl_failure; - } - - pi_ctx->prot_mr = ib_alloc_fast_reg_mr(pd, - ISCSI_ISER_SG_TABLESIZE + 1); - if (IS_ERR(pi_ctx->prot_mr)) { - ret = PTR_ERR(pi_ctx->prot_mr); - iser_err("Failed to allocate prot frmr ret=%d\n", - ret); - goto prot_mr_failure; - } - desc->reg_indicators |= ISER_PROT_KEY_VALID; - - mr_init_attr.max_reg_descriptors = 2; - mr_init_attr.flags |= IB_MR_SIGNATURE_EN; - pi_ctx->sig_mr = ib_create_mr(pd, &mr_init_attr); - if (IS_ERR(pi_ctx->sig_mr)) { - ret = PTR_ERR(pi_ctx->sig_mr); - iser_err("Failed to allocate signature enabled mr err=%d\n", - ret); - goto sig_mr_failure; - } - desc->reg_indicators |= ISER_SIG_KEY_VALID; } - desc->reg_indicators &= ~ISER_FASTREG_PROTECTED; - - iser_dbg("Create fr_desc %p page_list %p\n", - desc, desc->data_frpl->page_list); return 0; -sig_mr_failure: - ib_dereg_mr(desc->pi_ctx->prot_mr); -prot_mr_failure: - ib_free_fast_reg_page_list(desc->pi_ctx->prot_frpl); -prot_frpl_failure: - kfree(desc->pi_ctx); pi_ctx_alloc_failure: ib_dereg_mr(desc->data_mr); fast_reg_mr_failure: @@ -416,12 +428,8 @@ void iser_free_fastreg_pool(struct ib_conn *ib_conn) list_del(&desc->list); ib_free_fast_reg_page_list(desc->data_frpl); ib_dereg_mr(desc->data_mr); - if (desc->pi_ctx) { - ib_free_fast_reg_page_list(desc->pi_ctx->prot_frpl); - ib_dereg_mr(desc->pi_ctx->prot_mr); - ib_destroy_mr(desc->pi_ctx->sig_mr); - kfree(desc->pi_ctx); - } + if (desc->pi_ctx) + iser_free_pi_ctx(desc->pi_ctx); kfree(desc); ++i; } @@ -721,7 +729,7 @@ static void iser_connect_error(struct rdma_cm_id *cma_id) struct iser_conn *iser_conn; iser_conn = (struct iser_conn *)cma_id->context; - iser_conn->state = ISER_CONN_DOWN; + iser_conn->state = ISER_CONN_TERMINATING; } /** @@ -992,93 +1000,6 @@ connect_failure: return err; } -/** - * iser_reg_page_vec - Register physical memory - * - * returns: 0 on success, errno code on failure - */ -int iser_reg_page_vec(struct ib_conn *ib_conn, - struct iser_page_vec *page_vec, - struct iser_mem_reg *mem_reg) -{ - struct ib_pool_fmr *mem; - u64 io_addr; - u64 *page_list; - int status; - - page_list = page_vec->pages; - io_addr = page_list[0]; - - mem = ib_fmr_pool_map_phys(ib_conn->fmr.pool, - page_list, - page_vec->length, - io_addr); - - if (IS_ERR(mem)) { - status = (int)PTR_ERR(mem); - iser_err("ib_fmr_pool_map_phys failed: %d\n", status); - return status; - } - - mem_reg->lkey = mem->fmr->lkey; - mem_reg->rkey = mem->fmr->rkey; - mem_reg->len = page_vec->length * SIZE_4K; - mem_reg->va = io_addr; - mem_reg->mem_h = (void *)mem; - - mem_reg->va += page_vec->offset; - mem_reg->len = page_vec->data_size; - - iser_dbg("PHYSICAL Mem.register, [PHYS p_array: 0x%p, sz: %d, " - "entry[0]: (0x%08lx,%ld)] -> " - "[lkey: 0x%08X mem_h: 0x%p va: 0x%08lX sz: %ld]\n", - page_vec, page_vec->length, - (unsigned long)page_vec->pages[0], - (unsigned long)page_vec->data_size, - (unsigned int)mem_reg->lkey, mem_reg->mem_h, - (unsigned long)mem_reg->va, (unsigned long)mem_reg->len); - return 0; -} - -/** - * Unregister (previosuly registered using FMR) memory. - * If memory is non-FMR does nothing. - */ -void iser_unreg_mem_fmr(struct iscsi_iser_task *iser_task, - enum iser_data_dir cmd_dir) -{ - struct iser_mem_reg *reg = &iser_task->rdma_regd[cmd_dir].reg; - int ret; - - if (!reg->mem_h) - return; - - iser_dbg("PHYSICAL Mem.Unregister mem_h %p\n",reg->mem_h); - - ret = ib_fmr_pool_unmap((struct ib_pool_fmr *)reg->mem_h); - if (ret) - iser_err("ib_fmr_pool_unmap failed %d\n", ret); - - reg->mem_h = NULL; -} - -void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task, - enum iser_data_dir cmd_dir) -{ - struct iser_mem_reg *reg = &iser_task->rdma_regd[cmd_dir].reg; - struct iser_conn *iser_conn = iser_task->iser_conn; - struct ib_conn *ib_conn = &iser_conn->ib_conn; - struct fast_reg_descriptor *desc = reg->mem_h; - - if (!desc) - return; - - reg->mem_h = NULL; - spin_lock_bh(&ib_conn->lock); - list_add_tail(&desc->list, &ib_conn->fastreg.pool); - spin_unlock_bh(&ib_conn->lock); -} - int iser_post_recvl(struct iser_conn *iser_conn) { struct ib_recv_wr rx_wr, *rx_wr_failed; @@ -1210,6 +1131,9 @@ iser_handle_comp_error(struct ib_conn *ib_conn, iscsi_conn_failure(iser_conn->iscsi_conn, ISCSI_ERR_CONN_FAILED); + if (wc->wr_id == ISER_FASTREG_LI_WRID) + return; + if (is_iser_tx_desc(iser_conn, wr_id)) { struct iser_tx_desc *desc = wr_id; @@ -1254,13 +1178,11 @@ static void iser_handle_wc(struct ib_wc *wc) else iser_dbg("flush error: wr id %llx\n", wc->wr_id); - if (wc->wr_id != ISER_FASTREG_LI_WRID && - wc->wr_id != ISER_BEACON_WRID) - iser_handle_comp_error(ib_conn, wc); - - /* complete in case all flush errors were consumed */ if (wc->wr_id == ISER_BEACON_WRID) + /* all flush errors were consumed */ complete(&ib_conn->flush_comp); + else + iser_handle_comp_error(ib_conn, wc); } } @@ -1306,7 +1228,7 @@ static void iser_cq_callback(struct ib_cq *cq, void *cq_context) u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task, enum iser_data_dir cmd_dir, sector_t *sector) { - struct iser_mem_reg *reg = &iser_task->rdma_regd[cmd_dir].reg; + struct iser_mem_reg *reg = &iser_task->rdma_reg[cmd_dir]; struct fast_reg_descriptor *desc = reg->mem_h; unsigned long sector_size = iser_task->sc->device->sector_size; struct ib_mr_status mr_status; diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 0747c0595a9d..918814cd0f80 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -40,6 +40,7 @@ #include <linux/parser.h> #include <linux/random.h> #include <linux/jiffies.h> +#include <rdma/ib_cache.h> #include <linux/atomic.h> @@ -265,10 +266,10 @@ static int srp_init_qp(struct srp_target_port *target, if (!attr) return -ENOMEM; - ret = ib_find_pkey(target->srp_host->srp_dev->dev, - target->srp_host->port, - be16_to_cpu(target->pkey), - &attr->pkey_index); + ret = ib_find_cached_pkey(target->srp_host->srp_dev->dev, + target->srp_host->port, + be16_to_cpu(target->pkey), + &attr->pkey_index); if (ret) goto out; diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 6e0a477681e9..4b9b866e6b0d 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -207,7 +207,7 @@ static void srpt_event_handler(struct ib_event_handler *handler, } break; default: - printk(KERN_ERR "received unrecognized IB event %d\n", + pr_err("received unrecognized IB event %d\n", event->event); break; } @@ -218,7 +218,7 @@ static void srpt_event_handler(struct ib_event_handler *handler, */ static void srpt_srq_event(struct ib_event *event, void *ctx) { - printk(KERN_INFO "SRQ event %d\n", event->event); + pr_info("SRQ event %d\n", event->event); } /** @@ -242,8 +242,7 @@ static void srpt_qp_event(struct ib_event *event, struct srpt_rdma_ch *ch) ch->sess_name, srpt_get_ch_state(ch)); break; default: - printk(KERN_ERR "received unrecognized IB QP event %d\n", - event->event); + pr_err("received unrecognized IB QP event %d\n", event->event); break; } } @@ -602,7 +601,7 @@ static void srpt_unregister_mad_agent(struct srpt_device *sdev) sport = &sdev->port[i - 1]; WARN_ON(sport->port != i); if (ib_modify_port(sdev->device, i, 0, &port_modify) < 0) - printk(KERN_ERR "disabling MAD processing failed.\n"); + pr_err("disabling MAD processing failed.\n"); if (sport->mad_agent) { ib_unregister_mad_agent(sport->mad_agent); sport->mad_agent = NULL; @@ -810,7 +809,7 @@ static int srpt_post_send(struct srpt_rdma_ch *ch, ret = -ENOMEM; if (unlikely(atomic_dec_return(&ch->sq_wr_avail) < 0)) { - printk(KERN_WARNING "IB send queue full (needed 1)\n"); + pr_warn("IB send queue full (needed 1)\n"); goto out; } @@ -912,7 +911,7 @@ static int srpt_get_desc_tbl(struct srpt_send_ioctx *ioctx, if (ioctx->n_rbuf > (srp_cmd->data_out_desc_cnt + srp_cmd->data_in_desc_cnt)) { - printk(KERN_ERR "received unsupported SRP_CMD request" + pr_err("received unsupported SRP_CMD request" " type (%u out + %u in != %u / %zu)\n", srp_cmd->data_out_desc_cnt, srp_cmd->data_in_desc_cnt, @@ -1432,7 +1431,7 @@ static void srpt_handle_send_comp(struct srpt_rdma_ch *ch, srpt_unmap_sg_to_ib_sge(ch, ioctx); transport_generic_free_cmd(&ioctx->cmd, 0); } else { - printk(KERN_ERR "IB completion has been received too late for" + pr_err("IB completion has been received too late for" " wr_id = %u.\n", ioctx->ioctx.index); } } @@ -1457,7 +1456,7 @@ static void srpt_handle_rdma_comp(struct srpt_rdma_ch *ch, SRPT_STATE_DATA_IN)) target_execute_cmd(&ioctx->cmd); else - printk(KERN_ERR "%s[%d]: wrong state = %d\n", __func__, + pr_err("%s[%d]: wrong state = %d\n", __func__, __LINE__, srpt_get_cmd_state(ioctx)); } else if (opcode == SRPT_RDMA_ABORT) { ioctx->rdma_aborted = true; @@ -1481,7 +1480,7 @@ static void srpt_handle_rdma_err_comp(struct srpt_rdma_ch *ch, switch (opcode) { case SRPT_RDMA_READ_LAST: if (ioctx->n_rdma <= 0) { - printk(KERN_ERR "Received invalid RDMA read" + pr_err("Received invalid RDMA read" " error completion with idx %d\n", ioctx->ioctx.index); break; @@ -1490,14 +1489,13 @@ static void srpt_handle_rdma_err_comp(struct srpt_rdma_ch *ch, if (state == SRPT_STATE_NEED_DATA) srpt_abort_cmd(ioctx); else - printk(KERN_ERR "%s[%d]: wrong state = %d\n", + pr_err("%s[%d]: wrong state = %d\n", __func__, __LINE__, state); break; case SRPT_RDMA_WRITE_LAST: break; default: - printk(KERN_ERR "%s[%d]: opcode = %u\n", __func__, - __LINE__, opcode); + pr_err("%s[%d]: opcode = %u\n", __func__, __LINE__, opcode); break; } } @@ -1549,8 +1547,8 @@ static int srpt_build_cmd_rsp(struct srpt_rdma_ch *ch, BUILD_BUG_ON(MIN_MAX_RSP_SIZE <= sizeof(*srp_rsp)); max_sense_len = ch->max_ti_iu_len - sizeof(*srp_rsp); if (sense_data_len > max_sense_len) { - printk(KERN_WARNING "truncated sense data from %d to %d" - " bytes\n", sense_data_len, max_sense_len); + pr_warn("truncated sense data from %d to %d" + " bytes\n", sense_data_len, max_sense_len); sense_data_len = max_sense_len; } @@ -1628,8 +1626,8 @@ static uint64_t srpt_unpack_lun(const uint8_t *lun, int len) int addressing_method; if (unlikely(len < 2)) { - printk(KERN_ERR "Illegal LUN length %d, expected 2 bytes or " - "more", len); + pr_err("Illegal LUN length %d, expected 2 bytes or more\n", + len); goto out; } @@ -1663,7 +1661,7 @@ static uint64_t srpt_unpack_lun(const uint8_t *lun, int len) case SCSI_LUN_ADDR_METHOD_EXTENDED_LUN: default: - printk(KERN_ERR "Unimplemented LUN addressing method %u", + pr_err("Unimplemented LUN addressing method %u\n", addressing_method); break; } @@ -1672,8 +1670,7 @@ out: return res; out_err: - printk(KERN_ERR "Support for multi-level LUNs has not yet been" - " implemented"); + pr_err("Support for multi-level LUNs has not yet been implemented\n"); goto out; } @@ -1723,7 +1720,7 @@ static int srpt_handle_cmd(struct srpt_rdma_ch *ch, } if (srpt_get_desc_tbl(send_ioctx, srp_cmd, &dir, &data_len)) { - printk(KERN_ERR "0x%llx: parsing SRP descriptor table failed.\n", + pr_err("0x%llx: parsing SRP descriptor table failed.\n", srp_cmd->tag); ret = TCM_INVALID_CDB_FIELD; goto send_sense; @@ -1912,7 +1909,7 @@ static void srpt_handle_new_iu(struct srpt_rdma_ch *ch, srpt_handle_tsk_mgmt(ch, recv_ioctx, send_ioctx); break; case SRP_I_LOGOUT: - printk(KERN_ERR "Not yet implemented: SRP_I_LOGOUT\n"); + pr_err("Not yet implemented: SRP_I_LOGOUT\n"); break; case SRP_CRED_RSP: pr_debug("received SRP_CRED_RSP\n"); @@ -1921,10 +1918,10 @@ static void srpt_handle_new_iu(struct srpt_rdma_ch *ch, pr_debug("received SRP_AER_RSP\n"); break; case SRP_RSP: - printk(KERN_ERR "Received SRP_RSP\n"); + pr_err("Received SRP_RSP\n"); break; default: - printk(KERN_ERR "received IU with unknown opcode 0x%x\n", + pr_err("received IU with unknown opcode 0x%x\n", srp_cmd->opcode); break; } @@ -1948,12 +1945,12 @@ static void srpt_process_rcv_completion(struct ib_cq *cq, req_lim = atomic_dec_return(&ch->req_lim); if (unlikely(req_lim < 0)) - printk(KERN_ERR "req_lim = %d < 0\n", req_lim); + pr_err("req_lim = %d < 0\n", req_lim); ioctx = sdev->ioctx_ring[index]; srpt_handle_new_iu(ch, ioctx, NULL); } else { - printk(KERN_INFO "receiving failed for idx %u with status %d\n", - index, wc->status); + pr_info("receiving failed for idx %u with status %d\n", + index, wc->status); } } @@ -1993,12 +1990,12 @@ static void srpt_process_send_completion(struct ib_cq *cq, } } else { if (opcode == SRPT_SEND) { - printk(KERN_INFO "sending response for idx %u failed" - " with status %d\n", index, wc->status); + pr_info("sending response for idx %u failed" + " with status %d\n", index, wc->status); srpt_handle_send_err_comp(ch, wc->wr_id); } else if (opcode != SRPT_RDMA_MID) { - printk(KERN_INFO "RDMA t %d for idx %u failed with" - " status %d", opcode, index, wc->status); + pr_info("RDMA t %d for idx %u failed with" + " status %d\n", opcode, index, wc->status); srpt_handle_rdma_err_comp(ch, send_ioctx, opcode); } } @@ -2062,15 +2059,15 @@ static int srpt_compl_thread(void *arg) ch = arg; BUG_ON(!ch); - printk(KERN_INFO "Session %s: kernel thread %s (PID %d) started\n", - ch->sess_name, ch->thread->comm, current->pid); + pr_info("Session %s: kernel thread %s (PID %d) started\n", + ch->sess_name, ch->thread->comm, current->pid); while (!kthread_should_stop()) { wait_event_interruptible(ch->wait_queue, (srpt_process_completion(ch->cq, ch), kthread_should_stop())); } - printk(KERN_INFO "Session %s: kernel thread %s (PID %d) stopped\n", - ch->sess_name, ch->thread->comm, current->pid); + pr_info("Session %s: kernel thread %s (PID %d) stopped\n", + ch->sess_name, ch->thread->comm, current->pid); return 0; } @@ -2097,7 +2094,7 @@ retry: ch->rq_size + srp_sq_size, 0); if (IS_ERR(ch->cq)) { ret = PTR_ERR(ch->cq); - printk(KERN_ERR "failed to create CQ cqe= %d ret= %d\n", + pr_err("failed to create CQ cqe= %d ret= %d\n", ch->rq_size + srp_sq_size, ret); goto out; } @@ -2123,7 +2120,7 @@ retry: goto retry; } } - printk(KERN_ERR "failed to create_qp ret= %d\n", ret); + pr_err("failed to create_qp ret= %d\n", ret); goto err_destroy_cq; } @@ -2143,7 +2140,7 @@ retry: ch->thread = kthread_run(srpt_compl_thread, ch, "ib_srpt_compl"); if (IS_ERR(ch->thread)) { - printk(KERN_ERR "failed to create kernel thread %ld\n", + pr_err("failed to create kernel thread %ld\n", PTR_ERR(ch->thread)); ch->thread = NULL; goto err_destroy_qp; @@ -2204,7 +2201,7 @@ static void __srpt_close_ch(struct srpt_rdma_ch *ch) /* fall through */ case CH_LIVE: if (ib_send_cm_dreq(ch->cm_id, NULL, 0) < 0) - printk(KERN_ERR "sending CM DREQ failed.\n"); + pr_err("sending CM DREQ failed.\n"); break; case CH_DISCONNECTING: break; @@ -2291,7 +2288,7 @@ static void srpt_drain_channel(struct ib_cm_id *cm_id) ret = srpt_ch_qp_err(ch); if (ret < 0) - printk(KERN_ERR "Setting queue pair in error state" + pr_err("Setting queue pair in error state" " failed: %d\n", ret); } } @@ -2435,17 +2432,17 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, it_iu_len = be32_to_cpu(req->req_it_iu_len); - printk(KERN_INFO "Received SRP_LOGIN_REQ with i_port_id 0x%llx:0x%llx," - " t_port_id 0x%llx:0x%llx and it_iu_len %d on port %d" - " (guid=0x%llx:0x%llx)\n", - be64_to_cpu(*(__be64 *)&req->initiator_port_id[0]), - be64_to_cpu(*(__be64 *)&req->initiator_port_id[8]), - be64_to_cpu(*(__be64 *)&req->target_port_id[0]), - be64_to_cpu(*(__be64 *)&req->target_port_id[8]), - it_iu_len, - param->port, - be64_to_cpu(*(__be64 *)&sdev->port[param->port - 1].gid.raw[0]), - be64_to_cpu(*(__be64 *)&sdev->port[param->port - 1].gid.raw[8])); + pr_info("Received SRP_LOGIN_REQ with i_port_id 0x%llx:0x%llx," + " t_port_id 0x%llx:0x%llx and it_iu_len %d on port %d" + " (guid=0x%llx:0x%llx)\n", + be64_to_cpu(*(__be64 *)&req->initiator_port_id[0]), + be64_to_cpu(*(__be64 *)&req->initiator_port_id[8]), + be64_to_cpu(*(__be64 *)&req->target_port_id[0]), + be64_to_cpu(*(__be64 *)&req->target_port_id[8]), + it_iu_len, + param->port, + be64_to_cpu(*(__be64 *)&sdev->port[param->port - 1].gid.raw[0]), + be64_to_cpu(*(__be64 *)&sdev->port[param->port - 1].gid.raw[8])); rsp = kzalloc(sizeof *rsp, GFP_KERNEL); rej = kzalloc(sizeof *rej, GFP_KERNEL); @@ -2460,7 +2457,7 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, rej->reason = __constant_cpu_to_be32( SRP_LOGIN_REJ_REQ_IT_IU_LENGTH_TOO_LARGE); ret = -EINVAL; - printk(KERN_ERR "rejected SRP_LOGIN_REQ because its" + pr_err("rejected SRP_LOGIN_REQ because its" " length (%d bytes) is out of range (%d .. %d)\n", it_iu_len, 64, srp_max_req_size); goto reject; @@ -2470,7 +2467,7 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, rej->reason = __constant_cpu_to_be32( SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); ret = -EINVAL; - printk(KERN_ERR "rejected SRP_LOGIN_REQ because the target port" + pr_err("rejected SRP_LOGIN_REQ because the target port" " has not yet been enabled\n"); goto reject; } @@ -2516,7 +2513,7 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, rej->reason = __constant_cpu_to_be32( SRP_LOGIN_REJ_UNABLE_ASSOCIATE_CHANNEL); ret = -ENOMEM; - printk(KERN_ERR "rejected SRP_LOGIN_REQ because it" + pr_err("rejected SRP_LOGIN_REQ because it" " has an invalid target port identifier.\n"); goto reject; } @@ -2525,7 +2522,7 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, if (!ch) { rej->reason = __constant_cpu_to_be32( SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); - printk(KERN_ERR "rejected SRP_LOGIN_REQ because no memory.\n"); + pr_err("rejected SRP_LOGIN_REQ because no memory.\n"); ret = -ENOMEM; goto reject; } @@ -2562,7 +2559,7 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, if (ret) { rej->reason = __constant_cpu_to_be32( SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); - printk(KERN_ERR "rejected SRP_LOGIN_REQ because creating" + pr_err("rejected SRP_LOGIN_REQ because creating" " a new RDMA channel failed.\n"); goto free_ring; } @@ -2571,7 +2568,7 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, if (ret) { rej->reason = __constant_cpu_to_be32( SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); - printk(KERN_ERR "rejected SRP_LOGIN_REQ because enabling" + pr_err("rejected SRP_LOGIN_REQ because enabling" " RTR failed (error code = %d)\n", ret); goto destroy_ib; } @@ -2586,8 +2583,8 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, nacl = srpt_lookup_acl(sport, ch->i_port_id); if (!nacl) { - printk(KERN_INFO "Rejected login because no ACL has been" - " configured yet for initiator %s.\n", ch->sess_name); + pr_info("Rejected login because no ACL has been" + " configured yet for initiator %s.\n", ch->sess_name); rej->reason = __constant_cpu_to_be32( SRP_LOGIN_REJ_CHANNEL_LIMIT_REACHED); goto destroy_ib; @@ -2631,7 +2628,7 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, ret = ib_send_cm_rep(cm_id, rep_param); if (ret) { - printk(KERN_ERR "sending SRP_LOGIN_REQ response failed" + pr_err("sending SRP_LOGIN_REQ response failed" " (error code = %d)\n", ret); goto release_channel; } @@ -2679,7 +2676,7 @@ out: static void srpt_cm_rej_recv(struct ib_cm_id *cm_id) { - printk(KERN_INFO "Received IB REJ for cm_id %p.\n", cm_id); + pr_info("Received IB REJ for cm_id %p.\n", cm_id); srpt_drain_channel(cm_id); } @@ -2714,13 +2711,13 @@ static void srpt_cm_rtu_recv(struct ib_cm_id *cm_id) static void srpt_cm_timewait_exit(struct ib_cm_id *cm_id) { - printk(KERN_INFO "Received IB TimeWait exit for cm_id %p.\n", cm_id); + pr_info("Received IB TimeWait exit for cm_id %p.\n", cm_id); srpt_drain_channel(cm_id); } static void srpt_cm_rep_error(struct ib_cm_id *cm_id) { - printk(KERN_INFO "Received IB REP error for cm_id %p.\n", cm_id); + pr_info("Received IB REP error for cm_id %p.\n", cm_id); srpt_drain_channel(cm_id); } @@ -2755,9 +2752,9 @@ static void srpt_cm_dreq_recv(struct ib_cm_id *cm_id) if (send_drep) { if (ib_send_cm_drep(ch->cm_id, NULL, 0) < 0) - printk(KERN_ERR "Sending IB DREP failed.\n"); - printk(KERN_INFO "Received DREQ and sent DREP for session %s.\n", - ch->sess_name); + pr_err("Sending IB DREP failed.\n"); + pr_info("Received DREQ and sent DREP for session %s.\n", + ch->sess_name); } } @@ -2766,8 +2763,7 @@ static void srpt_cm_dreq_recv(struct ib_cm_id *cm_id) */ static void srpt_cm_drep_recv(struct ib_cm_id *cm_id) { - printk(KERN_INFO "Received InfiniBand DREP message for cm_id %p.\n", - cm_id); + pr_info("Received InfiniBand DREP message for cm_id %p.\n", cm_id); srpt_drain_channel(cm_id); } @@ -2811,14 +2807,13 @@ static int srpt_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) srpt_cm_rep_error(cm_id); break; case IB_CM_DREQ_ERROR: - printk(KERN_INFO "Received IB DREQ ERROR event.\n"); + pr_info("Received IB DREQ ERROR event.\n"); break; case IB_CM_MRA_RECEIVED: - printk(KERN_INFO "Received IB MRA event\n"); + pr_info("Received IB MRA event\n"); break; default: - printk(KERN_ERR "received unrecognized IB CM event %d\n", - event->event); + pr_err("received unrecognized IB CM event %d\n", event->event); break; } @@ -2848,8 +2843,8 @@ static int srpt_perform_rdmas(struct srpt_rdma_ch *ch, ret = -ENOMEM; sq_wr_avail = atomic_sub_return(n_rdma, &ch->sq_wr_avail); if (sq_wr_avail < 0) { - printk(KERN_WARNING "IB send queue full (needed %d)\n", - n_rdma); + pr_warn("IB send queue full (needed %d)\n", + n_rdma); goto out; } } @@ -2889,7 +2884,7 @@ static int srpt_perform_rdmas(struct srpt_rdma_ch *ch, } if (ret) - printk(KERN_ERR "%s[%d]: ib_post_send() returned %d for %d/%d", + pr_err("%s[%d]: ib_post_send() returned %d for %d/%d\n", __func__, __LINE__, ret, i, n_rdma); if (ret && i > 0) { wr.num_sge = 0; @@ -2897,12 +2892,12 @@ static int srpt_perform_rdmas(struct srpt_rdma_ch *ch, wr.send_flags = IB_SEND_SIGNALED; while (ch->state == CH_LIVE && ib_post_send(ch->qp, &wr, &bad_wr) != 0) { - printk(KERN_INFO "Trying to abort failed RDMA transfer [%d]", + pr_info("Trying to abort failed RDMA transfer [%d]\n", ioctx->ioctx.index); msleep(1000); } while (ch->state != CH_RELEASING && !ioctx->rdma_aborted) { - printk(KERN_INFO "Waiting until RDMA abort finished [%d]", + pr_info("Waiting until RDMA abort finished [%d]\n", ioctx->ioctx.index); msleep(1000); } @@ -2923,17 +2918,17 @@ static int srpt_xfer_data(struct srpt_rdma_ch *ch, ret = srpt_map_sg_to_ib_sge(ch, ioctx); if (ret) { - printk(KERN_ERR "%s[%d] ret=%d\n", __func__, __LINE__, ret); + pr_err("%s[%d] ret=%d\n", __func__, __LINE__, ret); goto out; } ret = srpt_perform_rdmas(ch, ioctx); if (ret) { if (ret == -EAGAIN || ret == -ENOMEM) - printk(KERN_INFO "%s[%d] queue full -- ret=%d\n", - __func__, __LINE__, ret); + pr_info("%s[%d] queue full -- ret=%d\n", + __func__, __LINE__, ret); else - printk(KERN_ERR "%s[%d] fatal error -- ret=%d\n", + pr_err("%s[%d] fatal error -- ret=%d\n", __func__, __LINE__, ret); goto out_unmap; } @@ -3058,7 +3053,7 @@ static void srpt_queue_response(struct se_cmd *cmd) !ioctx->queue_status_only) { ret = srpt_xfer_data(ch, ioctx); if (ret) { - printk(KERN_ERR "xfer_data failed for tag %llu\n", + pr_err("xfer_data failed for tag %llu\n", ioctx->tag); return; } @@ -3075,7 +3070,7 @@ static void srpt_queue_response(struct se_cmd *cmd) } ret = srpt_post_send(ch, ioctx, resp_len); if (ret) { - printk(KERN_ERR "sending cmd response failed for tag %llu\n", + pr_err("sending cmd response failed for tag %llu\n", ioctx->tag); srpt_unmap_sg_to_ib_sge(ch, ioctx); srpt_set_cmd_state(ioctx, SRPT_STATE_DONE); @@ -3154,7 +3149,7 @@ static int srpt_release_sdev(struct srpt_device *sdev) res = wait_event_interruptible(sdev->ch_releaseQ, srpt_ch_list_empty(sdev)); if (res) - printk(KERN_ERR "%s: interrupted.\n", __func__); + pr_err("%s: interrupted.\n", __func__); return 0; } @@ -3293,7 +3288,7 @@ static void srpt_add_one(struct ib_device *device) spin_lock_init(&sport->port_acl_lock); if (srpt_refresh_port(sport)) { - printk(KERN_ERR "MAD registration failed for %s-%d.\n", + pr_err("MAD registration failed for %s-%d.\n", srpt_sdev_name(sdev), i); goto err_ring; } @@ -3330,7 +3325,7 @@ free_dev: kfree(sdev); err: sdev = NULL; - printk(KERN_INFO "%s(%s) failed.\n", __func__, device->name); + pr_info("%s(%s) failed.\n", __func__, device->name); goto out; } @@ -3344,8 +3339,7 @@ static void srpt_remove_one(struct ib_device *device) sdev = ib_get_client_data(device, &srpt_client); if (!sdev) { - printk(KERN_INFO "%s(%s): nothing to do.\n", __func__, - device->name); + pr_info("%s(%s): nothing to do.\n", __func__, device->name); return; } @@ -3464,7 +3458,7 @@ static struct se_node_acl *srpt_alloc_fabric_acl(struct se_portal_group *se_tpg) nacl = kzalloc(sizeof(struct srpt_node_acl), GFP_KERNEL); if (!nacl) { - printk(KERN_ERR "Unable to allocate struct srpt_node_acl\n"); + pr_err("Unable to allocate struct srpt_node_acl\n"); return NULL; } @@ -3615,7 +3609,7 @@ static struct se_node_acl *srpt_make_nodeacl(struct se_portal_group *tpg, u8 i_port_id[16]; if (srpt_parse_i_port_id(i_port_id, name) < 0) { - printk(KERN_ERR "invalid initiator port ID %s\n", name); + pr_err("invalid initiator port ID %s\n", name); ret = -EINVAL; goto err; } @@ -3816,12 +3810,12 @@ static ssize_t srpt_tpg_store_enable( ret = kstrtoul(page, 0, &tmp); if (ret < 0) { - printk(KERN_ERR "Unable to extract srpt_tpg_store_enable\n"); + pr_err("Unable to extract srpt_tpg_store_enable\n"); return -EINVAL; } if ((tmp != 0) && (tmp != 1)) { - printk(KERN_ERR "Illegal value for srpt_tpg_store_enable: %lu\n", tmp); + pr_err("Illegal value for srpt_tpg_store_enable: %lu\n", tmp); return -EINVAL; } if (tmp == 1) @@ -3980,7 +3974,7 @@ static int __init srpt_init_module(void) ret = -EINVAL; if (srp_max_req_size < MIN_MAX_REQ_SIZE) { - printk(KERN_ERR "invalid value %d for kernel module parameter" + pr_err("invalid value %d for kernel module parameter" " srp_max_req_size -- must be at least %d.\n", srp_max_req_size, MIN_MAX_REQ_SIZE); goto out; @@ -3988,7 +3982,7 @@ static int __init srpt_init_module(void) if (srpt_srq_size < MIN_SRPT_SRQ_SIZE || srpt_srq_size > MAX_SRPT_SRQ_SIZE) { - printk(KERN_ERR "invalid value %d for kernel module parameter" + pr_err("invalid value %d for kernel module parameter" " srpt_srq_size -- must be in the range [%d..%d].\n", srpt_srq_size, MIN_SRPT_SRQ_SIZE, MAX_SRPT_SRQ_SIZE); goto out; @@ -3996,7 +3990,7 @@ static int __init srpt_init_module(void) srpt_target = target_fabric_configfs_init(THIS_MODULE, "srpt"); if (IS_ERR(srpt_target)) { - printk(KERN_ERR "couldn't register\n"); + pr_err("couldn't register\n"); ret = PTR_ERR(srpt_target); goto out; } @@ -4018,13 +4012,13 @@ static int __init srpt_init_module(void) ret = target_fabric_configfs_register(srpt_target); if (ret < 0) { - printk(KERN_ERR "couldn't register\n"); + pr_err("couldn't register\n"); goto out_free_target; } ret = ib_register_client(&srpt_client); if (ret) { - printk(KERN_ERR "couldn't register IB client\n"); + pr_err("couldn't register IB client\n"); goto out_unregister_target; } diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c index f0fbb4ade85d..4f7dc044601e 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c @@ -939,21 +939,34 @@ static int mlx4_MAD_IFC_wrapper(struct mlx4_dev *dev, int slave, return err; } if (smp->attr_id == IB_SMP_ATTR_GUID_INFO) { - /* compute slave's gid block */ - smp->attr_mod = cpu_to_be32(slave / 8); - /* execute cmd */ - err = mlx4_cmd_box(dev, inbox->dma, outbox->dma, - vhcr->in_modifier, opcode_modifier, - vhcr->op, MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE); - if (!err) { - /* if needed, move slave gid to index 0 */ - if (slave % 8) - memcpy(outsmp->data, - outsmp->data + (slave % 8) * 8, 8); - /* delete all other gids */ - memset(outsmp->data + 8, 0, 56); + __be64 guid = mlx4_get_admin_guid(dev, slave, + port); + + /* set the PF admin guid to the FW/HW burned + * GUID, if it wasn't yet set + */ + if (slave == 0 && guid == 0) { + smp->attr_mod = 0; + err = mlx4_cmd_box(dev, + inbox->dma, + outbox->dma, + vhcr->in_modifier, + opcode_modifier, + vhcr->op, + MLX4_CMD_TIME_CLASS_C, + MLX4_CMD_NATIVE); + if (err) + return err; + mlx4_set_admin_guid(dev, + *(__be64 *)outsmp-> + data, slave, port); + } else { + memcpy(outsmp->data, &guid, 8); } - return err; + + /* clean all other gids */ + memset(outsmp->data + 8, 0, 56); + return 0; } if (smp->attr_id == IB_SMP_ATTR_NODE_INFO) { err = mlx4_cmd_box(dev, inbox->dma, outbox->dma, @@ -2350,6 +2363,7 @@ int mlx4_multi_func_init(struct mlx4_dev *dev) oper_vport->qos_vport = MLX4_VPP_DEFAULT_VPORT; vf_oper->vport[port].vlan_idx = NO_INDX; vf_oper->vport[port].mac_idx = NO_INDX; + mlx4_set_random_admin_guid(dev, i, port); } spin_lock_init(&s_state->lock); } diff --git a/drivers/net/ethernet/mellanox/mlx4/eq.c b/drivers/net/ethernet/mellanox/mlx4/eq.c index 190fd624bdfe..2619c9fbf42d 100644 --- a/drivers/net/ethernet/mellanox/mlx4/eq.c +++ b/drivers/net/ethernet/mellanox/mlx4/eq.c @@ -702,6 +702,8 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq) priv->mfunc.master.slave_state[flr_slave].is_slave_going_down = 1; } spin_unlock_irqrestore(&priv->mfunc.master.slave_state_lock, flags); + mlx4_dispatch_event(dev, MLX4_DEV_EVENT_SLAVE_SHUTDOWN, + flr_slave); queue_work(priv->mfunc.master.comm_wq, &priv->mfunc.master.slave_flr_event_work); break; diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index acceb75e8c44..ced5ecab5aa7 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -2260,6 +2260,37 @@ void mlx4_counter_free(struct mlx4_dev *dev, u32 idx) } EXPORT_SYMBOL_GPL(mlx4_counter_free); +void mlx4_set_admin_guid(struct mlx4_dev *dev, __be64 guid, int entry, int port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + priv->mfunc.master.vf_admin[entry].vport[port].guid = guid; +} +EXPORT_SYMBOL_GPL(mlx4_set_admin_guid); + +__be64 mlx4_get_admin_guid(struct mlx4_dev *dev, int entry, int port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + + return priv->mfunc.master.vf_admin[entry].vport[port].guid; +} +EXPORT_SYMBOL_GPL(mlx4_get_admin_guid); + +void mlx4_set_random_admin_guid(struct mlx4_dev *dev, int entry, int port) +{ + struct mlx4_priv *priv = mlx4_priv(dev); + __be64 guid; + + /* hw GUID */ + if (entry == 0) + return; + + get_random_bytes((char *)&guid, sizeof(guid)); + guid &= ~(cpu_to_be64(1ULL << 56)); + guid |= cpu_to_be64(1ULL << 57); + priv->mfunc.master.vf_admin[entry].vport[port].guid = guid; +} + static int mlx4_setup_hca(struct mlx4_dev *dev) { struct mlx4_priv *priv = mlx4_priv(dev); diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h index f30eeb730a86..502d3dd2c888 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ -499,6 +499,7 @@ struct mlx4_vport_state { bool spoofchk; u32 link_state; u8 qos_vport; + __be64 guid; }; struct mlx4_vf_admin_state { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c index df2238372ea7..8a64542abc16 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c @@ -211,26 +211,28 @@ static int alloc_4k(struct mlx5_core_dev *dev, u64 *addr) return 0; } +#define MLX5_U64_4K_PAGE_MASK ((~(u64)0U) << PAGE_SHIFT) + static void free_4k(struct mlx5_core_dev *dev, u64 addr) { struct fw_page *fwp; int n; - fwp = find_fw_page(dev, addr & PAGE_MASK); + fwp = find_fw_page(dev, addr & MLX5_U64_4K_PAGE_MASK); if (!fwp) { mlx5_core_warn(dev, "page not found\n"); return; } - n = (addr & ~PAGE_MASK) >> MLX5_ADAPTER_PAGE_SHIFT; + n = (addr & ~MLX5_U64_4K_PAGE_MASK) >> MLX5_ADAPTER_PAGE_SHIFT; fwp->free_count++; set_bit(n, &fwp->bitmask); if (fwp->free_count == MLX5_NUM_4K_IN_PAGE) { rb_erase(&fwp->rb_node, &dev->priv.page_root); if (fwp->free_count != 1) list_del(&fwp->list); - dma_unmap_page(&dev->pdev->dev, addr & PAGE_MASK, PAGE_SIZE, - DMA_BIDIRECTIONAL); + dma_unmap_page(&dev->pdev->dev, addr & MLX5_U64_4K_PAGE_MASK, + PAGE_SIZE, DMA_BIDIRECTIONAL); __free_page(fwp->page); kfree(fwp); } else if (fwp->free_count == 1) { |