summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorErez Shitrit <erezsh@mellanox.com>2018-05-21 11:41:01 +0300
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>2018-07-03 11:23:09 +0200
commite355402cf19cf30e5f659ac21ddf40ccd7e5bcf0 (patch)
tree4b44b3367abfb16c5b127a36ad8214c32dffbe18
parent9cac0a08e476775df3ae5f90730a766ea01b6232 (diff)
IB/mlx5: Fetch soft WQE's on fatal error state
commit 7b74a83cf54a3747e22c57e25712bd70eef8acee upstream. On fatal error the driver simulates CQE's for ULPs that rely on completion of all their posted work-request. For the GSI traffic, the mlx5 has its own mechanism that sends the completions via software CQE's directly to the relevant CQ. This should be kept in fatal error too, so the driver should simulate such CQE's with the specified error state in order to complete GSI QP work requests. Without the fix the next deadlock might appears: schedule_timeout+0x274/0x350 wait_for_common+0xec/0x240 mcast_remove_one+0xd0/0x120 [ib_core] ib_unregister_device+0x12c/0x230 [ib_core] mlx5_ib_remove+0xc4/0x270 [mlx5_ib] mlx5_detach_device+0x184/0x1a0 [mlx5_core] mlx5_unload_one+0x308/0x340 [mlx5_core] mlx5_pci_err_detected+0x74/0xe0 [mlx5_core] Cc: <stable@vger.kernel.org> # 4.7 Fixes: 89ea94a7b6c4 ("IB/mlx5: Reset flow support for IB kernel ULPs") Signed-off-by: Erez Shitrit <erezsh@mellanox.com> Signed-off-by: Leon Romanovsky <leonro@mellanox.com> Signed-off-by: Jason Gunthorpe <jgg@mellanox.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-rw-r--r--drivers/infiniband/hw/mlx5/cq.c15
1 files changed, 12 insertions, 3 deletions
diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index fc62a7ded734..a19ebb19952e 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -645,7 +645,7 @@ repoll:
}
static int poll_soft_wc(struct mlx5_ib_cq *cq, int num_entries,
- struct ib_wc *wc)
+ struct ib_wc *wc, bool is_fatal_err)
{
struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device);
struct mlx5_ib_wc *soft_wc, *next;
@@ -658,6 +658,10 @@ static int poll_soft_wc(struct mlx5_ib_cq *cq, int num_entries,
mlx5_ib_dbg(dev, "polled software generated completion on CQ 0x%x\n",
cq->mcq.cqn);
+ if (unlikely(is_fatal_err)) {
+ soft_wc->wc.status = IB_WC_WR_FLUSH_ERR;
+ soft_wc->wc.vendor_err = MLX5_CQE_SYNDROME_WR_FLUSH_ERR;
+ }
wc[npolled++] = soft_wc->wc;
list_del(&soft_wc->list);
kfree(soft_wc);
@@ -678,12 +682,17 @@ int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
spin_lock_irqsave(&cq->lock, flags);
if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) {
- mlx5_ib_poll_sw_comp(cq, num_entries, wc, &npolled);
+ /* make sure no soft wqe's are waiting */
+ if (unlikely(!list_empty(&cq->wc_list)))
+ soft_polled = poll_soft_wc(cq, num_entries, wc, true);
+
+ mlx5_ib_poll_sw_comp(cq, num_entries - soft_polled,
+ wc + soft_polled, &npolled);
goto out;
}
if (unlikely(!list_empty(&cq->wc_list)))
- soft_polled = poll_soft_wc(cq, num_entries, wc);
+ soft_polled = poll_soft_wc(cq, num_entries, wc, false);
for (npolled = 0; npolled < num_entries - soft_polled; npolled++) {
if (mlx5_poll_one(cq, &cur_qp, wc + soft_polled + npolled))