summaryrefslogtreecommitdiff
path: root/drivers
diff options
context:
space:
mode:
authorMohamad Haj Yahia <mohamad@mellanox.com>2016-10-25 18:36:33 +0300
committerDavid S. Miller <davem@davemloft.net>2016-10-29 12:00:39 -0400
commit05ac2c0b7438ea08c5d54b48797acf9b22cb2f6f (patch)
treee25e0ebfe1a447c0de7a32fea33212457debcb20 /drivers
parent2241007b3d783cbdbaa78c30bdb1994278b6f9b9 (diff)
net/mlx5: Fix race between PCI error handlers and health work
Currently there is a race between the health care work and the kernel pci error handlers because both of them detect the error, the first one to be called will do the error handling. There is a chance that health care will disable the pci after resuming pci slot. Also create a separate WQ because now we will have two types of health works, one for the error detection and one for the recovery. Fixes: 89d44f0a6c73 ('net/mlx5_core: Add pci error handlers to mlx5_core driver') Signed-off-by: Mohamad Haj Yahia <mohamad@mellanox.com> Signed-off-by: Saeed Mahameed <saeedm@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'drivers')
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/health.c30
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/main.c9
2 files changed, 34 insertions, 5 deletions
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
index 2cb4094c9c49..2d00022c92d8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -64,6 +64,10 @@ enum {
MLX5_NIC_IFC_NO_DRAM_NIC = 2
};
+enum {
+ MLX5_DROP_NEW_HEALTH_WORK,
+};
+
static u8 get_nic_interface(struct mlx5_core_dev *dev)
{
return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 3;
@@ -272,7 +276,13 @@ static void poll_health(unsigned long data)
if (in_fatal(dev) && !health->sick) {
health->sick = true;
print_health_info(dev);
- schedule_work(&health->work);
+ spin_lock(&health->wq_lock);
+ if (!test_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags))
+ queue_work(health->wq, &health->work);
+ else
+ dev_err(&dev->pdev->dev,
+ "new health works are not permitted at this stage\n");
+ spin_unlock(&health->wq_lock);
}
}
@@ -282,6 +292,7 @@ void mlx5_start_health_poll(struct mlx5_core_dev *dev)
init_timer(&health->timer);
health->sick = 0;
+ clear_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
health->health = &dev->iseg->health;
health->health_counter = &dev->iseg->health_counter;
@@ -298,11 +309,21 @@ void mlx5_stop_health_poll(struct mlx5_core_dev *dev)
del_timer_sync(&health->timer);
}
+void mlx5_drain_health_wq(struct mlx5_core_dev *dev)
+{
+ struct mlx5_core_health *health = &dev->priv.health;
+
+ spin_lock(&health->wq_lock);
+ set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
+ spin_unlock(&health->wq_lock);
+ cancel_work_sync(&health->work);
+}
+
void mlx5_health_cleanup(struct mlx5_core_dev *dev)
{
struct mlx5_core_health *health = &dev->priv.health;
- flush_work(&health->work);
+ destroy_workqueue(health->wq);
}
int mlx5_health_init(struct mlx5_core_dev *dev)
@@ -317,8 +338,11 @@ int mlx5_health_init(struct mlx5_core_dev *dev)
strcpy(name, "mlx5_health");
strcat(name, dev_name(&dev->pdev->dev));
+ health->wq = create_singlethread_workqueue(name);
kfree(name);
-
+ if (!health->wq)
+ return -ENOMEM;
+ spin_lock_init(&health->wq_lock);
INIT_WORK(&health->work, health_care);
return 0;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 8a63910bfccf..9f90226bc120 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1315,8 +1315,13 @@ static pci_ers_result_t mlx5_pci_err_detected(struct pci_dev *pdev,
dev_info(&pdev->dev, "%s was called\n", __func__);
mlx5_enter_error_state(dev);
mlx5_unload_one(dev, priv, false);
- pci_save_state(pdev);
- mlx5_pci_disable_device(dev);
+ /* In case of kernel call save the pci state and drain health wq */
+ if (state) {
+ pci_save_state(pdev);
+ mlx5_drain_health_wq(dev);
+ mlx5_pci_disable_device(dev);
+ }
+
return state == pci_channel_io_perm_failure ?
PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET;
}