author     Jakub Kicinski <kuba@kernel.org>    2026-02-19 09:26:39 -0800
committer  Jakub Kicinski <kuba@kernel.org>    2026-02-19 09:30:23 -0800
commit     7997bca6c2b00310161fbcd628dc0161fd3b040c (patch)
tree       a19375288eea1b81ada5d776053556cb443cac99
parent     e6834a4c474697df23ab9948fd3577b26bf48656 (diff)
parent     57a94d4b22b0c6cc5d601e6b6238d78fb923d991 (diff)
Merge branch 'mlx5-misc-fixes-2026-02-18'
Tariq Toukan says:

====================
mlx5 misc fixes 2026-02-18

This patchset provides misc bug fixes from the team to the mlx5 core and
Eth drivers.
====================

Link: https://patch.msgid.link/20260218072904.1764634-1-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
-rw-r--r--  drivers/net/ethernet/mellanox/mlx5/core/en.h                  3
-rw-r--r--  drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c             14
-rw-r--r--  drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c     13
-rw-r--r--  drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c     52
-rw-r--r--  drivers/net/ethernet/mellanox/mlx5/core/en/tc/meter.c        10
-rw-r--r--  drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c    13
-rw-r--r--  drivers/net/ethernet/mellanox/mlx5/core/en_main.c            40
-rw-r--r--  drivers/net/ethernet/mellanox/mlx5/core/wc.c                 14
-rw-r--r--  include/linux/mlx5/driver.h                                   4
9 files changed, 78 insertions, 85 deletions
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 19fce51117c9..ea2cd1f5d1d0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -180,7 +180,8 @@ static inline u16 mlx5_min_rx_wqes(int wq_type, u32 wq_size)
}
/* Use this function to get max num channels (rxqs/txqs) only to create netdev */
-static inline int mlx5e_get_max_num_channels(struct mlx5_core_dev *mdev)
+static inline unsigned int
+mlx5e_get_max_num_channels(struct mlx5_core_dev *mdev)
{
return is_kdump_kernel() ?
MLX5E_MIN_NUM_CHANNELS :
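A note on the hunk above: the int -> unsigned int return type matters whenever
the channel count meets an unsigned quantity in a comparison, since C's usual
arithmetic conversions turn a signed operand into a huge unsigned value. A
standalone sketch of the pitfall this class of cleanup avoids (illustrative
only, not code from the patch):

	#include <stdio.h>

	int main(void)
	{
		int signed_count = -1;        /* e.g. an error value from a signed API */
		unsigned int num_queues = 4;

		/* -1 converts to UINT_MAX for the comparison, so this is false */
		if (signed_count < num_queues)
			printf("branch taken\n");
		else
			printf("branch skipped: -1 compared as %u\n", (unsigned int)-1);
		return 0;
	}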
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c
index 424f8a2728a3..74660e7fe674 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c
@@ -457,22 +457,8 @@ static void mlx5e_ptpsq_unhealthy_work(struct work_struct *work)
{
struct mlx5e_ptpsq *ptpsq =
container_of(work, struct mlx5e_ptpsq, report_unhealthy_work);
- struct mlx5e_txqsq *sq = &ptpsq->txqsq;
-
- /* Recovering the PTP SQ means re-enabling NAPI, which requires the
- * netdev instance lock. However, SQ closing has to wait for this work
- * task to finish while also holding the same lock. So either get the
- * lock or find that the SQ is no longer enabled and thus this work is
- * not relevant anymore.
- */
- while (!netdev_trylock(sq->netdev)) {
- if (!test_bit(MLX5E_SQ_STATE_ENABLED, &sq->state))
- return;
- msleep(20);
- }
mlx5e_reporter_tx_ptpsq_unhealthy(ptpsq);
- netdev_unlock(sq->netdev);
}
static int mlx5e_ptp_open_txqsq(struct mlx5e_ptp *c, u32 tisn,
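The trylock loop deleted here is not lost: the same logic reappears inside
mlx5e_tx_reporter_ptpsq_unhealthy_recover() in reporter_tx.c below. Moving it
into the recover callback plausibly also covers recovery triggered directly
through devlink health rather than via this work item, though the merge
commit itself does not spell out the rationale.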
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c
index 0686fbdd5a05..6efb626b5506 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c
@@ -1,6 +1,8 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (c) 2019 Mellanox Technologies.
+#include <net/netdev_lock.h>
+
#include "health.h"
#include "params.h"
#include "txrx.h"
@@ -177,6 +179,16 @@ static int mlx5e_rx_reporter_timeout_recover(void *ctx)
rq = ctx;
priv = rq->priv;
+ /* Acquire netdev instance lock to synchronize with channel close and
+ * reopen flows. Either successfully obtain the lock, or detect that
+ * channels are closing for another reason, making this work no longer
+ * necessary.
+ */
+ while (!netdev_trylock(rq->netdev)) {
+ if (!test_bit(MLX5E_STATE_CHANNELS_ACTIVE, &rq->priv->state))
+ return 0;
+ msleep(20);
+ }
mutex_lock(&priv->state_lock);
eq = rq->cq.mcq.eq;
@@ -186,6 +198,7 @@ static int mlx5e_rx_reporter_timeout_recover(void *ctx)
clear_bit(MLX5E_SQ_STATE_ENABLED, &rq->icosq->state);
mutex_unlock(&priv->state_lock);
+ netdev_unlock(rq->netdev);
return err;
}
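The lock-or-bail loop added above is the load-bearing pattern of this series,
so here is its shape in isolation. A minimal sketch under stated assumptions:
netdev_trylock()/netdev_unlock() are the real netdev instance-lock helpers
from <net/netdev_lock.h>; the context struct, state bit, and do_recovery()
are hypothetical stand-ins:

	static void recover_work_fn(struct my_recover_ctx *ctx)
	{
		/* A closing flow may hold the netdev instance lock while it
		 * waits for this work to finish, so a blocking netdev_lock()
		 * here could deadlock. Spin on the trylock instead, and give
		 * up once the queue is being torn down anyway.
		 */
		while (!netdev_trylock(ctx->netdev)) {
			if (!test_bit(MY_STATE_ACTIVE, &ctx->state))
				return;	/* teardown in progress; recovery is moot */
			msleep(20);
		}

		do_recovery(ctx);
		netdev_unlock(ctx->netdev);
	}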
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
index 4adc1adf9897..60ba840e00fa 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
@@ -1,6 +1,8 @@
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright (c) 2019 Mellanox Technologies. */
+#include <net/netdev_lock.h>
+
#include "health.h"
#include "en/ptp.h"
#include "en/devlink.h"
@@ -79,6 +81,18 @@ static int mlx5e_tx_reporter_err_cqe_recover(void *ctx)
if (!test_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state))
return 0;
+ /* Recovering queues means re-enabling NAPI, which requires the netdev
+ * instance lock. However, SQ closing flows have to wait for work tasks
+ * to finish while also holding the netdev instance lock. So either get
+ * the lock or find that the SQ is no longer enabled and thus this work
+ * is not relevant anymore.
+ */
+ while (!netdev_trylock(dev)) {
+ if (!test_bit(MLX5E_SQ_STATE_ENABLED, &sq->state))
+ return 0;
+ msleep(20);
+ }
+
err = mlx5_core_query_sq_state(mdev, sq->sqn, &state);
if (err) {
netdev_err(dev, "Failed to query SQ 0x%x state. err = %d\n",
@@ -114,9 +128,11 @@ static int mlx5e_tx_reporter_err_cqe_recover(void *ctx)
else
mlx5e_trigger_napi_sched(sq->cq.napi);
+ netdev_unlock(dev);
return 0;
out:
clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state);
+ netdev_unlock(dev);
return err;
}
@@ -137,10 +153,24 @@ static int mlx5e_tx_reporter_timeout_recover(void *ctx)
sq = to_ctx->sq;
eq = sq->cq.mcq.eq;
priv = sq->priv;
+
+ /* Recovering the TX queues implies re-enabling NAPI, which requires
+ * the netdev instance lock.
+ * However, channel closing flows have to wait for this work to finish
+ * while holding the same lock. So either get the lock or find that
+ * channels are being closed for other reason and this work is not
+ * relevant anymore.
+ */
+ while (!netdev_trylock(sq->netdev)) {
+ if (!test_bit(MLX5E_STATE_CHANNELS_ACTIVE, &priv->state))
+ return 0;
+ msleep(20);
+ }
+
err = mlx5e_health_channel_eq_recover(sq->netdev, eq, sq->cq.ch_stats);
if (!err) {
to_ctx->status = 0; /* this sq recovered */
- return err;
+ goto out;
}
mutex_lock(&priv->state_lock);
@@ -148,7 +178,7 @@ static int mlx5e_tx_reporter_timeout_recover(void *ctx)
mutex_unlock(&priv->state_lock);
if (!err) {
to_ctx->status = 1; /* all channels recovered */
- return err;
+ goto out;
}
to_ctx->status = err;
@@ -156,7 +186,8 @@ static int mlx5e_tx_reporter_timeout_recover(void *ctx)
netdev_err(priv->netdev,
"mlx5e_safe_reopen_channels failed recovering from a tx_timeout, err(%d).\n",
err);
-
+out:
+ netdev_unlock(sq->netdev);
return err;
}
@@ -173,10 +204,22 @@ static int mlx5e_tx_reporter_ptpsq_unhealthy_recover(void *ctx)
return 0;
priv = ptpsq->txqsq.priv;
+ netdev = priv->netdev;
+
+ /* Recovering the PTP SQ means re-enabling NAPI, which requires the
+ * netdev instance lock. However, SQ closing has to wait for this work
+ * task to finish while also holding the same lock. So either get the
+ * lock or find that the SQ is no longer enabled and thus this work is
+ * not relevant anymore.
+ */
+ while (!netdev_trylock(netdev)) {
+ if (!test_bit(MLX5E_SQ_STATE_ENABLED, &ptpsq->txqsq.state))
+ return 0;
+ msleep(20);
+ }
mutex_lock(&priv->state_lock);
chs = &priv->channels;
- netdev = priv->netdev;
carrier_ok = netif_carrier_ok(netdev);
netif_carrier_off(netdev);
@@ -193,6 +236,7 @@ static int mlx5e_tx_reporter_ptpsq_unhealthy_recover(void *ctx)
netif_carrier_on(netdev);
mutex_unlock(&priv->state_lock);
+ netdev_unlock(netdev);
return err;
}
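One side effect of holding the netdev instance lock across
mlx5e_tx_reporter_timeout_recover() shows up in the two "return err" ->
"goto out" changes: every exit path now has to drop the lock, except the
early returns inside the trylock loop, which never acquired it. The
resulting single-exit shape, sketched with hypothetical step names:

	static int timeout_recover(struct net_device *netdev)
	{
		int err;

		while (!netdev_trylock(netdev)) {
			if (!channels_active(netdev))
				return 0;	/* lock never taken, plain return is fine */
			msleep(20);
		}

		err = try_eq_recovery(netdev);
		if (!err)
			goto out;		/* success still has to unlock */

		err = reopen_channels(netdev);
	out:
		netdev_unlock(netdev);
		return err;
	}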
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/meter.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/meter.c
index 7819fb297280..d5d9146efca6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/meter.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/meter.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
// Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#include <linux/iopoll.h>
#include <linux/math64.h>
#include "lib/aso.h"
#include "en/tc/post_act.h"
@@ -115,7 +116,6 @@ mlx5e_tc_meter_modify(struct mlx5_core_dev *mdev,
struct mlx5e_flow_meters *flow_meters;
u8 cir_man, cir_exp, cbs_man, cbs_exp;
struct mlx5_aso_wqe *aso_wqe;
- unsigned long expires;
struct mlx5_aso *aso;
u64 rate, burst;
u8 ds_cnt;
@@ -187,12 +187,8 @@ mlx5e_tc_meter_modify(struct mlx5_core_dev *mdev,
mlx5_aso_post_wqe(aso, true, &aso_wqe->ctrl);
/* With newer FW, the wait for the first ASO WQE is more than 2us, put the wait 10ms. */
- expires = jiffies + msecs_to_jiffies(10);
- do {
- err = mlx5_aso_poll_cq(aso, true);
- if (err)
- usleep_range(2, 10);
- } while (err && time_is_after_jiffies(expires));
+ read_poll_timeout(mlx5_aso_poll_cq, err, !err, 10, 10 * USEC_PER_MSEC,
+ false, aso, true);
mutex_unlock(&flow_meters->aso_lock);
return err;
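read_poll_timeout() is the generic poll helper from <linux/iopoll.h>, and the
call above is close to a drop-in for the loop it replaces. A simplified
expansion, assuming the iopoll.h semantics (the real macro can also sleep
before the first read and returns -ETIMEDOUT itself; this caller only
consumes err, the last value returned by mlx5_aso_poll_cq()):

	int err;
	ktime_t timeout = ktime_add_us(ktime_get(), 10 * USEC_PER_MSEC);

	for (;;) {
		err = mlx5_aso_poll_cq(aso, true);
		if (!err)
			break;
		if (ktime_after(ktime_get(), timeout)) {
			/* one final read past the deadline, as iopoll.h does */
			err = mlx5_aso_poll_cq(aso, true);
			break;
		}
		usleep_range((10 >> 2) + 1, 10);	/* range derived from sleep_us */
	}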
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c
index 528b04d4de41..90b3bc5f9166 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/macsec.c
@@ -5,6 +5,7 @@
#include <linux/mlx5/mlx5_ifc.h>
#include <linux/xarray.h>
#include <linux/if_vlan.h>
+#include <linux/iopoll.h>
#include "en.h"
#include "lib/aso.h"
@@ -1385,7 +1386,8 @@ static int macsec_aso_set_arm_event(struct mlx5_core_dev *mdev, struct mlx5e_mac
MLX5_ACCESS_ASO_OPC_MOD_MACSEC);
macsec_aso_build_ctrl(aso, &aso_wqe->aso_ctrl, in);
mlx5_aso_post_wqe(maso, false, &aso_wqe->ctrl);
- err = mlx5_aso_poll_cq(maso, false);
+ read_poll_timeout(mlx5_aso_poll_cq, err, !err, 10, 10 * USEC_PER_MSEC,
+ false, maso, false);
mutex_unlock(&aso->aso_lock);
return err;
@@ -1397,7 +1399,6 @@ static int macsec_aso_query(struct mlx5_core_dev *mdev, struct mlx5e_macsec *mac
struct mlx5e_macsec_aso *aso;
struct mlx5_aso_wqe *aso_wqe;
struct mlx5_aso *maso;
- unsigned long expires;
int err;
aso = &macsec->aso;
@@ -1411,12 +1412,8 @@ static int macsec_aso_query(struct mlx5_core_dev *mdev, struct mlx5e_macsec *mac
macsec_aso_build_wqe_ctrl_seg(aso, &aso_wqe->aso_ctrl, NULL);
mlx5_aso_post_wqe(maso, false, &aso_wqe->ctrl);
- expires = jiffies + msecs_to_jiffies(10);
- do {
- err = mlx5_aso_poll_cq(maso, false);
- if (err)
- usleep_range(2, 10);
- } while (err && time_is_after_jiffies(expires));
+ read_poll_timeout(mlx5_aso_poll_cq, err, !err, 10, 10 * USEC_PER_MSEC,
+ false, maso, false);
if (err)
goto err_out;
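Worth noting on the first macsec hunk: macsec_aso_set_arm_event() previously
issued a single mlx5_aso_poll_cq() with no retry at all, so the conversion is
not just cosmetic there; it adds a bounded 10 ms retry where one unlucky poll
could have returned a transient error. The second hunk is the same mechanical
conversion as in meter.c above.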
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 6a7ca4571c19..7eb691c2a1bd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -631,19 +631,7 @@ static void mlx5e_rq_timeout_work(struct work_struct *timeout_work)
struct mlx5e_rq,
rx_timeout_work);
- /* Acquire netdev instance lock to synchronize with channel close and
- * reopen flows. Either successfully obtain the lock, or detect that
- * channels are closing for another reason, making this work no longer
- * necessary.
- */
- while (!netdev_trylock(rq->netdev)) {
- if (!test_bit(MLX5E_STATE_CHANNELS_ACTIVE, &rq->priv->state))
- return;
- msleep(20);
- }
-
mlx5e_reporter_rx_timeout(rq);
- netdev_unlock(rq->netdev);
}
static int mlx5e_alloc_mpwqe_rq_drop_page(struct mlx5e_rq *rq)
@@ -1952,20 +1940,7 @@ void mlx5e_tx_err_cqe_work(struct work_struct *recover_work)
struct mlx5e_txqsq *sq = container_of(recover_work, struct mlx5e_txqsq,
recover_work);
- /* Recovering queues means re-enabling NAPI, which requires the netdev
- * instance lock. However, SQ closing flows have to wait for work tasks
- * to finish while also holding the netdev instance lock. So either get
- * the lock or find that the SQ is no longer enabled and thus this work
- * is not relevant anymore.
- */
- while (!netdev_trylock(sq->netdev)) {
- if (!test_bit(MLX5E_SQ_STATE_ENABLED, &sq->state))
- return;
- msleep(20);
- }
-
mlx5e_reporter_tx_err_cqe(sq);
- netdev_unlock(sq->netdev);
}
static struct dim_cq_moder mlx5e_get_def_tx_moderation(u8 cq_period_mode)
@@ -5115,19 +5090,6 @@ static void mlx5e_tx_timeout_work(struct work_struct *work)
struct net_device *netdev = priv->netdev;
int i;
- /* Recovering the TX queues implies re-enabling NAPI, which requires
- * the netdev instance lock.
- * However, channel closing flows have to wait for this work to finish
- * while holding the same lock. So either get the lock or find that
- * channels are being closed for other reason and this work is not
- * relevant anymore.
- */
- while (!netdev_trylock(netdev)) {
- if (!test_bit(MLX5E_STATE_CHANNELS_ACTIVE, &priv->state))
- return;
- msleep(20);
- }
-
for (i = 0; i < netdev->real_num_tx_queues; i++) {
struct netdev_queue *dev_queue =
netdev_get_tx_queue(netdev, i);
@@ -5140,8 +5102,6 @@ static void mlx5e_tx_timeout_work(struct work_struct *work)
/* break if tried to reopened channels */
break;
}
-
- netdev_unlock(netdev);
}
static void mlx5e_tx_timeout(struct net_device *dev, unsigned int txqueue)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wc.c b/drivers/net/ethernet/mellanox/mlx5/core/wc.c
index 815a7c97d6b0..04d03be1bb77 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/wc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/wc.c
@@ -2,6 +2,7 @@
// Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#include <linux/io.h>
+#include <linux/iopoll.h>
#include <linux/mlx5/transobj.h>
#include "lib/clock.h"
#include "mlx5_core.h"
@@ -15,7 +16,7 @@
#define TEST_WC_NUM_WQES 255
#define TEST_WC_LOG_CQ_SZ (order_base_2(TEST_WC_NUM_WQES))
#define TEST_WC_SQ_LOG_WQ_SZ TEST_WC_LOG_CQ_SZ
-#define TEST_WC_POLLING_MAX_TIME_JIFFIES msecs_to_jiffies(100)
+#define TEST_WC_POLLING_MAX_TIME_USEC (100 * USEC_PER_MSEC)
struct mlx5_wc_cq {
/* data path - accessed per cqe */
@@ -359,7 +360,6 @@ static int mlx5_wc_poll_cq(struct mlx5_wc_sq *sq)
static void mlx5_core_test_wc(struct mlx5_core_dev *mdev)
{
unsigned int offset = 0;
- unsigned long expires;
struct mlx5_wc_sq *sq;
int i, err;
@@ -389,13 +389,9 @@ static void mlx5_core_test_wc(struct mlx5_core_dev *mdev)
mlx5_wc_post_nop(sq, &offset, true);
- expires = jiffies + TEST_WC_POLLING_MAX_TIME_JIFFIES;
- do {
- err = mlx5_wc_poll_cq(sq);
- if (err)
- usleep_range(2, 10);
- } while (mdev->wc_state == MLX5_WC_STATE_UNINITIALIZED &&
- time_is_after_jiffies(expires));
+ poll_timeout_us(mlx5_wc_poll_cq(sq),
+ mdev->wc_state != MLX5_WC_STATE_UNINITIALIZED, 10,
+ TEST_WC_POLLING_MAX_TIME_USEC, false);
mlx5_wc_destroy_sq(sq);
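poll_timeout_us() differs from read_poll_timeout() in that its first argument
is an arbitrary statement rather than a function plus arguments, and the loop
condition need not involve that statement's value. That matches the old loop
here: keep draining the CQ until wc_state leaves MLX5_WC_STATE_UNINITIALIZED
or 100 ms pass. A simplified sketch of the call, assuming <linux/iopoll.h>
semantics (the exact sleep pattern may differ):

	ktime_t timeout = ktime_add_us(ktime_get(), TEST_WC_POLLING_MAX_TIME_USEC);

	for (;;) {
		mlx5_wc_poll_cq(sq);			/* the 'op' statement */
		if (mdev->wc_state != MLX5_WC_STATE_UNINITIALIZED)
			break;				/* the 'cond' expression */
		if (ktime_after(ktime_get(), timeout))
			break;
		usleep_range((10 >> 2) + 1, 10);
	}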
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index e2d067b1e67b..04dcd09f7517 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1282,12 +1282,12 @@ static inline bool mlx5_rl_is_supported(struct mlx5_core_dev *dev)
static inline int mlx5_core_is_mp_slave(struct mlx5_core_dev *dev)
{
return MLX5_CAP_GEN(dev, affiliate_nic_vport_criteria) &&
- MLX5_CAP_GEN(dev, num_vhca_ports) <= 1;
+ MLX5_CAP_GEN_MAX(dev, num_vhca_ports) <= 1;
}
static inline int mlx5_core_is_mp_master(struct mlx5_core_dev *dev)
{
- return MLX5_CAP_GEN(dev, num_vhca_ports) > 1;
+ return MLX5_CAP_GEN_MAX(dev, num_vhca_ports) > 1;
}
static inline int mlx5_core_mp_enabled(struct mlx5_core_dev *dev)
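The driver.h hunk swaps the capability accessor rather than the logic. Both
macros come from <linux/mlx5/device.h>: MLX5_CAP_GEN() reads a field from the
device's current general capabilities, MLX5_CAP_GEN_MAX() from the maximum
supported ones, roughly (paraphrased from device.h and kernel-version
dependent):

	#define MLX5_CAP_GEN(mdev, cap) \
		MLX5_GET(cmd_hca_cap, mdev->caps.hca[MLX5_CAP_GENERAL]->cur, cap)

	#define MLX5_CAP_GEN_MAX(mdev, cap) \
		MLX5_GET(cmd_hca_cap, mdev->caps.hca[MLX5_CAP_GENERAL]->max, cap)

So after this change a device is classified as multi-port master/slave by
what it can support rather than by what is currently enabled; the merge
commit does not carry the per-patch rationale, so treat that reading as an
inference.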