From fd980bf6e9cdae885105685259421164f843ca55 Mon Sep 17 00:00:00 2001 From: Suraj Gupta Date: Wed, 13 Aug 2025 19:25:59 +0530 Subject: net: xilinx: axienet: Fix RX skb ring management in DMAengine mode Submit multiple descriptors in axienet_rx_cb() to fill Rx skb ring. This ensures the ring "catches up" on previously missed allocations. Increment Rx skb ring head pointer after BD is successfully allocated. Previously, head pointer was incremented before verifying if descriptor is successfully allocated and has valid entries, which could lead to ring state inconsistency if descriptor setup failed. These changes improve reliability by maintaining adequate descriptor availability and ensuring proper ring buffer state management. Fixes: 6a91b846af85 ("net: axienet: Introduce dmaengine support") Signed-off-by: Suraj Gupta Link: https://patch.msgid.link/20250813135559.1555652-1-suraj.gupta2@amd.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/xilinx/xilinx_axienet_main.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'drivers/net') diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c index 6011d7eae0c7..0d8a05fe541a 100644 --- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c +++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c @@ -1160,6 +1160,7 @@ static void axienet_dma_rx_cb(void *data, const struct dmaengine_result *result) struct axienet_local *lp = data; struct sk_buff *skb; u32 *app_metadata; + int i; skbuf_dma = axienet_get_rx_desc(lp, lp->rx_ring_tail++); skb = skbuf_dma->skb; @@ -1178,7 +1179,10 @@ static void axienet_dma_rx_cb(void *data, const struct dmaengine_result *result) u64_stats_add(&lp->rx_packets, 1); u64_stats_add(&lp->rx_bytes, rx_len); u64_stats_update_end(&lp->rx_stat_sync); - axienet_rx_submit_desc(lp->ndev); + + for (i = 0; i < CIRC_SPACE(lp->rx_ring_head, lp->rx_ring_tail, + RX_BUF_NUM_DEFAULT); i++) + axienet_rx_submit_desc(lp->ndev); dma_async_issue_pending(lp->rx_chan); } @@ -1457,7 +1461,6 @@ static void axienet_rx_submit_desc(struct net_device *ndev) if (!skbuf_dma) return; - lp->rx_ring_head++; skb = netdev_alloc_skb(ndev, lp->max_frm_size); if (!skb) return; @@ -1482,6 +1485,7 @@ static void axienet_rx_submit_desc(struct net_device *ndev) skbuf_dma->desc = dma_rx_desc; dma_rx_desc->callback_param = lp; dma_rx_desc->callback_result = axienet_dma_rx_cb; + lp->rx_ring_head++; dmaengine_submit(dma_rx_desc); return; -- cgit v1.2.3 From 065c31f2c6915b38f45b1c817b31f41f62eaa774 Mon Sep 17 00:00:00 2001 From: Justin Lai Date: Wed, 13 Aug 2025 15:16:31 +0800 Subject: rtase: Fix Rx descriptor CRC error bit definition The CRC error bit is located at bit 17 in the Rx descriptor, but the driver was incorrectly using bit 16. Fix it. Fixes: a36e9f5cfe9e ("rtase: Add support for a pci table in this module") Signed-off-by: Justin Lai Link: https://patch.msgid.link/20250813071631.7566-1-justinlai0215@realtek.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/rtase/rtase.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/net') diff --git a/drivers/net/ethernet/realtek/rtase/rtase.h b/drivers/net/ethernet/realtek/rtase/rtase.h index 20decdeb9fdb..b9209eb6ea73 100644 --- a/drivers/net/ethernet/realtek/rtase/rtase.h +++ b/drivers/net/ethernet/realtek/rtase/rtase.h @@ -241,7 +241,7 @@ union rtase_rx_desc { #define RTASE_RX_RES BIT(20) #define RTASE_RX_RUNT BIT(19) #define RTASE_RX_RWT BIT(18) -#define RTASE_RX_CRC BIT(16) +#define RTASE_RX_CRC BIT(17) #define RTASE_RX_V6F BIT(31) #define RTASE_RX_V4F BIT(30) #define RTASE_RX_UDPT BIT(29) -- cgit v1.2.3 From f604d3aaf64ff0d90cc875295474d3abf4155629 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 14 Aug 2025 15:06:40 +0200 Subject: mlxsw: spectrum: Forward packets with an IPv4 link-local source IP By default, the device does not forward IPv4 packets with a link-local source IP (i.e., 169.254.0.0/16). This behavior does not align with the kernel which does forward them. Fix by instructing the device to forward such packets instead of dropping them. Fixes: ca360db4b825 ("mlxsw: spectrum: Disable DIP_LINK_LOCAL check in hardware pipeline") Reported-by: Zoey Mertes Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Signed-off-by: Petr Machata Link: https://patch.msgid.link/6721e6b2c96feb80269e72ce8d0b426e2f32d99c.1755174341.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 2 ++ drivers/net/ethernet/mellanox/mlxsw/trap.h | 1 + 2 files changed, 3 insertions(+) (limited to 'drivers/net') diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 618957d65663..9a2d64a0a858 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -2375,6 +2375,8 @@ static const struct mlxsw_listener mlxsw_sp_listener[] = { ROUTER_EXP, false), MLXSW_SP_RXL_NO_MARK(DISCARD_ING_ROUTER_DIP_LINK_LOCAL, FORWARD, ROUTER_EXP, false), + MLXSW_SP_RXL_NO_MARK(DISCARD_ING_ROUTER_SIP_LINK_LOCAL, FORWARD, + ROUTER_EXP, false), /* Multicast Router Traps */ MLXSW_SP_RXL_MARK(ACL1, TRAP_TO_CPU, MULTICAST, false), MLXSW_SP_RXL_L3_MARK(ACL2, TRAP_TO_CPU, MULTICAST, false), diff --git a/drivers/net/ethernet/mellanox/mlxsw/trap.h b/drivers/net/ethernet/mellanox/mlxsw/trap.h index 80ee5c4825dc..9962dc157901 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/trap.h +++ b/drivers/net/ethernet/mellanox/mlxsw/trap.h @@ -94,6 +94,7 @@ enum { MLXSW_TRAP_ID_DISCARD_ING_ROUTER_IPV4_SIP_BC = 0x16A, MLXSW_TRAP_ID_DISCARD_ING_ROUTER_IPV4_DIP_LOCAL_NET = 0x16B, MLXSW_TRAP_ID_DISCARD_ING_ROUTER_DIP_LINK_LOCAL = 0x16C, + MLXSW_TRAP_ID_DISCARD_ING_ROUTER_SIP_LINK_LOCAL = 0x16D, MLXSW_TRAP_ID_DISCARD_ROUTER_IRIF_EN = 0x178, MLXSW_TRAP_ID_DISCARD_ROUTER_ERIF_EN = 0x179, MLXSW_TRAP_ID_DISCARD_ROUTER_LPM4 = 0x17B, -- cgit v1.2.3 From 12da2b92ad50e6602b4c5e9073d71f2368b70b63 Mon Sep 17 00:00:00 2001 From: Chandra Mohan Sundar Date: Thu, 14 Aug 2025 22:00:10 +0530 Subject: net: libwx: Fix the size in RSS hash key population While trying to fill a random RSS key, the size of the pointer is being used rather than the actual size of the RSS key. Fix by passing an appropriate value of the RSS key. This issue was reported by static coverity analyser. Fixes: eb4898fde1de8 ("net: libwx: add wangxun vf common api") Signed-off-by: Chandra Mohan Sundar Link: https://patch.msgid.link/20250814163014.613004-1-chandramohan.explore@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/wangxun/libwx/wx_vf_lib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/net') diff --git a/drivers/net/ethernet/wangxun/libwx/wx_vf_lib.c b/drivers/net/ethernet/wangxun/libwx/wx_vf_lib.c index 5d48df7a849f..3023ea2732ef 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_vf_lib.c +++ b/drivers/net/ethernet/wangxun/libwx/wx_vf_lib.c @@ -192,7 +192,7 @@ void wx_setup_vfmrqc_vf(struct wx *wx) u8 i, j; /* Fill out hash function seeds */ - netdev_rss_key_fill(wx->rss_key, sizeof(wx->rss_key)); + netdev_rss_key_fill(wx->rss_key, WX_RSS_KEY_SIZE); for (i = 0; i < WX_RSS_KEY_SIZE / 4; i++) wr32(wx, WX_VXRSSRK(i), wx->rss_key[i]); -- cgit v1.2.3 From 89eb9a62aed77b409663ba1eac152e8f758815b7 Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Fri, 15 Aug 2025 22:18:09 +0200 Subject: net: dsa: b53: fix reserved register access in b53_fdb_dump() When BCM5325 support was added in c45655386e53 ("net: dsa: b53: add support for FDB operations on 5325/5365"), the register used for ARL access was made conditional on the chip. But in b53_fdb_dump(), instead of the register argument the page argument was replaced, causing it to write to a reserved page 0x50 on !BCM5325*. Writing to this page seems to completely lock the switch up: [ 89.680000] b53-switch spi0.1 lan2: Link is Down [ 89.680000] WARNING: CPU: 1 PID: 26 at drivers/net/phy/phy.c:1350 _phy_state_machine+0x1bc/0x454 [ 89.720000] phy_check_link_status+0x0/0x114: returned: -5 [ 89.730000] Modules linked in: nft_fib_inet nf_flow_table_inet nft_reject_ipv6 nft_reject_ipv4 nft_reject_inet nft_reject nft_redir nft_quota nft_numgen nft_nat nft_masq nft_log nft_limit nft_hash nft_flow_offload nft_fib_ipv6 nft_fib_ipv4 nft_fib nft_ct nft_chain_nat nf_tables nf_nat nf_flow_table nf_conntrack nfnetlink nf_reject_ipv6 nf_reject_ipv4 nf_log_syslog nf_defrag_ipv6 nf_defrag_ipv4 cls_flower sch_tbf sch_ingress sch_htb sch_hfsc em_u32 cls_u32 cls_route cls_matchall cls_fw cls_flow cls_basic act_skbedit act_mirred act_gact vrf md5 crc32c_cryptoapi [ 89.780000] CPU: 1 UID: 0 PID: 26 Comm: kworker/u10:0 Tainted: G W 6.16.0-rc1+ #0 NONE [ 89.780000] Tainted: [W]=WARN [ 89.780000] Hardware name: Netgear DGND3700 v1 [ 89.780000] Workqueue: events_power_efficient phy_state_machine [ 89.780000] Stack : 809c762c 8006b050 00000001 820a9ce3 0000114c 000affff 805d22d0 8200ba00 [ 89.780000] 82005000 6576656e 74735f70 6f776572 5f656666 10008b00 820a9cb8 82088700 [ 89.780000] 00000000 00000000 809c762c 820a9a98 00000000 00000000 ffffefff 80a7a76c [ 89.780000] 80a70000 820a9af8 80a70000 80a70000 80a70000 00000000 809c762c 820a9dd4 [ 89.780000] 00000000 805d1494 80a029e4 80a70000 00000003 00000000 00000004 81a60004 [ 89.780000] ... [ 89.780000] Call Trace: [ 89.780000] [<800228b8>] show_stack+0x38/0x118 [ 89.780000] [<8001afc4>] dump_stack_lvl+0x6c/0xac [ 89.780000] [<80046b90>] __warn+0x9c/0x114 [ 89.780000] [<80046da8>] warn_slowpath_fmt+0x1a0/0x1b0 [ 89.780000] [<805d1494>] _phy_state_machine+0x1bc/0x454 [ 89.780000] [<805d22fc>] phy_state_machine+0x2c/0x70 [ 89.780000] [<80066b08>] process_one_work+0x1e8/0x3e0 [ 89.780000] [<80067a1c>] worker_thread+0x354/0x4e4 [ 89.780000] [<800706cc>] kthread+0x130/0x274 [ 89.780000] [<8001d808>] ret_from_kernel_thread+0x14/0x1c And any further accesses fail: [ 120.790000] b53-switch spi0.1: timeout waiting for ARL to finish: 0x81 [ 120.800000] b53-switch spi0.1: port 2 failed to add 2c:b0:5d:27:9a:bd vid 3 to fdb: -145 [ 121.010000] b53-switch spi0.1: timeout waiting for ARL to finish: 0xbf [ 121.020000] b53-switch spi0.1: port 3 failed to add 2c:b0:5d:27:9a:bd vid 3 to fdb: -145 Restore the correct page B53_ARLIO_PAGE again, and move the offset argument to the correct place. *On BCM5325, this became a write to the MIB page of Port 1. Still a reserved offset, but likely less brokenness from that write. Fixes: c45655386e53 ("net: dsa: b53: add support for FDB operations on 5325/5365") Signed-off-by: Jonas Gorski Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/20250815201809.549195-1-jonas.gorski@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/b53/b53_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/net') diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 9942fb6f7f4b..829b1f087e9e 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -2078,7 +2078,7 @@ int b53_fdb_dump(struct dsa_switch *ds, int port, /* Start search operation */ reg = ARL_SRCH_STDN; - b53_write8(priv, offset, B53_ARL_SRCH_CTL, reg); + b53_write8(priv, B53_ARLIO_PAGE, offset, reg); do { ret = b53_arl_search_wait(priv); -- cgit v1.2.3 From 4611d88a37cfc18cbabc6978aaf7325d1ae3f53a Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Sat, 16 Aug 2025 11:38:50 -0700 Subject: bnxt_en: Fix lockdep warning during rmmod The commit under the Fixes tag added a netdev_assert_locked() in bnxt_free_ntp_fltrs(). The lock should be held during normal run-time but the assert will be triggered (see below) during bnxt_remove_one() which should not need the lock. The netdev is already unregistered by then. Fix it by calling netdev_assert_locked_or_invisible() which will not assert if the netdev is unregistered. WARNING: CPU: 5 PID: 2241 at ./include/net/netdev_lock.h:17 bnxt_free_ntp_fltrs+0xf8/0x100 [bnxt_en] Modules linked in: rpcrdma rdma_cm iw_cm ib_cm configfs ib_core bnxt_en(-) bridge stp llc x86_pkg_temp_thermal xfs tg3 [last unloaded: bnxt_re] CPU: 5 UID: 0 PID: 2241 Comm: rmmod Tainted: G S W 6.16.0 #2 PREEMPT(voluntary) Tainted: [S]=CPU_OUT_OF_SPEC, [W]=WARN Hardware name: Dell Inc. PowerEdge R730/072T6D, BIOS 2.4.3 01/17/2017 RIP: 0010:bnxt_free_ntp_fltrs+0xf8/0x100 [bnxt_en] Code: 41 5c 41 5d 41 5e 41 5f c3 cc cc cc cc 48 8b 47 60 be ff ff ff ff 48 8d b8 28 0c 00 00 e8 d0 cf 41 c3 85 c0 0f 85 2e ff ff ff <0f> 0b e9 27 ff ff ff 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 RSP: 0018:ffffa92082387da0 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff9e5b593d8000 RCX: 0000000000000001 RDX: 0000000000000001 RSI: ffffffff83dc9a70 RDI: ffffffff83e1a1cf RBP: ffff9e5b593d8c80 R08: 0000000000000000 R09: ffffffff8373a2b3 R10: 000000008100009f R11: 0000000000000001 R12: 0000000000000001 R13: ffffffffc01c4478 R14: dead000000000122 R15: dead000000000100 FS: 00007f3a8a52c740(0000) GS:ffff9e631ad1c000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055bb289419c8 CR3: 000000011274e001 CR4: 00000000003706f0 Call Trace: bnxt_remove_one+0x57/0x180 [bnxt_en] pci_device_remove+0x39/0xc0 device_release_driver_internal+0xa5/0x130 driver_detach+0x42/0x90 bus_remove_driver+0x61/0xc0 pci_unregister_driver+0x38/0x90 bnxt_exit+0xc/0x7d0 [bnxt_en] Fixes: 004b5008016a ("eth: bnxt: remove most dependencies on RTNL") Reviewed-by: Pavan Chebbi Signed-off-by: Michael Chan Reviewed-by: Vadim Fedorenko Link: https://patch.msgid.link/20250816183850.4125033-1-michael.chan@broadcom.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/net') diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 2800a90fba1f..207a8bb36ae5 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -5332,7 +5332,7 @@ static void bnxt_free_ntp_fltrs(struct bnxt *bp, bool all) { int i; - netdev_assert_locked(bp->dev); + netdev_assert_locked_or_invisible(bp->dev); /* Under netdev instance lock and all our NAPIs have been disabled. * It's safe to delete the hash table. -- cgit v1.2.3 From 62c30c544359aa18b8fb2734166467a07d435c2d Mon Sep 17 00:00:00 2001 From: Qingfang Deng Date: Thu, 14 Aug 2025 09:25:57 +0800 Subject: net: ethernet: mtk_ppe: add RCU lock around dev_fill_forward_path Ensure ndo_fill_forward_path() is called with RCU lock held. Fixes: 2830e314778d ("net: ethernet: mtk-ppe: fix traffic offload with bridged wlan") Signed-off-by: Qingfang Deng Link: https://patch.msgid.link/20250814012559.3705-1-dqfext@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/mediatek/mtk_ppe_offload.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/net') diff --git a/drivers/net/ethernet/mediatek/mtk_ppe_offload.c b/drivers/net/ethernet/mediatek/mtk_ppe_offload.c index c855fb799ce1..e9bd32741983 100644 --- a/drivers/net/ethernet/mediatek/mtk_ppe_offload.c +++ b/drivers/net/ethernet/mediatek/mtk_ppe_offload.c @@ -101,7 +101,9 @@ mtk_flow_get_wdma_info(struct net_device *dev, const u8 *addr, struct mtk_wdma_i if (!IS_ENABLED(CONFIG_NET_MEDIATEK_SOC_WED)) return -1; + rcu_read_lock(); err = dev_fill_forward_path(dev, addr, &stack); + rcu_read_unlock(); if (err) return err; -- cgit v1.2.3 From 0417adf367a0af11adf7ace849af4638cfb573f7 Mon Sep 17 00:00:00 2001 From: Qingfang Deng Date: Thu, 14 Aug 2025 09:25:58 +0800 Subject: ppp: fix race conditions in ppp_fill_forward_path ppp_fill_forward_path() has two race conditions: 1. The ppp->channels list can change between list_empty() and list_first_entry(), as ppp_lock() is not held. If the only channel is deleted in ppp_disconnect_channel(), list_first_entry() may access an empty head or a freed entry, and trigger a panic. 2. pch->chan can be NULL. When ppp_unregister_channel() is called, pch->chan is set to NULL before pch is removed from ppp->channels. Fix these by using a lockless RCU approach: - Use list_first_or_null_rcu() to safely test and access the first list entry. - Convert list modifications on ppp->channels to their RCU variants and add synchronize_net() after removal. - Check for a NULL pch->chan before dereferencing it. Fixes: f6efc675c9dd ("net: ppp: resolve forwarding path for bridge pppoe devices") Signed-off-by: Qingfang Deng Link: https://patch.msgid.link/20250814012559.3705-2-dqfext@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/ppp/ppp_generic.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'drivers/net') diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index 8c98cbd4b06d..824c8dc4120b 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -1598,11 +1599,14 @@ static int ppp_fill_forward_path(struct net_device_path_ctx *ctx, if (ppp->flags & SC_MULTILINK) return -EOPNOTSUPP; - if (list_empty(&ppp->channels)) + pch = list_first_or_null_rcu(&ppp->channels, struct channel, clist); + if (!pch) + return -ENODEV; + + chan = READ_ONCE(pch->chan); + if (!chan) return -ENODEV; - pch = list_first_entry(&ppp->channels, struct channel, clist); - chan = pch->chan; if (!chan->ops->fill_forward_path) return -EOPNOTSUPP; @@ -2994,7 +2998,7 @@ ppp_unregister_channel(struct ppp_channel *chan) */ down_write(&pch->chan_sem); spin_lock_bh(&pch->downl); - pch->chan = NULL; + WRITE_ONCE(pch->chan, NULL); spin_unlock_bh(&pch->downl); up_write(&pch->chan_sem); ppp_disconnect_channel(pch); @@ -3515,7 +3519,7 @@ ppp_connect_channel(struct channel *pch, int unit) hdrlen = pch->file.hdrlen + 2; /* for protocol bytes */ if (hdrlen > ppp->dev->hard_header_len) ppp->dev->hard_header_len = hdrlen; - list_add_tail(&pch->clist, &ppp->channels); + list_add_tail_rcu(&pch->clist, &ppp->channels); ++ppp->n_channels; pch->ppp = ppp; refcount_inc(&ppp->file.refcnt); @@ -3545,10 +3549,11 @@ ppp_disconnect_channel(struct channel *pch) if (ppp) { /* remove it from the ppp unit's list */ ppp_lock(ppp); - list_del(&pch->clist); + list_del_rcu(&pch->clist); if (--ppp->n_channels == 0) wake_up_interruptible(&ppp->file.rwait); ppp_unlock(ppp); + synchronize_net(); if (refcount_dec_and_test(&ppp->file.refcnt)) ppp_destroy_interface(ppp); err = 0; -- cgit v1.2.3 From 01792bc3e5bdafa171dd83c7073f00e7de93a653 Mon Sep 17 00:00:00 2001 From: MD Danish Anwar Date: Thu, 14 Aug 2025 16:21:06 +0530 Subject: net: ti: icssg-prueth: Fix HSR and switch offload Enablement during firwmare reload. To enable HSR / Switch offload, certain configurations are needed. Currently they are done inside icssg_change_mode(). This function only gets called if we move from one mode to another without bringing the links up / down. Once in HSR / Switch mode, if we bring the links down and bring it back up again. The callback sequence is, - emac_ndo_stop() Firmwares are stopped - emac_ndo_open() Firmwares are loaded In this path icssg_change_mode() doesn't get called and as a result the configurations needed for HSR / Switch is not done. To fix this, put all these configurations in a separate function icssg_enable_fw_offload() and call this from both icssg_change_mode() and emac_ndo_open() Fixes: 56375086d093 ("net: ti: icssg-prueth: Enable HSR Tx duplication, Tx Tag and Rx Tag offload") Signed-off-by: MD Danish Anwar Link: https://patch.msgid.link/20250814105106.1491871-1-danishanwar@ti.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/ti/icssg/icssg_prueth.c | 72 ++++++++++++++++------------ 1 file changed, 41 insertions(+), 31 deletions(-) (limited to 'drivers/net') diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.c b/drivers/net/ethernet/ti/icssg/icssg_prueth.c index 6c7d776ae4ee..dadce6009791 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_prueth.c +++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.c @@ -203,6 +203,44 @@ static void prueth_emac_stop(struct prueth *prueth) } } +static void icssg_enable_fw_offload(struct prueth *prueth) +{ + struct prueth_emac *emac; + int mac; + + for (mac = PRUETH_MAC0; mac < PRUETH_NUM_MACS; mac++) { + emac = prueth->emac[mac]; + if (prueth->is_hsr_offload_mode) { + if (emac->ndev->features & NETIF_F_HW_HSR_TAG_RM) + icssg_set_port_state(emac, ICSSG_EMAC_HSR_RX_OFFLOAD_ENABLE); + else + icssg_set_port_state(emac, ICSSG_EMAC_HSR_RX_OFFLOAD_DISABLE); + } + + if (prueth->is_switch_mode || prueth->is_hsr_offload_mode) { + if (netif_running(emac->ndev)) { + icssg_fdb_add_del(emac, eth_stp_addr, prueth->default_vlan, + ICSSG_FDB_ENTRY_P0_MEMBERSHIP | + ICSSG_FDB_ENTRY_P1_MEMBERSHIP | + ICSSG_FDB_ENTRY_P2_MEMBERSHIP | + ICSSG_FDB_ENTRY_BLOCK, + true); + icssg_vtbl_modify(emac, emac->port_vlan | DEFAULT_VID, + BIT(emac->port_id) | DEFAULT_PORT_MASK, + BIT(emac->port_id) | DEFAULT_UNTAG_MASK, + true); + if (prueth->is_hsr_offload_mode) + icssg_vtbl_modify(emac, DEFAULT_VID, + DEFAULT_PORT_MASK, + DEFAULT_UNTAG_MASK, true); + icssg_set_pvid(prueth, emac->port_vlan, emac->port_id); + if (prueth->is_switch_mode) + icssg_set_port_state(emac, ICSSG_EMAC_PORT_VLAN_AWARE_ENABLE); + } + } + } +} + static int prueth_emac_common_start(struct prueth *prueth) { struct prueth_emac *emac; @@ -753,6 +791,7 @@ static int emac_ndo_open(struct net_device *ndev) ret = prueth_emac_common_start(prueth); if (ret) goto free_rx_irq; + icssg_enable_fw_offload(prueth); } flow_cfg = emac->dram.va + ICSSG_CONFIG_OFFSET + PSI_L_REGULAR_FLOW_ID_BASE_OFFSET; @@ -1360,8 +1399,7 @@ static int prueth_emac_restart(struct prueth *prueth) static void icssg_change_mode(struct prueth *prueth) { - struct prueth_emac *emac; - int mac, ret; + int ret; ret = prueth_emac_restart(prueth); if (ret) { @@ -1369,35 +1407,7 @@ static void icssg_change_mode(struct prueth *prueth) return; } - for (mac = PRUETH_MAC0; mac < PRUETH_NUM_MACS; mac++) { - emac = prueth->emac[mac]; - if (prueth->is_hsr_offload_mode) { - if (emac->ndev->features & NETIF_F_HW_HSR_TAG_RM) - icssg_set_port_state(emac, ICSSG_EMAC_HSR_RX_OFFLOAD_ENABLE); - else - icssg_set_port_state(emac, ICSSG_EMAC_HSR_RX_OFFLOAD_DISABLE); - } - - if (netif_running(emac->ndev)) { - icssg_fdb_add_del(emac, eth_stp_addr, prueth->default_vlan, - ICSSG_FDB_ENTRY_P0_MEMBERSHIP | - ICSSG_FDB_ENTRY_P1_MEMBERSHIP | - ICSSG_FDB_ENTRY_P2_MEMBERSHIP | - ICSSG_FDB_ENTRY_BLOCK, - true); - icssg_vtbl_modify(emac, emac->port_vlan | DEFAULT_VID, - BIT(emac->port_id) | DEFAULT_PORT_MASK, - BIT(emac->port_id) | DEFAULT_UNTAG_MASK, - true); - if (prueth->is_hsr_offload_mode) - icssg_vtbl_modify(emac, DEFAULT_VID, - DEFAULT_PORT_MASK, - DEFAULT_UNTAG_MASK, true); - icssg_set_pvid(prueth, emac->port_vlan, emac->port_id); - if (prueth->is_switch_mode) - icssg_set_port_state(emac, ICSSG_EMAC_PORT_VLAN_AWARE_ENABLE); - } - } + icssg_enable_fw_offload(prueth); } static int prueth_netdevice_port_link(struct net_device *ndev, -- cgit v1.2.3 From 4a73a36cb704813f588af13d9842d0ba5a185758 Mon Sep 17 00:00:00 2001 From: Lubomir Rintel Date: Thu, 14 Aug 2025 17:42:14 +0200 Subject: cdc_ncm: Flag Intel OEM version of Fibocom L850-GL as WWAN This lets NetworkManager/ModemManager know that this is a modem and needs to be connected first. Signed-off-by: Lubomir Rintel Link: https://patch.msgid.link/20250814154214.250103-1-lkundrak@v3.sk Signed-off-by: Jakub Kicinski --- drivers/net/usb/cdc_ncm.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'drivers/net') diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index ea0e5e276cd6..5d123df0a866 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -2087,6 +2087,13 @@ static const struct usb_device_id cdc_devs[] = { .driver_info = (unsigned long)&wwan_info, }, + /* Intel modem (label from OEM reads Fibocom L850-GL) */ + { USB_DEVICE_AND_INTERFACE_INFO(0x8087, 0x095a, + USB_CLASS_COMM, + USB_CDC_SUBCLASS_NCM, USB_CDC_PROTO_NONE), + .driver_info = (unsigned long)&wwan_info, + }, + /* DisplayLink docking stations */ { .match_flags = USB_DEVICE_ID_MATCH_INT_INFO | USB_DEVICE_ID_MATCH_VENDOR, -- cgit v1.2.3 From bc1a59cff9f797bfbf8f3104507584d89e9ecf2e Mon Sep 17 00:00:00 2001 From: Horatiu Vultur Date: Mon, 18 Aug 2025 10:10:29 +0200 Subject: phy: mscc: Fix timestamping for vsc8584 There was a problem when we received frames and the frames were timestamped. The driver is configured to store the nanosecond part of the timestmap in the ptp reserved bits and it would take the second part by reading the LTC. The problem is that when reading the LTC we are in atomic context and to read the second part will go over mdio bus which might sleep, so we get an error. The fix consists in actually put all the frames in a queue and start the aux work and in that work to read the LTC and then calculate the full received time. Fixes: 7d272e63e0979d ("net: phy: mscc: timestamping and PHC support") Signed-off-by: Horatiu Vultur Reviewed-by: Vadim Fedorenko Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/20250818081029.1300780-1-horatiu.vultur@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/mscc/mscc.h | 12 ++++++++++ drivers/net/phy/mscc/mscc_main.c | 12 ++++++++++ drivers/net/phy/mscc/mscc_ptp.c | 49 ++++++++++++++++++++++++++++++---------- 3 files changed, 61 insertions(+), 12 deletions(-) (limited to 'drivers/net') diff --git a/drivers/net/phy/mscc/mscc.h b/drivers/net/phy/mscc/mscc.h index 6a3d8a754eb8..58c6d47fbe04 100644 --- a/drivers/net/phy/mscc/mscc.h +++ b/drivers/net/phy/mscc/mscc.h @@ -362,6 +362,13 @@ struct vsc85xx_hw_stat { u16 mask; }; +struct vsc8531_skb_cb { + u32 ns; +}; + +#define VSC8531_SKB_CB(skb) \ + ((struct vsc8531_skb_cb *)((skb)->cb)) + struct vsc8531_private { int rate_magic; u16 supp_led_modes; @@ -410,6 +417,11 @@ struct vsc8531_private { */ struct mutex ts_lock; struct mutex phc_lock; + + /* list of skbs that were received and need timestamp information but it + * didn't received it yet + */ + struct sk_buff_head rx_skbs_list; }; /* Shared structure between the PHYs of the same package. diff --git a/drivers/net/phy/mscc/mscc_main.c b/drivers/net/phy/mscc/mscc_main.c index 7ed6522fb0ef..f1c9ce351ab4 100644 --- a/drivers/net/phy/mscc/mscc_main.c +++ b/drivers/net/phy/mscc/mscc_main.c @@ -2335,6 +2335,13 @@ static int vsc85xx_probe(struct phy_device *phydev) return vsc85xx_dt_led_modes_get(phydev, default_mode); } +static void vsc85xx_remove(struct phy_device *phydev) +{ + struct vsc8531_private *priv = phydev->priv; + + skb_queue_purge(&priv->rx_skbs_list); +} + /* Microsemi VSC85xx PHYs */ static struct phy_driver vsc85xx_driver[] = { { @@ -2589,6 +2596,7 @@ static struct phy_driver vsc85xx_driver[] = { .config_intr = &vsc85xx_config_intr, .suspend = &genphy_suspend, .resume = &genphy_resume, + .remove = &vsc85xx_remove, .probe = &vsc8574_probe, .set_wol = &vsc85xx_wol_set, .get_wol = &vsc85xx_wol_get, @@ -2614,6 +2622,7 @@ static struct phy_driver vsc85xx_driver[] = { .config_intr = &vsc85xx_config_intr, .suspend = &genphy_suspend, .resume = &genphy_resume, + .remove = &vsc85xx_remove, .probe = &vsc8574_probe, .set_wol = &vsc85xx_wol_set, .get_wol = &vsc85xx_wol_get, @@ -2639,6 +2648,7 @@ static struct phy_driver vsc85xx_driver[] = { .config_intr = &vsc85xx_config_intr, .suspend = &genphy_suspend, .resume = &genphy_resume, + .remove = &vsc85xx_remove, .probe = &vsc8584_probe, .get_tunable = &vsc85xx_get_tunable, .set_tunable = &vsc85xx_set_tunable, @@ -2662,6 +2672,7 @@ static struct phy_driver vsc85xx_driver[] = { .config_intr = &vsc85xx_config_intr, .suspend = &genphy_suspend, .resume = &genphy_resume, + .remove = &vsc85xx_remove, .probe = &vsc8584_probe, .get_tunable = &vsc85xx_get_tunable, .set_tunable = &vsc85xx_set_tunable, @@ -2685,6 +2696,7 @@ static struct phy_driver vsc85xx_driver[] = { .config_intr = &vsc85xx_config_intr, .suspend = &genphy_suspend, .resume = &genphy_resume, + .remove = &vsc85xx_remove, .probe = &vsc8584_probe, .get_tunable = &vsc85xx_get_tunable, .set_tunable = &vsc85xx_set_tunable, diff --git a/drivers/net/phy/mscc/mscc_ptp.c b/drivers/net/phy/mscc/mscc_ptp.c index 275706de5847..de6c7312e8f2 100644 --- a/drivers/net/phy/mscc/mscc_ptp.c +++ b/drivers/net/phy/mscc/mscc_ptp.c @@ -1194,9 +1194,7 @@ static bool vsc85xx_rxtstamp(struct mii_timestamper *mii_ts, { struct vsc8531_private *vsc8531 = container_of(mii_ts, struct vsc8531_private, mii_ts); - struct skb_shared_hwtstamps *shhwtstamps = NULL; struct vsc85xx_ptphdr *ptphdr; - struct timespec64 ts; unsigned long ns; if (!vsc8531->ptp->configured) @@ -1206,27 +1204,52 @@ static bool vsc85xx_rxtstamp(struct mii_timestamper *mii_ts, type == PTP_CLASS_NONE) return false; - vsc85xx_gettime(&vsc8531->ptp->caps, &ts); - ptphdr = get_ptp_header_rx(skb, vsc8531->ptp->rx_filter); if (!ptphdr) return false; - shhwtstamps = skb_hwtstamps(skb); - memset(shhwtstamps, 0, sizeof(struct skb_shared_hwtstamps)); - ns = ntohl(ptphdr->rsrvd2); - /* nsec is in reserved field */ - if (ts.tv_nsec < ns) - ts.tv_sec--; + VSC8531_SKB_CB(skb)->ns = ns; + skb_queue_tail(&vsc8531->rx_skbs_list, skb); - shhwtstamps->hwtstamp = ktime_set(ts.tv_sec, ns); - netif_rx(skb); + ptp_schedule_worker(vsc8531->ptp->ptp_clock, 0); return true; } +static long vsc85xx_do_aux_work(struct ptp_clock_info *info) +{ + struct vsc85xx_ptp *ptp = container_of(info, struct vsc85xx_ptp, caps); + struct skb_shared_hwtstamps *shhwtstamps = NULL; + struct phy_device *phydev = ptp->phydev; + struct vsc8531_private *priv = phydev->priv; + struct sk_buff_head received; + struct sk_buff *rx_skb; + struct timespec64 ts; + unsigned long flags; + + __skb_queue_head_init(&received); + spin_lock_irqsave(&priv->rx_skbs_list.lock, flags); + skb_queue_splice_tail_init(&priv->rx_skbs_list, &received); + spin_unlock_irqrestore(&priv->rx_skbs_list.lock, flags); + + vsc85xx_gettime(info, &ts); + while ((rx_skb = __skb_dequeue(&received)) != NULL) { + shhwtstamps = skb_hwtstamps(rx_skb); + memset(shhwtstamps, 0, sizeof(struct skb_shared_hwtstamps)); + + if (ts.tv_nsec < VSC8531_SKB_CB(rx_skb)->ns) + ts.tv_sec--; + + shhwtstamps->hwtstamp = ktime_set(ts.tv_sec, + VSC8531_SKB_CB(rx_skb)->ns); + netif_rx(rx_skb); + } + + return -1; +} + static const struct ptp_clock_info vsc85xx_clk_caps = { .owner = THIS_MODULE, .name = "VSC85xx timer", @@ -1240,6 +1263,7 @@ static const struct ptp_clock_info vsc85xx_clk_caps = { .adjfine = &vsc85xx_adjfine, .gettime64 = &vsc85xx_gettime, .settime64 = &vsc85xx_settime, + .do_aux_work = &vsc85xx_do_aux_work, }; static struct vsc8531_private *vsc8584_base_priv(struct phy_device *phydev) @@ -1567,6 +1591,7 @@ int vsc8584_ptp_probe(struct phy_device *phydev) mutex_init(&vsc8531->phc_lock); mutex_init(&vsc8531->ts_lock); + skb_queue_head_init(&vsc8531->rx_skbs_list); /* Retrieve the shared load/save GPIO. Request it as non exclusive as * the same GPIO can be requested by all the PHYs of the same package. -- cgit v1.2.3 From 24ef2f53c07f273bad99173e27ee88d44d135b1c Mon Sep 17 00:00:00 2001 From: Yuichiro Tsuji Date: Mon, 18 Aug 2025 17:45:07 +0900 Subject: net: usb: asix_devices: Fix PHY address mask in MDIO bus initialization Syzbot reported shift-out-of-bounds exception on MDIO bus initialization. The PHY address should be masked to 5 bits (0-31). Without this mask, invalid PHY addresses could be used, potentially causing issues with MDIO bus operations. Fix this by masking the PHY address with 0x1f (31 decimal) to ensure it stays within the valid range. Fixes: 4faff70959d5 ("net: usb: asix_devices: add phy_mask for ax88772 mdio bus") Reported-by: syzbot+20537064367a0f98d597@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=20537064367a0f98d597 Tested-by: syzbot+20537064367a0f98d597@syzkaller.appspotmail.com Signed-off-by: Yuichiro Tsuji Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20250818084541.1958-1-yuichtsu@amazon.com Signed-off-by: Jakub Kicinski --- drivers/net/usb/asix_devices.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/net') diff --git a/drivers/net/usb/asix_devices.c b/drivers/net/usb/asix_devices.c index d9f5942ccc44..792ddda1ad49 100644 --- a/drivers/net/usb/asix_devices.c +++ b/drivers/net/usb/asix_devices.c @@ -676,7 +676,7 @@ static int ax88772_init_mdio(struct usbnet *dev) priv->mdio->read = &asix_mdio_bus_read; priv->mdio->write = &asix_mdio_bus_write; priv->mdio->name = "Asix MDIO Bus"; - priv->mdio->phy_mask = ~(BIT(priv->phy_addr) | BIT(AX_EMBD_PHY_ADDR)); + priv->mdio->phy_mask = ~(BIT(priv->phy_addr & 0x1f) | BIT(AX_EMBD_PHY_ADDR)); /* mii bus name is usb-- */ snprintf(priv->mdio->id, MII_BUS_ID_SIZE, "usb-%03d:%03d", dev->udev->bus->busnum, dev->udev->devnum); -- cgit v1.2.3 From 75a9a46d67f46d608205888f9b34e315c1786345 Mon Sep 17 00:00:00 2001 From: Jordan Rhee Date: Mon, 18 Aug 2025 14:12:45 -0700 Subject: gve: prevent ethtool ops after shutdown A crash can occur if an ethtool operation is invoked after shutdown() is called. shutdown() is invoked during system shutdown to stop DMA operations without performing expensive deallocations. It is discouraged to unregister the netdev in this path, so the device may still be visible to userspace and kernel helpers. In gve, shutdown() tears down most internal data structures. If an ethtool operation is dispatched after shutdown(), it will dereference freed or NULL pointers, leading to a kernel panic. While graceful shutdown normally quiesces userspace before invoking the reboot syscall, forced shutdowns (as observed on GCP VMs) can still trigger this path. Fix by calling netif_device_detach() in shutdown(). This marks the device as detached so the ethtool ioctl handler will skip dispatching operations to the driver. Fixes: 974365e51861 ("gve: Implement suspend/resume/shutdown") Signed-off-by: Jordan Rhee Signed-off-by: Jeroen de Borst Link: https://patch.msgid.link/20250818211245.1156919-1-jeroendb@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/google/gve/gve_main.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/net') diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c index 1f411d7c4373..1be1b1ef31ee 100644 --- a/drivers/net/ethernet/google/gve/gve_main.c +++ b/drivers/net/ethernet/google/gve/gve_main.c @@ -2870,6 +2870,8 @@ static void gve_shutdown(struct pci_dev *pdev) struct gve_priv *priv = netdev_priv(netdev); bool was_up = netif_running(priv->dev); + netif_device_detach(netdev); + rtnl_lock(); netdev_lock(netdev); if (was_up && gve_close(priv->dev)) { -- cgit v1.2.3 From 6d6714bf0c4e8eb2274081b4b023dfa01581c123 Mon Sep 17 00:00:00 2001 From: Yao Zi Date: Fri, 15 Aug 2025 10:48:03 +0000 Subject: net: stmmac: thead: Enable TX clock before MAC initialization The clk_tx_i clock must be supplied to the MAC for successful initialization. On TH1520 SoC, the clock is provided by an internal divider configured through GMAC_PLLCLK_DIV register when using RGMII interface. However, currently we don't setup the divider before initialization of the MAC, resulting in DMA reset failures if the bootloader/firmware doesn't enable the divider, [ 7.839601] thead-dwmac ffe7060000.ethernet eth0: Register MEM_TYPE_PAGE_POOL RxQ-0 [ 7.938338] thead-dwmac ffe7060000.ethernet eth0: PHY [stmmac-0:02] driver [RTL8211F Gigabit Ethernet] (irq=POLL) [ 8.160746] thead-dwmac ffe7060000.ethernet eth0: Failed to reset the dma [ 8.170118] thead-dwmac ffe7060000.ethernet eth0: stmmac_hw_setup: DMA engine initialization failed [ 8.179384] thead-dwmac ffe7060000.ethernet eth0: __stmmac_open: Hw setup failed Let's simply write GMAC_PLLCLK_DIV_EN to GMAC_PLLCLK_DIV to enable the divider before MAC initialization. Note that for reconfiguring the divisor, the divider must be disabled first and re-enabled later to make sure the new divisor take effect. The exact clock rate doesn't affect MAC's initialization according to my test. It's set to the speed required by RGMII when the linkspeed is 1Gbps and could be reclocked later after link is up if necessary. Fixes: 33a1a01e3afa ("net: stmmac: Add glue layer for T-HEAD TH1520 SoC") Signed-off-by: Yao Zi Reviewed-by: Drew Fustini Link: https://patch.msgid.link/20250815104803.55294-1-ziyao@disroot.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'drivers/net') diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c index f2946bea0bc2..6c6c49e4b66f 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-thead.c @@ -152,7 +152,7 @@ static int thead_set_clk_tx_rate(void *bsp_priv, struct clk *clk_tx_i, static int thead_dwmac_enable_clk(struct plat_stmmacenet_data *plat) { struct thead_dwmac *dwmac = plat->bsp_priv; - u32 reg; + u32 reg, div; switch (plat->mac_interface) { case PHY_INTERFACE_MODE_MII: @@ -164,6 +164,13 @@ static int thead_dwmac_enable_clk(struct plat_stmmacenet_data *plat) case PHY_INTERFACE_MODE_RGMII_RXID: case PHY_INTERFACE_MODE_RGMII_TXID: /* use pll */ + div = clk_get_rate(plat->stmmac_clk) / rgmii_clock(SPEED_1000); + reg = FIELD_PREP(GMAC_PLLCLK_DIV_EN, 1) | + FIELD_PREP(GMAC_PLLCLK_DIV_NUM, div); + + writel(0, dwmac->apb_base + GMAC_PLLCLK_DIV); + writel(reg, dwmac->apb_base + GMAC_PLLCLK_DIV); + writel(GMAC_GTXCLK_SEL_PLL, dwmac->apb_base + GMAC_GTXCLK_SEL); reg = GMAC_TX_CLK_EN | GMAC_TX_CLK_N_EN | GMAC_TX_CLK_OUT_EN | GMAC_RX_CLK_EN | GMAC_RX_CLK_N_EN; -- cgit v1.2.3 From 2462c1b9217246a889ec318b3894d84e4dd709c6 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Sun, 17 Aug 2025 23:23:17 +0300 Subject: net/mlx5: HWS, fix bad parameter in CQ creation 'cqe_sz' valid value should be 0 for 64-byte CQE. Fixes: 2ca62599aa0b ("net/mlx5: HWS, added send engine and context handling") Signed-off-by: Yevgeny Kliteynik Reviewed-by: Vlad Dogaru Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250817202323.308604-2-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/net') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c index c4b22be19a9b..b0595c9b09e4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/send.c @@ -964,7 +964,6 @@ static int hws_send_ring_open_cq(struct mlx5_core_dev *mdev, return -ENOMEM; MLX5_SET(cqc, cqc_data, uar_page, mdev->priv.uar->index); - MLX5_SET(cqc, cqc_data, cqe_sz, queue->num_entries); MLX5_SET(cqc, cqc_data, log_cq_size, ilog2(queue->num_entries)); err = hws_send_ring_alloc_cq(mdev, numa_node, queue, cqc_data, cq); -- cgit v1.2.3 From 615b690612b7785ab8632f6a5a941550622e4e36 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Sun, 17 Aug 2025 23:23:18 +0300 Subject: net/mlx5: HWS, fix simple rules rehash error flow Moving rules from matcher to matcher should not fail. However, if it does fail due to various reasons, the error flow should allow the kernel to continue functioning (albeit with broken steering rules) instead of going into series of soft lock-ups or some other problematic behaviour. This patch fixes the error flow for moving simple rules: - If new rule creation fails before it was even enqeued, do not poll for completion - If TIMEOUT happened while moving the rule, no point trying to poll for completions for other rules. Something is broken, completion won't come, just abort the rehash sequence. - If some other completion with error received, don't give up. Continue handling rest of the rules to minimize the damage. - Make sure that the first error code that was received will be actually returned to the caller instead of replacing it with the generic error code. All the aforementioned issues stem from the same bad error flow, so no point fixing them one by one and leaving partially broken code - fixing them in one patch. Fixes: ef94799a8741 ("net/mlx5: HWS, rework rehash loop") Signed-off-by: Yevgeny Kliteynik Reviewed-by: Vlad Dogaru Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250817202323.308604-3-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- .../ethernet/mellanox/mlx5/core/steering/hws/bwc.c | 61 +++++++++++++++------- 1 file changed, 43 insertions(+), 18 deletions(-) (limited to 'drivers/net') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c index 92de4b761a83..0219a49b2326 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c @@ -74,9 +74,9 @@ static void hws_bwc_matcher_init_attr(struct mlx5hws_bwc_matcher *bwc_matcher, static int hws_bwc_matcher_move_all_simple(struct mlx5hws_bwc_matcher *bwc_matcher) { - bool move_error = false, poll_error = false, drain_error = false; struct mlx5hws_context *ctx = bwc_matcher->matcher->tbl->ctx; struct mlx5hws_matcher *matcher = bwc_matcher->matcher; + int drain_error = 0, move_error = 0, poll_error = 0; u16 bwc_queues = mlx5hws_bwc_queues(ctx); struct mlx5hws_rule_attr rule_attr; struct mlx5hws_bwc_rule *bwc_rule; @@ -99,11 +99,15 @@ hws_bwc_matcher_move_all_simple(struct mlx5hws_bwc_matcher *bwc_matcher) ret = mlx5hws_matcher_resize_rule_move(matcher, bwc_rule->rule, &rule_attr); - if (unlikely(ret && !move_error)) { - mlx5hws_err(ctx, - "Moving BWC rule: move failed (%d), attempting to move rest of the rules\n", - ret); - move_error = true; + if (unlikely(ret)) { + if (!move_error) { + mlx5hws_err(ctx, + "Moving BWC rule: move failed (%d), attempting to move rest of the rules\n", + ret); + move_error = ret; + } + /* Rule wasn't queued, no need to poll */ + continue; } pending_rules++; @@ -111,11 +115,19 @@ hws_bwc_matcher_move_all_simple(struct mlx5hws_bwc_matcher *bwc_matcher) rule_attr.queue_id, &pending_rules, false); - if (unlikely(ret && !poll_error)) { - mlx5hws_err(ctx, - "Moving BWC rule: poll failed (%d), attempting to move rest of the rules\n", - ret); - poll_error = true; + if (unlikely(ret)) { + if (ret == -ETIMEDOUT) { + mlx5hws_err(ctx, + "Moving BWC rule: timeout polling for completions (%d), aborting rehash\n", + ret); + return ret; + } + if (!poll_error) { + mlx5hws_err(ctx, + "Moving BWC rule: polling for completions failed (%d), attempting to move rest of the rules\n", + ret); + poll_error = ret; + } } } @@ -126,17 +138,30 @@ hws_bwc_matcher_move_all_simple(struct mlx5hws_bwc_matcher *bwc_matcher) rule_attr.queue_id, &pending_rules, true); - if (unlikely(ret && !drain_error)) { - mlx5hws_err(ctx, - "Moving BWC rule: drain failed (%d), attempting to move rest of the rules\n", - ret); - drain_error = true; + if (unlikely(ret)) { + if (ret == -ETIMEDOUT) { + mlx5hws_err(ctx, + "Moving bwc rule: timeout draining completions (%d), aborting rehash\n", + ret); + return ret; + } + if (!drain_error) { + mlx5hws_err(ctx, + "Moving bwc rule: drain failed (%d), attempting to move rest of the rules\n", + ret); + drain_error = ret; + } } } } - if (move_error || poll_error || drain_error) - ret = -EINVAL; + /* Return the first error that happened */ + if (unlikely(move_error)) + return move_error; + if (unlikely(poll_error)) + return poll_error; + if (unlikely(drain_error)) + return drain_error; return ret; } -- cgit v1.2.3 From 4a842b1bf18a32ee0c25dd6dd98728b786a76fe4 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Sun, 17 Aug 2025 23:23:19 +0300 Subject: net/mlx5: HWS, fix complex rules rehash error flow Moving rules from matcher to matcher should not fail. However, if it does fail due to various reasons, the error flow should allow the kernel to continue functioning (albeit with broken steering rules) instead of going into series of soft lock-ups or some other problematic behaviour. Similar to the simple rules, complex rules rehash logic suffers from the same problems. This patch fixes the error flow for moving complex rules: - If new rule creation fails before it was even enqeued, do not poll for completion - If TIMEOUT happened while moving the rule, no point trying to poll for completions for other rules. Something is broken, completion won't come, just abort the rehash sequence. - If some other completion with error received, don't give up. Continue handling rest of the rules to minimize the damage. - Make sure that the first error code that was received will be actually returned to the caller instead of replacing it with the generic error code. All the aforementioned issues stem from the same bad error flow, so no point fixing them one by one and leaving partially broken code - fixing them in one patch. Fixes: 17e0accac577 ("net/mlx5: HWS, support complex matchers") Signed-off-by: Yevgeny Kliteynik Reviewed-by: Vlad Dogaru Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250817202323.308604-4-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- .../mellanox/mlx5/core/steering/hws/bwc_complex.c | 41 +++++++++++++++------- 1 file changed, 28 insertions(+), 13 deletions(-) (limited to 'drivers/net') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc_complex.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc_complex.c index ca7501c57468..14e79579c719 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc_complex.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc_complex.c @@ -1328,11 +1328,11 @@ mlx5hws_bwc_matcher_move_all_complex(struct mlx5hws_bwc_matcher *bwc_matcher) { struct mlx5hws_context *ctx = bwc_matcher->matcher->tbl->ctx; struct mlx5hws_matcher *matcher = bwc_matcher->matcher; - bool move_error = false, poll_error = false; u16 bwc_queues = mlx5hws_bwc_queues(ctx); struct mlx5hws_bwc_rule *tmp_bwc_rule; struct mlx5hws_rule_attr rule_attr; struct mlx5hws_table *isolated_tbl; + int move_error = 0, poll_error = 0; struct mlx5hws_rule *tmp_rule; struct list_head *rules_list; u32 expected_completions = 1; @@ -1391,11 +1391,15 @@ mlx5hws_bwc_matcher_move_all_complex(struct mlx5hws_bwc_matcher *bwc_matcher) ret = mlx5hws_matcher_resize_rule_move(matcher, tmp_rule, &rule_attr); - if (unlikely(ret && !move_error)) { - mlx5hws_err(ctx, - "Moving complex BWC rule failed (%d), attempting to move rest of the rules\n", - ret); - move_error = true; + if (unlikely(ret)) { + if (!move_error) { + mlx5hws_err(ctx, + "Moving complex BWC rule: move failed (%d), attempting to move rest of the rules\n", + ret); + move_error = ret; + } + /* Rule wasn't queued, no need to poll */ + continue; } expected_completions = 1; @@ -1403,11 +1407,19 @@ mlx5hws_bwc_matcher_move_all_complex(struct mlx5hws_bwc_matcher *bwc_matcher) rule_attr.queue_id, &expected_completions, true); - if (unlikely(ret && !poll_error)) { - mlx5hws_err(ctx, - "Moving complex BWC rule: poll failed (%d), attempting to move rest of the rules\n", - ret); - poll_error = true; + if (unlikely(ret)) { + if (ret == -ETIMEDOUT) { + mlx5hws_err(ctx, + "Moving complex BWC rule: timeout polling for completions (%d), aborting rehash\n", + ret); + return ret; + } + if (!poll_error) { + mlx5hws_err(ctx, + "Moving complex BWC rule: polling for completions failed (%d), attempting to move rest of the rules\n", + ret); + poll_error = ret; + } } /* Done moving the rule to the new matcher, @@ -1422,8 +1434,11 @@ mlx5hws_bwc_matcher_move_all_complex(struct mlx5hws_bwc_matcher *bwc_matcher) } } - if (move_error || poll_error) - ret = -EINVAL; + /* Return the first error that happened */ + if (unlikely(move_error)) + return move_error; + if (unlikely(poll_error)) + return poll_error; return ret; } -- cgit v1.2.3 From 1a72298d27ce4d41b3fd405f6921e8711815767a Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Sun, 17 Aug 2025 23:23:20 +0300 Subject: net/mlx5: HWS, prevent rehash from filling up the queues While moving the rules during rehash, CQ is not drained. The flush and drain happens only when all the rules of a certain queue have been moved. This behaviour can lead to accumulating large quantity of rules that haven't got their completion yet, and eventually will fill up the queue and will cause the rehash to fail. Fix this problem by requiring drain once the number of outstanding completions reaches a certain threshold. Fixes: ef94799a8741 ("net/mlx5: HWS, rework rehash loop") Signed-off-by: Yevgeny Kliteynik Reviewed-by: Vlad Dogaru Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250817202323.308604-5-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'drivers/net') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c index 0219a49b2326..2a59be11fe55 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c @@ -84,6 +84,7 @@ hws_bwc_matcher_move_all_simple(struct mlx5hws_bwc_matcher *bwc_matcher) struct list_head *rules_list; u32 pending_rules; int i, ret = 0; + bool drain; mlx5hws_bwc_rule_fill_attr(bwc_matcher, 0, 0, &rule_attr); @@ -111,10 +112,12 @@ hws_bwc_matcher_move_all_simple(struct mlx5hws_bwc_matcher *bwc_matcher) } pending_rules++; + drain = pending_rules >= + hws_bwc_get_burst_th(ctx, rule_attr.queue_id); ret = mlx5hws_bwc_queue_poll(ctx, rule_attr.queue_id, &pending_rules, - false); + drain); if (unlikely(ret)) { if (ret == -ETIMEDOUT) { mlx5hws_err(ctx, -- cgit v1.2.3 From 7c60952f83584bc4950057cfed2cc3c87343b5db Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Sun, 17 Aug 2025 23:23:21 +0300 Subject: net/mlx5: HWS, don't rehash on every kind of insertion failure If rule creation failed due to a full queue, due to timeout in polling for completion, or due to matcher being in resize, don't try to initiate rehash sequence - rehash would have failed anyway. Fixes: 2111bb970c78 ("net/mlx5: HWS, added backward-compatible API handling") Signed-off-by: Yevgeny Kliteynik Reviewed-by: Vlad Dogaru Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250817202323.308604-6-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'drivers/net') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c index 2a59be11fe55..adeccc588e5d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c @@ -1063,6 +1063,21 @@ int mlx5hws_bwc_rule_create_simple(struct mlx5hws_bwc_rule *bwc_rule, return 0; /* rule inserted successfully */ } + /* Rule insertion could fail due to queue being full, timeout, or + * matcher in resize. In such cases, no point in trying to rehash. + */ + if (ret == -EBUSY || ret == -ETIMEDOUT || ret == -EAGAIN) { + mutex_unlock(queue_lock); + mlx5hws_err(ctx, + "BWC rule insertion failed - %s (%d)\n", + ret == -EBUSY ? "queue is full" : + ret == -ETIMEDOUT ? "timeout" : + ret == -EAGAIN ? "matcher in resize" : "N/A", + ret); + hws_bwc_rule_cnt_dec(bwc_rule); + return ret; + } + /* At this point the rule wasn't added. * It could be because there was collision, or some other problem. * Try rehash by size and insert rule again - last chance. -- cgit v1.2.3 From 8a51507320ebddaab32610199774f69cd7d53e78 Mon Sep 17 00:00:00 2001 From: Alex Vesker Date: Sun, 17 Aug 2025 23:23:22 +0300 Subject: net/mlx5: HWS, Fix table creation UID During table creation, caller passes a UID using ft_attr. The UID value was ignored, which leads to problems when the caller sets the UID to a non-zero value, such as SHARED_RESOURCE_UID (0xffff) - the internal FT objects will be created with UID=0. Fixes: 0869701cba3d ("net/mlx5: HWS, added FW commands handling") Signed-off-by: Alex Vesker Reviewed-by: Yevgeny Kliteynik Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250817202323.308604-7-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c | 1 + drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h | 1 + .../net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c | 1 + .../net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c | 5 ++++- .../net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h | 1 + .../net/ethernet/mellanox/mlx5/core/steering/hws/table.c | 13 ++++++++++--- .../net/ethernet/mellanox/mlx5/core/steering/hws/table.h | 3 ++- 7 files changed, 20 insertions(+), 5 deletions(-) (limited to 'drivers/net') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c index 9c83753e4592..0bdcab2e5cf3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.c @@ -55,6 +55,7 @@ int mlx5hws_cmd_flow_table_create(struct mlx5_core_dev *mdev, MLX5_SET(create_flow_table_in, in, opcode, MLX5_CMD_OP_CREATE_FLOW_TABLE); MLX5_SET(create_flow_table_in, in, table_type, ft_attr->type); + MLX5_SET(create_flow_table_in, in, uid, ft_attr->uid); ft_ctx = MLX5_ADDR_OF(create_flow_table_in, in, flow_table_context); MLX5_SET(flow_table_context, ft_ctx, level, ft_attr->level); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h index fa6bff210266..122ccc671628 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/cmd.h @@ -36,6 +36,7 @@ struct mlx5hws_cmd_set_fte_attr { struct mlx5hws_cmd_ft_create_attr { u8 type; u8 level; + u16 uid; bool rtc_valid; bool decap_en; bool reformat_en; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c index 57592b92e24b..131e74b2b774 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/fs_hws.c @@ -267,6 +267,7 @@ static int mlx5_cmd_hws_create_flow_table(struct mlx5_flow_root_namespace *ns, tbl_attr.type = MLX5HWS_TABLE_TYPE_FDB; tbl_attr.level = ft_attr->level; + tbl_attr.uid = ft_attr->uid; tbl = mlx5hws_table_create(ctx, &tbl_attr); if (!tbl) { mlx5_core_err(ns->dev, "Failed creating hws flow_table\n"); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c index f3ea09caba2b..32f87fdf3213 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/matcher.c @@ -85,6 +85,7 @@ static int hws_matcher_create_end_ft_isolated(struct mlx5hws_matcher *matcher) ret = mlx5hws_table_create_default_ft(tbl->ctx->mdev, tbl, + 0, &matcher->end_ft_id); if (ret) { mlx5hws_err(tbl->ctx, "Isolated matcher: failed to create end flow table\n"); @@ -112,7 +113,9 @@ static int hws_matcher_create_end_ft(struct mlx5hws_matcher *matcher) if (mlx5hws_matcher_is_isolated(matcher)) ret = hws_matcher_create_end_ft_isolated(matcher); else - ret = mlx5hws_table_create_default_ft(tbl->ctx->mdev, tbl, + ret = mlx5hws_table_create_default_ft(tbl->ctx->mdev, + tbl, + 0, &matcher->end_ft_id); if (ret) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h index 59c14745ed0c..2498ceff2060 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/mlx5hws.h @@ -75,6 +75,7 @@ struct mlx5hws_context_attr { struct mlx5hws_table_attr { enum mlx5hws_table_type type; u32 level; + u16 uid; }; enum mlx5hws_matcher_flow_src { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.c index 568f691733f3..6113383ae47b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.c @@ -9,6 +9,7 @@ u32 mlx5hws_table_get_id(struct mlx5hws_table *tbl) } static void hws_table_init_next_ft_attr(struct mlx5hws_table *tbl, + u16 uid, struct mlx5hws_cmd_ft_create_attr *ft_attr) { ft_attr->type = tbl->fw_ft_type; @@ -16,7 +17,9 @@ static void hws_table_init_next_ft_attr(struct mlx5hws_table *tbl, ft_attr->level = tbl->ctx->caps->fdb_ft.max_level - 1; else ft_attr->level = tbl->ctx->caps->nic_ft.max_level - 1; + ft_attr->rtc_valid = true; + ft_attr->uid = uid; } static void hws_table_set_cap_attr(struct mlx5hws_table *tbl, @@ -119,12 +122,12 @@ static int hws_table_connect_to_default_miss_tbl(struct mlx5hws_table *tbl, u32 int mlx5hws_table_create_default_ft(struct mlx5_core_dev *mdev, struct mlx5hws_table *tbl, - u32 *ft_id) + u16 uid, u32 *ft_id) { struct mlx5hws_cmd_ft_create_attr ft_attr = {0}; int ret; - hws_table_init_next_ft_attr(tbl, &ft_attr); + hws_table_init_next_ft_attr(tbl, uid, &ft_attr); hws_table_set_cap_attr(tbl, &ft_attr); ret = mlx5hws_cmd_flow_table_create(mdev, &ft_attr, ft_id); @@ -189,7 +192,10 @@ static int hws_table_init(struct mlx5hws_table *tbl) } mutex_lock(&ctx->ctrl_lock); - ret = mlx5hws_table_create_default_ft(tbl->ctx->mdev, tbl, &tbl->ft_id); + ret = mlx5hws_table_create_default_ft(tbl->ctx->mdev, + tbl, + tbl->uid, + &tbl->ft_id); if (ret) { mlx5hws_err(tbl->ctx, "Failed to create flow table object\n"); mutex_unlock(&ctx->ctrl_lock); @@ -239,6 +245,7 @@ struct mlx5hws_table *mlx5hws_table_create(struct mlx5hws_context *ctx, tbl->ctx = ctx; tbl->type = attr->type; tbl->level = attr->level; + tbl->uid = attr->uid; ret = hws_table_init(tbl); if (ret) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.h b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.h index 0400cce0c317..1246f9bd8422 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/table.h @@ -18,6 +18,7 @@ struct mlx5hws_table { enum mlx5hws_table_type type; u32 fw_ft_type; u32 level; + u16 uid; struct list_head matchers_list; struct list_head tbl_list_node; struct mlx5hws_default_miss default_miss; @@ -47,7 +48,7 @@ u32 mlx5hws_table_get_res_fw_ft_type(enum mlx5hws_table_type tbl_type, int mlx5hws_table_create_default_ft(struct mlx5_core_dev *mdev, struct mlx5hws_table *tbl, - u32 *ft_id); + u16 uid, u32 *ft_id); void mlx5hws_table_destroy_default_ft(struct mlx5hws_table *tbl, u32 ft_id); -- cgit v1.2.3 From d2d6f950cb43be6845a41cac5956cb2a10e657e5 Mon Sep 17 00:00:00 2001 From: Vlad Dogaru Date: Sun, 17 Aug 2025 23:23:23 +0300 Subject: net/mlx5: CT: Use the correct counter offset Specifying the counter action is not enough, as it is used by multiple counters that were allocated in a bulk. By omitting the offset, rules will be associated with a different counter from the same bulk. Subsequently, the CT subsystem checks the correct counter, assumes that no traffic has triggered the rule, and ages out the rule. The end result is intermittent offloading of long lived connections, as rules are aged out then promptly re-added. Fix this by specifying the correct offset along with the counter rule. Fixes: 34eea5b12a10 ("net/mlx5e: CT: Add initial support for Hardware Steering") Signed-off-by: Vlad Dogaru Reviewed-by: Yevgeny Kliteynik Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250817202323.308604-8-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en/tc/ct_fs_hmfs.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/net') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/ct_fs_hmfs.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/ct_fs_hmfs.c index a4263137fef5..01d522b02947 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc/ct_fs_hmfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc/ct_fs_hmfs.c @@ -173,6 +173,8 @@ static void mlx5_ct_fs_hmfs_fill_rule_actions(struct mlx5_ct_fs_hmfs *fs_hmfs, memset(rule_actions, 0, NUM_CT_HMFS_RULES * sizeof(*rule_actions)); rule_actions[0].action = mlx5_fc_get_hws_action(fs_hmfs->ctx, attr->counter); + rule_actions[0].counter.offset = + attr->counter->id - attr->counter->bulk->base_id; /* Modify header is special, it may require extra arguments outside the action itself. */ if (mh_action->mh_data) { rule_actions[1].modify_header.offset = mh_action->mh_data->offset; -- cgit v1.2.3 From 1683fd1b2fa79864d3c7a951d9cea0a9ba1a1923 Mon Sep 17 00:00:00 2001 From: Parthiban Veerasooran Date: Mon, 18 Aug 2025 11:35:13 +0530 Subject: microchip: lan865x: fix missing netif_start_queue() call on device open This fixes an issue where the transmit queue is started implicitly only the very first time the device is registered. When the device is taken down and brought back up again (using `ip` or `ifconfig`), the transmit queue is not restarted, causing packet transmission to hang. Adding an explicit call to netif_start_queue() in lan865x_net_open() ensures the transmit queue is properly started every time the device is reopened. Fixes: 5cd2340cb6a3 ("microchip: lan865x: add driver support for Microchip's LAN865X MAC-PHY") Signed-off-by: Parthiban Veerasooran Link: https://patch.msgid.link/20250818060514.52795-2-parthiban.veerasooran@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microchip/lan865x/lan865x.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/net') diff --git a/drivers/net/ethernet/microchip/lan865x/lan865x.c b/drivers/net/ethernet/microchip/lan865x/lan865x.c index dd436bdff0f8..d03f5a8de58d 100644 --- a/drivers/net/ethernet/microchip/lan865x/lan865x.c +++ b/drivers/net/ethernet/microchip/lan865x/lan865x.c @@ -311,6 +311,8 @@ static int lan865x_net_open(struct net_device *netdev) phy_start(netdev->phydev); + netif_start_queue(netdev); + return 0; } -- cgit v1.2.3 From 2cd58fec912acec273cb155911ab8f06ddbb131a Mon Sep 17 00:00:00 2001 From: Parthiban Veerasooran Date: Mon, 18 Aug 2025 11:35:14 +0530 Subject: microchip: lan865x: fix missing Timer Increment config for Rev.B0/B1 Fix missing configuration for LAN865x silicon revisions B0 and B1 as per Microchip Application Note AN1760 (Rev F, June 2024). The Timer Increment register was not being set, which is required for accurate timestamping. As per the application note, configure the MAC to set timestamping at the end of the Start of Frame Delimiter (SFD), and set the Timer Increment register to 40 ns (corresponding to a 25 MHz internal clock). Link: https://www.microchip.com/en-us/application-notes/an1760 Fixes: 5cd2340cb6a3 ("microchip: lan865x: add driver support for Microchip's LAN865X MAC-PHY") Signed-off-by: Parthiban Veerasooran Reviewed-by: Vadim Fedorenko Link: https://patch.msgid.link/20250818060514.52795-3-parthiban.veerasooran@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microchip/lan865x/lan865x.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'drivers/net') diff --git a/drivers/net/ethernet/microchip/lan865x/lan865x.c b/drivers/net/ethernet/microchip/lan865x/lan865x.c index d03f5a8de58d..84c41f193561 100644 --- a/drivers/net/ethernet/microchip/lan865x/lan865x.c +++ b/drivers/net/ethernet/microchip/lan865x/lan865x.c @@ -32,6 +32,10 @@ /* MAC Specific Addr 1 Top Reg */ #define LAN865X_REG_MAC_H_SADDR1 0x00010023 +/* MAC TSU Timer Increment Register */ +#define LAN865X_REG_MAC_TSU_TIMER_INCR 0x00010077 +#define MAC_TSU_TIMER_INCR_COUNT_NANOSECONDS 0x0028 + struct lan865x_priv { struct work_struct multicast_work; struct net_device *netdev; @@ -346,6 +350,21 @@ static int lan865x_probe(struct spi_device *spi) goto free_netdev; } + /* LAN865x Rev.B0/B1 configuration parameters from AN1760 + * As per the Configuration Application Note AN1760 published in the + * link, https://www.microchip.com/en-us/application-notes/an1760 + * Revision F (DS60001760G - June 2024), configure the MAC to set time + * stamping at the end of the Start of Frame Delimiter (SFD) and set the + * Timer Increment reg to 40 ns to be used as a 25 MHz internal clock. + */ + ret = oa_tc6_write_register(priv->tc6, LAN865X_REG_MAC_TSU_TIMER_INCR, + MAC_TSU_TIMER_INCR_COUNT_NANOSECONDS); + if (ret) { + dev_err(&spi->dev, "Failed to config TSU Timer Incr reg: %d\n", + ret); + goto oa_tc6_exit; + } + /* As per the point s3 in the below errata, SPI receive Ethernet frame * transfer may halt when starting the next frame in the same data block * (chunk) as the end of a previous frame. The RFA field should be -- cgit v1.2.3 From 4d4d9ef9dfee877d494e5418f68a1016ef08cad6 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Tue, 19 Aug 2025 15:19:57 -0700 Subject: ixgbe: xsk: resolve the negative overflow of budget in ixgbe_xmit_zc Resolve the budget negative overflow which leads to returning true in ixgbe_xmit_zc even when the budget of descs are thoroughly consumed. Before this patch, when the budget is decreased to zero and finishes sending the last allowed desc in ixgbe_xmit_zc, it will always turn back and enter into the while() statement to see if it should keep processing packets, but in the meantime it unexpectedly decreases the value again to 'unsigned int (0--)', namely, UINT_MAX. Finally, the ixgbe_xmit_zc returns true, showing 'we complete cleaning the budget'. That also means 'clean_complete = true' in ixgbe_poll. The true theory behind this is if that budget number of descs are consumed, it implies that we might have more descs to be done. So we should return false in ixgbe_xmit_zc to tell napi poll to find another chance to start polling to handle the rest of descs. On the contrary, returning true here means job done and we know we finish all the possible descs this time and we don't intend to start a new napi poll. It is apparently against our expectations. Please also see how ixgbe_clean_tx_irq() handles the problem: it uses do..while() statement to make sure the budget can be decreased to zero at most and the negative overflow never happens. The patch adds 'likely' because we rarely would not hit the loop condition since the standard budget is 256. Fixes: 8221c5eba8c1 ("ixgbe: add AF_XDP zero-copy Tx support") Signed-off-by: Jason Xing Reviewed-by: Larysa Zaremba Reviewed-by: Paul Menzel Reviewed-by: Aleksandr Loktionov Tested-by: Priya Singh Signed-off-by: Tony Nguyen Link: https://patch.msgid.link/20250819222000.3504873-4-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers/net') diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c index ac58964b2f08..7b941505a9d0 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c @@ -398,7 +398,7 @@ static bool ixgbe_xmit_zc(struct ixgbe_ring *xdp_ring, unsigned int budget) dma_addr_t dma; u32 cmd_type; - while (budget-- > 0) { + while (likely(budget)) { if (unlikely(!ixgbe_desc_unused(xdp_ring))) { work_done = false; break; @@ -433,6 +433,8 @@ static bool ixgbe_xmit_zc(struct ixgbe_ring *xdp_ring, unsigned int budget) xdp_ring->next_to_use++; if (xdp_ring->next_to_use == xdp_ring->count) xdp_ring->next_to_use = 0; + + budget--; } if (tx_desc) { -- cgit v1.2.3 From f3d9f7fa7f5dbfd4fdb1e69c25fc5627700d19dd Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Tue, 19 Aug 2025 15:19:58 -0700 Subject: ixgbe: fix ndo_xdp_xmit() workloads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently ixgbe driver checks periodically in its watchdog subtask if there is anything to be transmitted (considering both Tx and XDP rings) under state of carrier not being 'ok'. Such event is interpreted as Tx hang and therefore results in interface reset. This is currently problematic for ndo_xdp_xmit() as it is allowed to produce descriptors when interface is going through reset or its carrier is turned off. Furthermore, XDP rings should not really be objects of Tx hang detection. This mechanism is rather a matter of ndo_tx_timeout() being called from dev_watchdog against Tx rings exposed to networking stack. Taking into account issues described above, let us have a two fold fix - do not respect XDP rings in local ixgbe watchdog and do not produce Tx descriptors in ndo_xdp_xmit callback when there is some problem with carrier currently. For now, keep the Tx hang checks in clean Tx irq routine, but adjust it to not execute for XDP rings. Cc: Tobias Böhm Reported-by: Marcus Wichelmann Closes: https://lore.kernel.org/netdev/eca1880f-253a-4955-afe6-732d7c6926ee@hetzner-cloud.de/ Fixes: 6453073987ba ("ixgbe: add initial support for xdp redirect") Fixes: 33fdc82f0883 ("ixgbe: add support for XDP_TX action") Reviewed-by: Aleksandr Loktionov Tested-by: Marcus Wichelmann Signed-off-by: Maciej Fijalkowski Signed-off-by: Tony Nguyen Link: https://patch.msgid.link/20250819222000.3504873-5-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 34 +++++++++------------------ 1 file changed, 11 insertions(+), 23 deletions(-) (limited to 'drivers/net') diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 6122a0abb41f..80e6a2ef1350 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -968,10 +968,6 @@ static void ixgbe_update_xoff_rx_lfc(struct ixgbe_adapter *adapter) for (i = 0; i < adapter->num_tx_queues; i++) clear_bit(__IXGBE_HANG_CHECK_ARMED, &adapter->tx_ring[i]->state); - - for (i = 0; i < adapter->num_xdp_queues; i++) - clear_bit(__IXGBE_HANG_CHECK_ARMED, - &adapter->xdp_ring[i]->state); } static void ixgbe_update_xoff_received(struct ixgbe_adapter *adapter) @@ -1214,7 +1210,7 @@ static void ixgbe_pf_handle_tx_hang(struct ixgbe_ring *tx_ring, struct ixgbe_adapter *adapter = netdev_priv(tx_ring->netdev); struct ixgbe_hw *hw = &adapter->hw; - e_err(drv, "Detected Tx Unit Hang%s\n" + e_err(drv, "Detected Tx Unit Hang\n" " Tx Queue <%d>\n" " TDH, TDT <%x>, <%x>\n" " next_to_use <%x>\n" @@ -1222,16 +1218,14 @@ static void ixgbe_pf_handle_tx_hang(struct ixgbe_ring *tx_ring, "tx_buffer_info[next_to_clean]\n" " time_stamp <%lx>\n" " jiffies <%lx>\n", - ring_is_xdp(tx_ring) ? " (XDP)" : "", tx_ring->queue_index, IXGBE_READ_REG(hw, IXGBE_TDH(tx_ring->reg_idx)), IXGBE_READ_REG(hw, IXGBE_TDT(tx_ring->reg_idx)), tx_ring->next_to_use, next, tx_ring->tx_buffer_info[next].time_stamp, jiffies); - if (!ring_is_xdp(tx_ring)) - netif_stop_subqueue(tx_ring->netdev, - tx_ring->queue_index); + netif_stop_subqueue(tx_ring->netdev, + tx_ring->queue_index); } /** @@ -1451,6 +1445,9 @@ static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector *q_vector, total_bytes); adapter->tx_ipsec += total_ipsec; + if (ring_is_xdp(tx_ring)) + return !!budget; + if (check_for_tx_hang(tx_ring) && ixgbe_check_tx_hang(tx_ring)) { if (adapter->hw.mac.type == ixgbe_mac_e610) ixgbe_handle_mdd_event(adapter, tx_ring); @@ -1468,9 +1465,6 @@ static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector *q_vector, return true; } - if (ring_is_xdp(tx_ring)) - return !!budget; - #define TX_WAKE_THRESHOLD (DESC_NEEDED * 2) txq = netdev_get_tx_queue(tx_ring->netdev, tx_ring->queue_index); if (!__netif_txq_completed_wake(txq, total_packets, total_bytes, @@ -7974,12 +7968,9 @@ static void ixgbe_check_hang_subtask(struct ixgbe_adapter *adapter) return; /* Force detection of hung controller */ - if (netif_carrier_ok(adapter->netdev)) { + if (netif_carrier_ok(adapter->netdev)) for (i = 0; i < adapter->num_tx_queues; i++) set_check_for_tx_hang(adapter->tx_ring[i]); - for (i = 0; i < adapter->num_xdp_queues; i++) - set_check_for_tx_hang(adapter->xdp_ring[i]); - } if (!(adapter->flags & IXGBE_FLAG_MSIX_ENABLED)) { /* @@ -8199,13 +8190,6 @@ static bool ixgbe_ring_tx_pending(struct ixgbe_adapter *adapter) return true; } - for (i = 0; i < adapter->num_xdp_queues; i++) { - struct ixgbe_ring *ring = adapter->xdp_ring[i]; - - if (ring->next_to_use != ring->next_to_clean) - return true; - } - return false; } @@ -11005,6 +10989,10 @@ static int ixgbe_xdp_xmit(struct net_device *dev, int n, if (unlikely(test_bit(__IXGBE_DOWN, &adapter->state))) return -ENETDOWN; + if (!netif_carrier_ok(adapter->netdev) || + !netif_running(adapter->netdev)) + return -ENETDOWN; + if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) return -EINVAL; -- cgit v1.2.3 From 1468c1f97cf32418e34dbb40b784ed9333b9e123 Mon Sep 17 00:00:00 2001 From: ValdikSS Date: Tue, 19 Aug 2025 15:19:59 -0700 Subject: igc: fix disabling L1.2 PCI-E link substate on I226 on init Device ID comparison in igc_is_device_id_i226 is performed before the ID is set, resulting in always failing check on init. Before the patch: * L1.2 is not disabled on init * L1.2 is properly disabled after suspend-resume cycle With the patch: * L1.2 is properly disabled both on init and after suspend-resume How to test: Connect to the 1G link with 300+ mbit/s Internet speed, and run the download speed test, such as: curl -o /dev/null http://speedtest.selectel.ru/1GB Without L1.2 disabled, the speed would be no more than ~200 mbit/s. With L1.2 disabled, the speed would reach 1 gbit/s. Note: it's required that the latency between your host and the remote be around 3-5 ms, the test inside LAN (<1 ms latency) won't trigger the issue. Link: https://lore.kernel.org/intel-wired-lan/15248b4f-3271-42dd-8e35-02bfc92b25e1@intel.com Fixes: 0325143b59c6 ("igc: disable L1.2 PCI-E link substate to avoid performance issue") Signed-off-by: ValdikSS Reviewed-by: Vitaly Lifshits Reviewed-by: Paul Menzel Signed-off-by: Tony Nguyen Link: https://patch.msgid.link/20250819222000.3504873-6-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/igc/igc_main.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'drivers/net') diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 458e5eaa92e5..e79b14d50b24 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -7149,6 +7149,13 @@ static int igc_probe(struct pci_dev *pdev, adapter->port_num = hw->bus.func; adapter->msg_enable = netif_msg_init(debug, DEFAULT_MSG_ENABLE); + /* PCI config space info */ + hw->vendor_id = pdev->vendor; + hw->device_id = pdev->device; + hw->revision_id = pdev->revision; + hw->subsystem_vendor_id = pdev->subsystem_vendor; + hw->subsystem_device_id = pdev->subsystem_device; + /* Disable ASPM L1.2 on I226 devices to avoid packet loss */ if (igc_is_device_id_i226(hw)) pci_disable_link_state(pdev, PCIE_LINK_STATE_L1_2); @@ -7175,13 +7182,6 @@ static int igc_probe(struct pci_dev *pdev, netdev->mem_start = pci_resource_start(pdev, 0); netdev->mem_end = pci_resource_end(pdev, 0); - /* PCI config space info */ - hw->vendor_id = pdev->vendor; - hw->device_id = pdev->device; - hw->revision_id = pdev->revision; - hw->subsystem_vendor_id = pdev->subsystem_vendor; - hw->subsystem_device_id = pdev->subsystem_device; - /* Copy the default MAC and PHY function pointers */ memcpy(&hw->mac.ops, ei->mac_ops, sizeof(hw->mac.ops)); memcpy(&hw->phy.ops, ei->phy_ops, sizeof(hw->phy.ops)); -- cgit v1.2.3 From e318cd6714592fb762fcab59c5684a442243a12f Mon Sep 17 00:00:00 2001 From: Tristram Ha Date: Mon, 18 Aug 2025 18:04:57 -0700 Subject: net: dsa: microchip: Fix KSZ9477 HSR port setup issue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ksz9477_hsr_join() is called once to setup the HSR port membership, but the port can be enabled later, or disabled and enabled back and the port membership is not set correctly inside ksz_update_port_member(). The added code always use the correct HSR port membership for HSR port that is enabled. Fixes: 2d61298fdd7b ("net: dsa: microchip: Enable HSR offloading for KSZ9477") Reported-by: Frieder Schrempf Signed-off-by: Tristram Ha Reviewed-by: Łukasz Majewski Tested-by: Frieder Schrempf Reviewed-by: Frieder Schrempf Link: https://patch.msgid.link/20250819010457.563286-1-Tristram.Ha@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/microchip/ksz_common.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'drivers/net') diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c index 4cb14288ff0f..9568cc391fe3 100644 --- a/drivers/net/dsa/microchip/ksz_common.c +++ b/drivers/net/dsa/microchip/ksz_common.c @@ -2457,6 +2457,12 @@ static void ksz_update_port_member(struct ksz_device *dev, int port) dev->dev_ops->cfg_port_member(dev, i, val | cpu_port); } + /* HSR ports are setup once so need to use the assigned membership + * when the port is enabled. + */ + if (!port_member && p->stp_state == BR_STATE_FORWARDING && + (dev->hsr_ports & BIT(port))) + port_member = dev->hsr_ports; dev->dev_ops->cfg_port_member(dev, port, port_member | cpu_port); } -- cgit v1.2.3 From c42be534547d6e45c155c347dd792b6ad9c24def Mon Sep 17 00:00:00 2001 From: Ryan Wanner Date: Tue, 19 Aug 2025 09:32:30 -0700 Subject: Revert "net: cadence: macb: sama7g5_emac: Remove USARIO CLKEN flag" This reverts commit db400061b5e7cc55f9b4dd15443e9838964119ea. This commit can cause a Devicetree ABI break for older DTS files that rely this flag for RMII configuration. Adding this back in ensures that the older DTBs will not break. Fixes: db400061b5e7 ("net: cadence: macb: sama7g5_emac: Remove USARIO CLKEN flag") Signed-off-by: Ryan Wanner Link: https://patch.msgid.link/20250819163236.100680-1-Ryan.Wanner@microchip.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/cadence/macb_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/net') diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index ce95fad8cedd..9693f0289435 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -5113,7 +5113,8 @@ static const struct macb_config sama7g5_gem_config = { static const struct macb_config sama7g5_emac_config = { .caps = MACB_CAPS_USRIO_DEFAULT_IS_MII_GMII | - MACB_CAPS_MIIONRGMII | MACB_CAPS_GEM_HAS_PTP, + MACB_CAPS_USRIO_HAS_CLKEN | MACB_CAPS_MIIONRGMII | + MACB_CAPS_GEM_HAS_PTP, .dma_burst_length = 16, .clk_init = macb_clk_init, .init = macb_init, -- cgit v1.2.3 From b64d035f77b1f02ab449393342264b44950a75ae Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 15 Aug 2025 06:19:58 +0000 Subject: bonding: update LACP activity flag after setting lacp_active The port's actor_oper_port_state activity flag should be updated immediately after changing the lacp_active option to reflect the current mode correctly. Fixes: 3a755cd8b7c6 ("bonding: add new option lacp_active") Signed-off-by: Hangbin Liu Link: https://patch.msgid.link/20250815062000.22220-2-liuhangbin@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/bonding/bond_3ad.c | 25 +++++++++++++++++++++++++ drivers/net/bonding/bond_options.c | 1 + 2 files changed, 26 insertions(+) (limited to 'drivers/net') diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c index 2fca8e84ab10..414fecfd2a0e 100644 --- a/drivers/net/bonding/bond_3ad.c +++ b/drivers/net/bonding/bond_3ad.c @@ -2883,6 +2883,31 @@ void bond_3ad_update_lacp_rate(struct bonding *bond) spin_unlock_bh(&bond->mode_lock); } +/** + * bond_3ad_update_lacp_active - change the lacp active + * @bond: bonding struct + * + * Update actor_oper_port_state when lacp_active is modified. + */ +void bond_3ad_update_lacp_active(struct bonding *bond) +{ + struct port *port = NULL; + struct list_head *iter; + struct slave *slave; + int lacp_active; + + lacp_active = bond->params.lacp_active; + spin_lock_bh(&bond->mode_lock); + bond_for_each_slave(bond, slave, iter) { + port = &(SLAVE_AD_INFO(slave)->port); + if (lacp_active) + port->actor_oper_port_state |= LACP_STATE_LACP_ACTIVITY; + else + port->actor_oper_port_state &= ~LACP_STATE_LACP_ACTIVITY; + } + spin_unlock_bh(&bond->mode_lock); +} + size_t bond_3ad_stats_size(void) { return nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_LACPDU_RX */ diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c index 1d639a3be6ba..3b6f815c55ff 100644 --- a/drivers/net/bonding/bond_options.c +++ b/drivers/net/bonding/bond_options.c @@ -1660,6 +1660,7 @@ static int bond_option_lacp_active_set(struct bonding *bond, netdev_dbg(bond->dev, "Setting LACP active to %s (%llu)\n", newval->string, newval->value); bond->params.lacp_active = newval->value; + bond_3ad_update_lacp_active(bond); return 0; } -- cgit v1.2.3 From 0599640a21e98f0d6a3e9ff85c0a687c90a8103b Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 15 Aug 2025 06:19:59 +0000 Subject: bonding: send LACPDUs periodically in passive mode after receiving partner's LACPDU When `lacp_active` is set to `off`, the bond operates in passive mode, meaning it only "speaks when spoken to." However, the current kernel implementation only sends an LACPDU in response when the partner's state changes. As a result, once LACP negotiation succeeds, the actor stops sending LACPDUs until the partner times out and sends an "expired" LACPDU. This causes continuous LACP state flapping. According to IEEE 802.1AX-2014, 6.4.13 Periodic Transmission machine. The values of Partner_Oper_Port_State.LACP_Activity and Actor_Oper_Port_State.LACP_Activity determine whether periodic transmissions take place. If either or both parameters are set to Active LACP, then periodic transmissions occur; if both are set to Passive LACP, then periodic transmissions do not occur. To comply with this, we remove the `!bond->params.lacp_active` check in `ad_periodic_machine()`. Instead, we initialize the actor's port's `LACP_STATE_LACP_ACTIVITY` state based on `lacp_active` setting. Additionally, we avoid setting the partner's state to `LACP_STATE_LACP_ACTIVITY` in the EXPIRED state, since we should not assume the partner is active by default. This ensures that in passive mode, the bond starts sending periodic LACPDUs after receiving one from the partner, and avoids flapping due to inactivity. Fixes: 3a755cd8b7c6 ("bonding: add new option lacp_active") Signed-off-by: Hangbin Liu Link: https://patch.msgid.link/20250815062000.22220-3-liuhangbin@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/bonding/bond_3ad.c | 42 ++++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 18 deletions(-) (limited to 'drivers/net') diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c index 414fecfd2a0e..4edc8e6b6b64 100644 --- a/drivers/net/bonding/bond_3ad.c +++ b/drivers/net/bonding/bond_3ad.c @@ -95,13 +95,13 @@ static int ad_marker_send(struct port *port, struct bond_marker *marker); static void ad_mux_machine(struct port *port, bool *update_slave_arr); static void ad_rx_machine(struct lacpdu *lacpdu, struct port *port); static void ad_tx_machine(struct port *port); -static void ad_periodic_machine(struct port *port, struct bond_params *bond_params); +static void ad_periodic_machine(struct port *port); static void ad_port_selection_logic(struct port *port, bool *update_slave_arr); static void ad_agg_selection_logic(struct aggregator *aggregator, bool *update_slave_arr); static void ad_clear_agg(struct aggregator *aggregator); static void ad_initialize_agg(struct aggregator *aggregator); -static void ad_initialize_port(struct port *port, int lacp_fast); +static void ad_initialize_port(struct port *port, const struct bond_params *bond_params); static void ad_enable_collecting(struct port *port); static void ad_disable_distributing(struct port *port, bool *update_slave_arr); @@ -1307,10 +1307,16 @@ static void ad_rx_machine(struct lacpdu *lacpdu, struct port *port) * case of EXPIRED even if LINK_DOWN didn't arrive for * the port. */ - port->partner_oper.port_state &= ~LACP_STATE_SYNCHRONIZATION; port->sm_vars &= ~AD_PORT_MATCHED; + /* Based on IEEE 8021AX-2014, Figure 6-18 - Receive + * machine state diagram, the statue should be + * Partner_Oper_Port_State.Synchronization = FALSE; + * Partner_Oper_Port_State.LACP_Timeout = Short Timeout; + * start current_while_timer(Short Timeout); + * Actor_Oper_Port_State.Expired = TRUE; + */ + port->partner_oper.port_state &= ~LACP_STATE_SYNCHRONIZATION; port->partner_oper.port_state |= LACP_STATE_LACP_TIMEOUT; - port->partner_oper.port_state |= LACP_STATE_LACP_ACTIVITY; port->sm_rx_timer_counter = __ad_timer_to_ticks(AD_CURRENT_WHILE_TIMER, (u16)(AD_SHORT_TIMEOUT)); port->actor_oper_port_state |= LACP_STATE_EXPIRED; port->sm_vars |= AD_PORT_CHURNED; @@ -1417,11 +1423,10 @@ static void ad_tx_machine(struct port *port) /** * ad_periodic_machine - handle a port's periodic state machine * @port: the port we're looking at - * @bond_params: bond parameters we will use * * Turn ntt flag on priodically to perform periodic transmission of lacpdu's. */ -static void ad_periodic_machine(struct port *port, struct bond_params *bond_params) +static void ad_periodic_machine(struct port *port) { periodic_states_t last_state; @@ -1430,8 +1435,7 @@ static void ad_periodic_machine(struct port *port, struct bond_params *bond_para /* check if port was reinitialized */ if (((port->sm_vars & AD_PORT_BEGIN) || !(port->sm_vars & AD_PORT_LACP_ENABLED) || !port->is_enabled) || - (!(port->actor_oper_port_state & LACP_STATE_LACP_ACTIVITY) && !(port->partner_oper.port_state & LACP_STATE_LACP_ACTIVITY)) || - !bond_params->lacp_active) { + (!(port->actor_oper_port_state & LACP_STATE_LACP_ACTIVITY) && !(port->partner_oper.port_state & LACP_STATE_LACP_ACTIVITY))) { port->sm_periodic_state = AD_NO_PERIODIC; } /* check if state machine should change state */ @@ -1955,16 +1959,16 @@ static void ad_initialize_agg(struct aggregator *aggregator) /** * ad_initialize_port - initialize a given port's parameters * @port: the port we're looking at - * @lacp_fast: boolean. whether fast periodic should be used + * @bond_params: bond parameters we will use */ -static void ad_initialize_port(struct port *port, int lacp_fast) +static void ad_initialize_port(struct port *port, const struct bond_params *bond_params) { static const struct port_params tmpl = { .system_priority = 0xffff, .key = 1, .port_number = 1, .port_priority = 0xff, - .port_state = 1, + .port_state = 0, }; static const struct lacpdu lacpdu = { .subtype = 0x01, @@ -1982,12 +1986,14 @@ static void ad_initialize_port(struct port *port, int lacp_fast) port->actor_port_priority = 0xff; port->actor_port_aggregator_identifier = 0; port->ntt = false; - port->actor_admin_port_state = LACP_STATE_AGGREGATION | - LACP_STATE_LACP_ACTIVITY; - port->actor_oper_port_state = LACP_STATE_AGGREGATION | - LACP_STATE_LACP_ACTIVITY; + port->actor_admin_port_state = LACP_STATE_AGGREGATION; + port->actor_oper_port_state = LACP_STATE_AGGREGATION; + if (bond_params->lacp_active) { + port->actor_admin_port_state |= LACP_STATE_LACP_ACTIVITY; + port->actor_oper_port_state |= LACP_STATE_LACP_ACTIVITY; + } - if (lacp_fast) + if (bond_params->lacp_fast) port->actor_oper_port_state |= LACP_STATE_LACP_TIMEOUT; memcpy(&port->partner_admin, &tmpl, sizeof(tmpl)); @@ -2201,7 +2207,7 @@ void bond_3ad_bind_slave(struct slave *slave) /* port initialization */ port = &(SLAVE_AD_INFO(slave)->port); - ad_initialize_port(port, bond->params.lacp_fast); + ad_initialize_port(port, &bond->params); port->slave = slave; port->actor_port_number = SLAVE_AD_INFO(slave)->id; @@ -2513,7 +2519,7 @@ void bond_3ad_state_machine_handler(struct work_struct *work) } ad_rx_machine(NULL, port); - ad_periodic_machine(port, &bond->params); + ad_periodic_machine(port); ad_port_selection_logic(port, &update_slave_arr); ad_mux_machine(port, &update_slave_arr); ad_tx_machine(port); -- cgit v1.2.3 From 9f6b606b6b37e61427412708411e8e04b1a858e8 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Mon, 18 Aug 2025 11:58:25 +0200 Subject: net: airoha: ppe: Do not invalid PPE entries in case of SW hash collision SW hash computed by airoha_ppe_foe_get_entry_hash routine (used for foe_flow hlist) can theoretically produce collisions between two different HW PPE entries. In airoha_ppe_foe_insert_entry() if the collision occurs we will mark the second PPE entry in the list as stale (setting the hw hash to 0xffff). Stale entries are no more updated in airoha_ppe_foe_flow_entry_update routine and so they are removed by Netfilter. Fix the problem not marking the second entry as stale in airoha_ppe_foe_insert_entry routine if we have already inserted the brand new entry in the PPE table and let Netfilter remove real stale entries according to their timestamp. Please note this is just a theoretical issue spotted reviewing the code and not faced running the system. Fixes: cd53f622611f9 ("net: airoha: Add L2 hw acceleration support") Signed-off-by: Lorenzo Bianconi Link: https://patch.msgid.link/20250818-airoha-en7581-hash-collision-fix-v1-1-d190c4b53d1c@kernel.org Signed-off-by: Paolo Abeni --- drivers/net/ethernet/airoha/airoha_ppe.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'drivers/net') diff --git a/drivers/net/ethernet/airoha/airoha_ppe.c b/drivers/net/ethernet/airoha/airoha_ppe.c index 47411d2cbd28..88694b08afa1 100644 --- a/drivers/net/ethernet/airoha/airoha_ppe.c +++ b/drivers/net/ethernet/airoha/airoha_ppe.c @@ -736,10 +736,8 @@ static void airoha_ppe_foe_insert_entry(struct airoha_ppe *ppe, continue; } - if (commit_done || !airoha_ppe_foe_compare_entry(e, hwe)) { - e->hash = 0xffff; + if (!airoha_ppe_foe_compare_entry(e, hwe)) continue; - } airoha_ppe_foe_commit_entry(ppe, &e->data, hash); commit_done = true; -- cgit v1.2.3 From 8c5d95988c34f0aeba1f34cd5e4ba69494c90c5f Mon Sep 17 00:00:00 2001 From: Hariprasad Kelam Date: Wed, 20 Aug 2025 12:09:18 +0530 Subject: Octeontx2-af: Skip overlap check for SPI field Octeontx2/CN10K silicon supports generating a 256-bit key per packet. The specific fields to be extracted from a packet for key generation are configurable via a Key Extraction (MKEX) Profile. The AF driver scans the configured extraction profile to ensure that fields from upper layers do not overwrite fields from lower layers in the key. Example Packet Field Layout: LA: DMAC + SMAC LB: VLAN LC: IPv4/IPv6 LD: TCP/UDP Valid MKEX Profile Configuration: LA -> DMAC -> key_offset[0-5] LC -> SIP -> key_offset[20-23] LD -> SPORT -> key_offset[30-31] Invalid MKEX profile configuration: LA -> DMAC -> key_offset[0-5] LC -> SIP -> key_offset[20-23] LD -> SPORT -> key_offset[2-3] // Overlaps with DMAC field In another scenario, if the MKEX profile is configured to extract the SPI field from both AH and ESP headers at the same key offset, the driver rejecting this configuration. In a regular traffic, ipsec packet will be having either AH(LD) or ESP (LE). This patch relaxes the check for the same. Fixes: 12aa0a3b93f3 ("octeontx2-af: Harden rule validation.") Signed-off-by: Hariprasad Kelam Link: https://patch.msgid.link/20250820063919.1463518-1-hkelam@marvell.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/net') diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c index 1b765045aa63..b56395ac5a74 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c @@ -606,8 +606,8 @@ static void npc_set_features(struct rvu *rvu, int blkaddr, u8 intf) if (!npc_check_field(rvu, blkaddr, NPC_LB, intf)) *features &= ~BIT_ULL(NPC_OUTER_VID); - /* Set SPI flag only if AH/ESP and IPSEC_SPI are in the key */ - if (npc_check_field(rvu, blkaddr, NPC_IPSEC_SPI, intf) && + /* Allow extracting SPI field from AH and ESP headers at same offset */ + if (npc_is_field_present(rvu, NPC_IPSEC_SPI, intf) && (*features & (BIT_ULL(NPC_IPPROTO_ESP) | BIT_ULL(NPC_IPPROTO_AH)))) *features |= BIT_ULL(NPC_IPSEC_SPI); -- cgit v1.2.3 From 1c67f9c54cdc70627e3f6472b89cd3d895df974c Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Wed, 20 Aug 2025 15:27:07 +0200 Subject: net: pse-pd: pd692x0: Fix power budget leak in manager setup error path Fix a resource leak where manager power budgets were freed on both success and error paths during manager setup. Power budgets should only be freed on error paths after regulator registration or during driver removal. Refactor cleanup logic by extracting OF node cleanup and power budget freeing into separate helper functions for better maintainability. Fixes: 359754013e6a ("net: pse-pd: pd692x0: Add support for PSE PI priority feature") Signed-off-by: Kory Maincent Link: https://patch.msgid.link/20250820132708.837255-1-kory.maincent@bootlin.com Signed-off-by: Jakub Kicinski --- drivers/net/pse-pd/pd692x0.c | 59 +++++++++++++++++++++++++++++++++----------- 1 file changed, 44 insertions(+), 15 deletions(-) (limited to 'drivers/net') diff --git a/drivers/net/pse-pd/pd692x0.c b/drivers/net/pse-pd/pd692x0.c index 399ce9febda4..395f6c662175 100644 --- a/drivers/net/pse-pd/pd692x0.c +++ b/drivers/net/pse-pd/pd692x0.c @@ -1162,12 +1162,44 @@ pd692x0_write_ports_matrix(struct pd692x0_priv *priv, return 0; } +static void pd692x0_of_put_managers(struct pd692x0_priv *priv, + struct pd692x0_manager *manager, + int nmanagers) +{ + int i, j; + + for (i = 0; i < nmanagers; i++) { + for (j = 0; j < manager[i].nports; j++) + of_node_put(manager[i].port_node[j]); + of_node_put(manager[i].node); + } +} + +static void pd692x0_managers_free_pw_budget(struct pd692x0_priv *priv) +{ + int i; + + for (i = 0; i < PD692X0_MAX_MANAGERS; i++) { + struct regulator *supply; + + if (!priv->manager_reg[i] || !priv->manager_pw_budget[i]) + continue; + + supply = priv->manager_reg[i]->supply; + if (!supply) + continue; + + regulator_free_power_budget(supply, + priv->manager_pw_budget[i]); + } +} + static int pd692x0_setup_pi_matrix(struct pse_controller_dev *pcdev) { struct pd692x0_manager *manager __free(kfree) = NULL; struct pd692x0_priv *priv = to_pd692x0_priv(pcdev); struct pd692x0_matrix port_matrix[PD692X0_MAX_PIS]; - int ret, i, j, nmanagers; + int ret, nmanagers; /* Should we flash the port matrix */ if (priv->fw_state != PD692X0_FW_OK && @@ -1185,31 +1217,27 @@ static int pd692x0_setup_pi_matrix(struct pse_controller_dev *pcdev) nmanagers = ret; ret = pd692x0_register_managers_regulator(priv, manager, nmanagers); if (ret) - goto out; + goto err_of_managers; ret = pd692x0_configure_managers(priv, nmanagers); if (ret) - goto out; + goto err_of_managers; ret = pd692x0_set_ports_matrix(priv, manager, nmanagers, port_matrix); if (ret) - goto out; + goto err_managers_req_pw; ret = pd692x0_write_ports_matrix(priv, port_matrix); if (ret) - goto out; + goto err_managers_req_pw; -out: - for (i = 0; i < nmanagers; i++) { - struct regulator *supply = priv->manager_reg[i]->supply; - - regulator_free_power_budget(supply, - priv->manager_pw_budget[i]); + pd692x0_of_put_managers(priv, manager, nmanagers); + return 0; - for (j = 0; j < manager[i].nports; j++) - of_node_put(manager[i].port_node[j]); - of_node_put(manager[i].node); - } +err_managers_req_pw: + pd692x0_managers_free_pw_budget(priv); +err_of_managers: + pd692x0_of_put_managers(priv, manager, nmanagers); return ret; } @@ -1748,6 +1776,7 @@ static void pd692x0_i2c_remove(struct i2c_client *client) { struct pd692x0_priv *priv = i2c_get_clientdata(client); + pd692x0_managers_free_pw_budget(priv); firmware_upload_unregister(priv->fwl); } -- cgit v1.2.3 From 7ef353879f714602b43f98662069f4fb86536761 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Wed, 20 Aug 2025 15:33:21 +0200 Subject: net: pse-pd: pd692x0: Skip power budget configuration when undefined If the power supply's power budget is not defined in the device tree, the current code still requests power and configures the PSE manager with a 0W power limit, which is undesirable behavior. Skip power budget configuration entirely when the budget is zero, avoiding unnecessary power requests and preventing invalid 0W limits from being set on the PSE manager. Fixes: 359754013e6a ("net: pse-pd: pd692x0: Add support for PSE PI priority feature") Signed-off-by: Kory Maincent Acked-by: Oleksij Rempel Link: https://patch.msgid.link/20250820133321.841054-1-kory.maincent@bootlin.com Signed-off-by: Jakub Kicinski --- drivers/net/pse-pd/pd692x0.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/net') diff --git a/drivers/net/pse-pd/pd692x0.c b/drivers/net/pse-pd/pd692x0.c index 395f6c662175..f4e91ba64a66 100644 --- a/drivers/net/pse-pd/pd692x0.c +++ b/drivers/net/pse-pd/pd692x0.c @@ -1041,6 +1041,10 @@ pd692x0_configure_managers(struct pd692x0_priv *priv, int nmanagers) int pw_budget; pw_budget = regulator_get_unclaimed_power_budget(supply); + if (!pw_budget) + /* Do nothing if no power budget */ + continue; + /* Max power budget per manager */ if (pw_budget > 6000000) pw_budget = 6000000; -- cgit v1.2.3 From bc17455bc843b2f4b206e0bb8139013eb3d3c08b Mon Sep 17 00:00:00 2001 From: Daniel Jurgens Date: Wed, 20 Aug 2025 16:32:02 +0300 Subject: net/mlx5: Base ECVF devlink port attrs from 0 Adjust the vport number by the base ECVF vport number so the port attributes start at 0. Previously the port attributes would start 1 after the maximum number of host VFs. Fixes: dc13180824b7 ("net/mlx5: Enable devlink port for embedded cpu VF vports") Signed-off-by: Daniel Jurgens Reviewed-by: Parav Pandit Reviewed-by: Saeed Mahameed Signed-off-by: Tariq Toukan Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250820133209.389065-2-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers/net') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c index b7102e14d23d..c33accadae0f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c @@ -47,10 +47,12 @@ static void mlx5_esw_offloads_pf_vf_devlink_port_attrs_set(struct mlx5_eswitch * devlink_port_attrs_pci_vf_set(dl_port, controller_num, pfnum, vport_num - 1, external); } else if (mlx5_core_is_ec_vf_vport(esw->dev, vport_num)) { + u16 base_vport = mlx5_core_ec_vf_vport_base(dev); + memcpy(dl_port->attrs.switch_id.id, ppid.id, ppid.id_len); dl_port->attrs.switch_id.id_len = ppid.id_len; devlink_port_attrs_pci_vf_set(dl_port, 0, pfnum, - vport_num - 1, false); + vport_num - base_vport, false); } } -- cgit v1.2.3 From 330f0f6713a39581936decac72331e6ab7f13529 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Wed, 20 Aug 2025 16:32:03 +0300 Subject: net/mlx5: Remove default QoS group and attach vports directly to root TSAR Currently, the driver creates a default group (`node0`) and attaches all vports to it unless the user explicitly sets a parent group. As a result, when a user configures tx_share on a group and tx_share on a VF, the expectation is for the group and the VF to share bandwidth relatively. However, since the VF is not connected to the same parent (but to the default node), the proportional share logic is not applied correctly. To fix this, remove the default group (`node0`) and instead connect vports directly to the root TSAR when no parent is specified. This ensures that vports and groups share the same root scheduler and their tx_share values are compared directly under the same hierarchy. Fixes: 0fe132eac38c ("net/mlx5: E-switch, Allow to add vports to rate groups") Signed-off-by: Carolina Jubran Reviewed-by: Cosmin Ratiu Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250820133209.389065-3-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c | 97 ++++++++--------------- drivers/net/ethernet/mellanox/mlx5/core/eswitch.h | 5 -- 2 files changed, 33 insertions(+), 69 deletions(-) (limited to 'drivers/net') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c index 91d863c8c152..cd58d3934596 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c @@ -462,6 +462,7 @@ static int esw_qos_vport_create_sched_element(struct mlx5_esw_sched_node *vport_node, struct netlink_ext_ack *extack) { + struct mlx5_esw_sched_node *parent = vport_node->parent; u32 sched_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {}; struct mlx5_core_dev *dev = vport_node->esw->dev; void *attr; @@ -477,7 +478,7 @@ esw_qos_vport_create_sched_element(struct mlx5_esw_sched_node *vport_node, attr = MLX5_ADDR_OF(scheduling_context, sched_ctx, element_attributes); MLX5_SET(vport_element, attr, vport_number, vport_node->vport->vport); MLX5_SET(scheduling_context, sched_ctx, parent_element_id, - vport_node->parent->ix); + parent ? parent->ix : vport_node->esw->qos.root_tsar_ix); MLX5_SET(scheduling_context, sched_ctx, max_average_bw, vport_node->max_rate); @@ -786,48 +787,15 @@ static int esw_qos_create(struct mlx5_eswitch *esw, struct netlink_ext_ack *exta return err; } - if (MLX5_CAP_QOS(dev, log_esw_max_sched_depth)) { - esw->qos.node0 = __esw_qos_create_vports_sched_node(esw, NULL, extack); - } else { - /* The eswitch doesn't support scheduling nodes. - * Create a software-only node0 using the root TSAR to attach vport QoS to. - */ - if (!__esw_qos_alloc_node(esw, - esw->qos.root_tsar_ix, - SCHED_NODE_TYPE_VPORTS_TSAR, - NULL)) - esw->qos.node0 = ERR_PTR(-ENOMEM); - else - list_add_tail(&esw->qos.node0->entry, - &esw->qos.domain->nodes); - } - if (IS_ERR(esw->qos.node0)) { - err = PTR_ERR(esw->qos.node0); - esw_warn(dev, "E-Switch create rate node 0 failed (%d)\n", err); - goto err_node0; - } refcount_set(&esw->qos.refcnt, 1); return 0; - -err_node0: - if (mlx5_destroy_scheduling_element_cmd(esw->dev, SCHEDULING_HIERARCHY_E_SWITCH, - esw->qos.root_tsar_ix)) - esw_warn(esw->dev, "E-Switch destroy root TSAR failed.\n"); - - return err; } static void esw_qos_destroy(struct mlx5_eswitch *esw) { int err; - if (esw->qos.node0->ix != esw->qos.root_tsar_ix) - __esw_qos_destroy_node(esw->qos.node0, NULL); - else - __esw_qos_free_node(esw->qos.node0); - esw->qos.node0 = NULL; - err = mlx5_destroy_scheduling_element_cmd(esw->dev, SCHEDULING_HIERARCHY_E_SWITCH, esw->qos.root_tsar_ix); @@ -990,13 +958,16 @@ esw_qos_vport_tc_enable(struct mlx5_vport *vport, enum sched_node_type type, struct netlink_ext_ack *extack) { struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node; - int err, new_level, max_level; + struct mlx5_esw_sched_node *parent = vport_node->parent; + int err; if (type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) { + int new_level, max_level; + /* Increase the parent's level by 2 to account for both the * TC arbiter and the vports TC scheduling element. */ - new_level = vport_node->parent->level + 2; + new_level = (parent ? parent->level : 2) + 2; max_level = 1 << MLX5_CAP_QOS(vport_node->esw->dev, log_esw_max_sched_depth); if (new_level > max_level) { @@ -1033,9 +1004,7 @@ esw_qos_vport_tc_enable(struct mlx5_vport *vport, enum sched_node_type type, err_sched_nodes: if (type == SCHED_NODE_TYPE_RATE_LIMITER) { esw_qos_node_destroy_sched_element(vport_node, NULL); - list_add_tail(&vport_node->entry, - &vport_node->parent->children); - vport_node->level = vport_node->parent->level + 1; + esw_qos_node_attach_to_parent(vport_node); } else { esw_qos_tc_arbiter_scheduling_teardown(vport_node, NULL); } @@ -1083,7 +1052,6 @@ err_out: static void esw_qos_vport_disable(struct mlx5_vport *vport, struct netlink_ext_ack *extack) { struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node; - struct mlx5_esw_sched_node *parent = vport_node->parent; enum sched_node_type curr_type = vport_node->type; if (curr_type == SCHED_NODE_TYPE_VPORT) @@ -1093,7 +1061,7 @@ static void esw_qos_vport_disable(struct mlx5_vport *vport, struct netlink_ext_a vport_node->bw_share = 0; list_del_init(&vport_node->entry); - esw_qos_normalize_min_rate(parent->esw, parent, extack); + esw_qos_normalize_min_rate(vport_node->esw, vport_node->parent, extack); trace_mlx5_esw_vport_qos_destroy(vport_node->esw->dev, vport); } @@ -1103,25 +1071,23 @@ static int esw_qos_vport_enable(struct mlx5_vport *vport, struct mlx5_esw_sched_node *parent, struct netlink_ext_ack *extack) { + struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node; int err; esw_assert_qos_lock_held(vport->dev->priv.eswitch); - esw_qos_node_set_parent(vport->qos.sched_node, parent); - if (type == SCHED_NODE_TYPE_VPORT) { - err = esw_qos_vport_create_sched_element(vport->qos.sched_node, - extack); - } else { + esw_qos_node_set_parent(vport_node, parent); + if (type == SCHED_NODE_TYPE_VPORT) + err = esw_qos_vport_create_sched_element(vport_node, extack); + else err = esw_qos_vport_tc_enable(vport, type, extack); - } if (err) return err; - vport->qos.sched_node->type = type; - esw_qos_normalize_min_rate(parent->esw, parent, extack); - trace_mlx5_esw_vport_qos_create(vport->dev, vport, - vport->qos.sched_node->max_rate, - vport->qos.sched_node->bw_share); + vport_node->type = type; + esw_qos_normalize_min_rate(vport_node->esw, parent, extack); + trace_mlx5_esw_vport_qos_create(vport->dev, vport, vport_node->max_rate, + vport_node->bw_share); return 0; } @@ -1132,6 +1098,7 @@ static int mlx5_esw_qos_vport_enable(struct mlx5_vport *vport, enum sched_node_t { struct mlx5_eswitch *esw = vport->dev->priv.eswitch; struct mlx5_esw_sched_node *sched_node; + struct mlx5_eswitch *parent_esw; int err; esw_assert_qos_lock_held(esw); @@ -1139,10 +1106,12 @@ static int mlx5_esw_qos_vport_enable(struct mlx5_vport *vport, enum sched_node_t if (err) return err; - parent = parent ?: esw->qos.node0; - sched_node = __esw_qos_alloc_node(parent->esw, 0, type, parent); + parent_esw = parent ? parent->esw : esw; + sched_node = __esw_qos_alloc_node(parent_esw, 0, type, parent); if (!sched_node) return -ENOMEM; + if (!parent) + list_add_tail(&sched_node->entry, &esw->qos.domain->nodes); sched_node->max_rate = max_rate; sched_node->min_rate = min_rate; @@ -1168,7 +1137,7 @@ void mlx5_esw_qos_vport_disable(struct mlx5_vport *vport) goto unlock; parent = vport->qos.sched_node->parent; - WARN(parent != esw->qos.node0, "Disabling QoS on port before detaching it from node"); + WARN(parent, "Disabling QoS on port before detaching it from node"); esw_qos_vport_disable(vport, NULL); mlx5_esw_qos_vport_qos_free(vport); @@ -1268,7 +1237,6 @@ static int esw_qos_vport_update(struct mlx5_vport *vport, int err; esw_assert_qos_lock_held(vport->dev->priv.eswitch); - parent = parent ?: curr_parent; if (curr_type == type && curr_parent == parent) return 0; @@ -1306,16 +1274,16 @@ static int esw_qos_vport_update_parent(struct mlx5_vport *vport, struct mlx5_esw esw_assert_qos_lock_held(esw); curr_parent = vport->qos.sched_node->parent; - parent = parent ?: esw->qos.node0; if (curr_parent == parent) return 0; /* Set vport QoS type based on parent node type if different from * default QoS; otherwise, use the vport's current QoS type. */ - if (parent->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) + if (parent && parent->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) type = SCHED_NODE_TYPE_RATE_LIMITER; - else if (curr_parent->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) + else if (curr_parent && + curr_parent->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) type = SCHED_NODE_TYPE_VPORT; else type = vport->qos.sched_node->type; @@ -1654,9 +1622,10 @@ static bool esw_qos_validate_unsupported_tc_bw(struct mlx5_eswitch *esw, static bool esw_qos_vport_validate_unsupported_tc_bw(struct mlx5_vport *vport, u32 *tc_bw) { - struct mlx5_eswitch *esw = vport->qos.sched_node ? - vport->qos.sched_node->parent->esw : - vport->dev->priv.eswitch; + struct mlx5_esw_sched_node *node = vport->qos.sched_node; + struct mlx5_eswitch *esw = vport->dev->priv.eswitch; + + esw = (node && node->parent) ? node->parent->esw : esw; return esw_qos_validate_unsupported_tc_bw(esw, tc_bw); } @@ -1763,7 +1732,7 @@ int mlx5_esw_devlink_rate_leaf_tc_bw_set(struct devlink_rate *rate_leaf, if (disable) { if (vport_node->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) err = esw_qos_vport_update(vport, SCHED_NODE_TYPE_VPORT, - NULL, extack); + vport_node->parent, extack); goto unlock; } @@ -1775,7 +1744,7 @@ int mlx5_esw_devlink_rate_leaf_tc_bw_set(struct devlink_rate *rate_leaf, } else { err = esw_qos_vport_update(vport, SCHED_NODE_TYPE_TC_ARBITER_TSAR, - NULL, extack); + vport_node->parent, extack); } if (!err) esw_qos_set_tc_arbiter_bw_shares(vport_node, tc_bw, extack); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h index b0b8ef3ec3c4..45506ad56847 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h @@ -373,11 +373,6 @@ struct mlx5_eswitch { refcount_t refcnt; u32 root_tsar_ix; struct mlx5_qos_domain *domain; - /* Contains all vports with QoS enabled but no explicit node. - * Cannot be NULL if QoS is enabled, but may be a fake node - * referencing the root TSAR if the esw doesn't support nodes. - */ - struct mlx5_esw_sched_node *node0; } qos; struct mlx5_esw_bridge_offloads *br_offloads; -- cgit v1.2.3 From e8f973576ca5387ffd2917b8ae661d3f9acde526 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Wed, 20 Aug 2025 16:32:04 +0300 Subject: net/mlx5e: Preserve tc-bw during parent changes When changing parent of a node/leaf with tc-bw configured, the code saves and restores tc-bw values. However, it was reading the converted hardware bw_share values (where 0 becomes 1) instead of the original user values, causing incorrect tc-bw calculations after parent change. Store original tc-bw values in the node structure and use them directly for save/restore operations. Fixes: cf7e73770d1b ("net/mlx5: Manage TC arbiter nodes and implement full support for tc-bw") Signed-off-by: Carolina Jubran Reviewed-by: Cosmin Ratiu Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250820133209.389065-4-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c | 24 +++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'drivers/net') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c index cd58d3934596..4ed5968f1638 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c @@ -102,6 +102,8 @@ struct mlx5_esw_sched_node { u8 level; /* Valid only when this node represents a traffic class. */ u8 tc; + /* Valid only for a TC arbiter node or vport TC arbiter. */ + u32 tc_bw[DEVLINK_RATE_TCS_MAX]; }; static void esw_qos_node_attach_to_parent(struct mlx5_esw_sched_node *node) @@ -609,10 +611,7 @@ static void esw_qos_tc_arbiter_get_bw_shares(struct mlx5_esw_sched_node *tc_arbiter_node, u32 *tc_bw) { - struct mlx5_esw_sched_node *vports_tc_node; - - list_for_each_entry(vports_tc_node, &tc_arbiter_node->children, entry) - tc_bw[vports_tc_node->tc] = vports_tc_node->bw_share; + memcpy(tc_bw, tc_arbiter_node->tc_bw, sizeof(tc_arbiter_node->tc_bw)); } static void @@ -629,6 +628,7 @@ esw_qos_set_tc_arbiter_bw_shares(struct mlx5_esw_sched_node *tc_arbiter_node, u8 tc = vports_tc_node->tc; u32 bw_share; + tc_arbiter_node->tc_bw[tc] = tc_bw[tc]; bw_share = tc_bw[tc] * fw_max_bw_share; bw_share = esw_qos_calc_bw_share(bw_share, divider, fw_max_bw_share); @@ -1060,6 +1060,7 @@ static void esw_qos_vport_disable(struct mlx5_vport *vport, struct netlink_ext_a esw_qos_vport_tc_disable(vport, extack); vport_node->bw_share = 0; + memset(vport_node->tc_bw, 0, sizeof(vport_node->tc_bw)); list_del_init(&vport_node->entry); esw_qos_normalize_min_rate(vport_node->esw, vport_node->parent, extack); @@ -1231,8 +1232,9 @@ static int esw_qos_vport_update(struct mlx5_vport *vport, struct mlx5_esw_sched_node *parent, struct netlink_ext_ack *extack) { - struct mlx5_esw_sched_node *curr_parent = vport->qos.sched_node->parent; - enum sched_node_type curr_type = vport->qos.sched_node->type; + struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node; + struct mlx5_esw_sched_node *curr_parent = vport_node->parent; + enum sched_node_type curr_type = vport_node->type; u32 curr_tc_bw[DEVLINK_RATE_TCS_MAX] = {0}; int err; @@ -1244,10 +1246,8 @@ static int esw_qos_vport_update(struct mlx5_vport *vport, if (err) return err; - if (curr_type == SCHED_NODE_TYPE_TC_ARBITER_TSAR && curr_type == type) { - esw_qos_tc_arbiter_get_bw_shares(vport->qos.sched_node, - curr_tc_bw); - } + if (curr_type == SCHED_NODE_TYPE_TC_ARBITER_TSAR && curr_type == type) + esw_qos_tc_arbiter_get_bw_shares(vport_node, curr_tc_bw); esw_qos_vport_disable(vport, extack); @@ -1258,8 +1258,8 @@ static int esw_qos_vport_update(struct mlx5_vport *vport, } if (curr_type == SCHED_NODE_TYPE_TC_ARBITER_TSAR && curr_type == type) { - esw_qos_set_tc_arbiter_bw_shares(vport->qos.sched_node, - curr_tc_bw, extack); + esw_qos_set_tc_arbiter_bw_shares(vport_node, curr_tc_bw, + extack); } return err; -- cgit v1.2.3 From b697ef4d1d136948d282384e6cc3d1af469ea123 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Wed, 20 Aug 2025 16:32:05 +0300 Subject: net/mlx5: Destroy vport QoS element when no configuration remains If a VF has been configured and the user later clears all QoS settings, the vport element remains in the firmware QoS tree. This leads to inconsistent behavior compared to VFs that were never configured, since the FW assumes that unconfigured VFs are outside the QoS hierarchy. As a result, the bandwidth share across VFs may differ, even though none of them appear to have any configuration. Align the driver behavior with the FW expectation by destroying the vport QoS element when all configurations are removed. Fixes: c9497c98901c ("net/mlx5: Add support for setting VF min rate") Fixes: cf7e73770d1b ("net/mlx5: Manage TC arbiter nodes and implement full support for tc-bw") Signed-off-by: Carolina Jubran Reviewed-by: Cosmin Ratiu Signed-off-by: Mark Bloch Reviewed-by: Przemek Kitszel Link: https://patch.msgid.link/20250820133209.389065-5-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c | 57 +++++++++++++++++++---- 1 file changed, 49 insertions(+), 8 deletions(-) (limited to 'drivers/net') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c index 4ed5968f1638..452a948a3e6d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c @@ -1127,6 +1127,19 @@ static int mlx5_esw_qos_vport_enable(struct mlx5_vport *vport, enum sched_node_t return err; } +static void mlx5_esw_qos_vport_disable_locked(struct mlx5_vport *vport) +{ + struct mlx5_eswitch *esw = vport->dev->priv.eswitch; + + esw_assert_qos_lock_held(esw); + if (!vport->qos.sched_node) + return; + + esw_qos_vport_disable(vport, NULL); + mlx5_esw_qos_vport_qos_free(vport); + esw_qos_put(esw); +} + void mlx5_esw_qos_vport_disable(struct mlx5_vport *vport) { struct mlx5_eswitch *esw = vport->dev->priv.eswitch; @@ -1140,9 +1153,7 @@ void mlx5_esw_qos_vport_disable(struct mlx5_vport *vport) parent = vport->qos.sched_node->parent; WARN(parent, "Disabling QoS on port before detaching it from node"); - esw_qos_vport_disable(vport, NULL); - mlx5_esw_qos_vport_qos_free(vport); - esw_qos_put(esw); + mlx5_esw_qos_vport_disable_locked(vport); unlock: esw_qos_unlock(esw); } @@ -1642,6 +1653,21 @@ static bool esw_qos_tc_bw_disabled(u32 *tc_bw) return true; } +static void esw_vport_qos_prune_empty(struct mlx5_vport *vport) +{ + struct mlx5_esw_sched_node *vport_node = vport->qos.sched_node; + + esw_assert_qos_lock_held(vport->dev->priv.eswitch); + if (!vport_node) + return; + + if (vport_node->parent || vport_node->max_rate || + vport_node->min_rate || !esw_qos_tc_bw_disabled(vport_node->tc_bw)) + return; + + mlx5_esw_qos_vport_disable_locked(vport); +} + int mlx5_esw_qos_init(struct mlx5_eswitch *esw) { if (esw->qos.domain) @@ -1675,6 +1701,10 @@ int mlx5_esw_devlink_rate_leaf_tx_share_set(struct devlink_rate *rate_leaf, void esw_qos_lock(esw); err = mlx5_esw_qos_set_vport_min_rate(vport, tx_share, extack); + if (err) + goto out; + esw_vport_qos_prune_empty(vport); +out: esw_qos_unlock(esw); return err; } @@ -1696,6 +1726,10 @@ int mlx5_esw_devlink_rate_leaf_tx_max_set(struct devlink_rate *rate_leaf, void * esw_qos_lock(esw); err = mlx5_esw_qos_set_vport_max_rate(vport, tx_max, extack); + if (err) + goto out; + esw_vport_qos_prune_empty(vport); +out: esw_qos_unlock(esw); return err; } @@ -1733,6 +1767,7 @@ int mlx5_esw_devlink_rate_leaf_tc_bw_set(struct devlink_rate *rate_leaf, if (vport_node->type == SCHED_NODE_TYPE_TC_ARBITER_TSAR) err = esw_qos_vport_update(vport, SCHED_NODE_TYPE_VPORT, vport_node->parent, extack); + esw_vport_qos_prune_empty(vport); goto unlock; } @@ -1893,14 +1928,20 @@ int mlx5_esw_devlink_rate_leaf_parent_set(struct devlink_rate *devlink_rate, void *priv, void *parent_priv, struct netlink_ext_ack *extack) { - struct mlx5_esw_sched_node *node; + struct mlx5_esw_sched_node *node = parent ? parent_priv : NULL; struct mlx5_vport *vport = priv; + int err; - if (!parent) - return mlx5_esw_qos_vport_update_parent(vport, NULL, extack); + err = mlx5_esw_qos_vport_update_parent(vport, node, extack); + if (!err) { + struct mlx5_eswitch *esw = vport->dev->priv.eswitch; + + esw_qos_lock(esw); + esw_vport_qos_prune_empty(vport); + esw_qos_unlock(esw); + } - node = parent_priv; - return mlx5_esw_qos_vport_update_parent(vport, node, extack); + return err; } static bool esw_qos_is_node_empty(struct mlx5_esw_sched_node *node) -- cgit v1.2.3 From 3c114fb2afe493066df5b9e560ef37216b153c90 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Wed, 20 Aug 2025 16:32:06 +0300 Subject: net/mlx5: Fix QoS reference leak in vport enable error path Add missing esw_qos_put() call when __esw_qos_alloc_node() fails in mlx5_esw_qos_vport_enable(). Fixes: be034baba83e ("net/mlx5: Make vport QoS enablement more flexible for future extensions") Signed-off-by: Carolina Jubran Reviewed-by: Cosmin Ratiu Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250820133209.389065-6-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers/net') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c index 452a948a3e6d..41aec07bb6c2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c @@ -1109,8 +1109,10 @@ static int mlx5_esw_qos_vport_enable(struct mlx5_vport *vport, enum sched_node_t parent_esw = parent ? parent->esw : esw; sched_node = __esw_qos_alloc_node(parent_esw, 0, type, parent); - if (!sched_node) + if (!sched_node) { + esw_qos_put(esw); return -ENOMEM; + } if (!parent) list_add_tail(&sched_node->entry, &esw->qos.domain->nodes); -- cgit v1.2.3 From 51b17c98e3dbb2093a81b0490050a0eaa919ebee Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Wed, 20 Aug 2025 16:32:07 +0300 Subject: net/mlx5: Restore missing scheduling node cleanup on vport enable failure Restore the __esw_qos_free_node() call removed by the offending commit. Fixes: 97733d1e00a0 ("net/mlx5: Add traffic class scheduling support for vport QoS") Signed-off-by: Carolina Jubran Reviewed-by: Tariq Toukan Reviewed-by: Cosmin Ratiu Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250820133209.389065-7-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/net') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c index 41aec07bb6c2..8b4977650183 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c @@ -1122,6 +1122,7 @@ static int mlx5_esw_qos_vport_enable(struct mlx5_vport *vport, enum sched_node_t vport->qos.sched_node = sched_node; err = esw_qos_vport_enable(vport, type, parent, extack); if (err) { + __esw_qos_free_node(sched_node); esw_qos_put(esw); vport->qos.sched_node = NULL; } -- cgit v1.2.3 From 451d2849ea66659040b59ae3cb7e50cc97404733 Mon Sep 17 00:00:00 2001 From: Alexei Lazar Date: Wed, 20 Aug 2025 16:32:08 +0300 Subject: net/mlx5e: Query FW for buffer ownership The SW currently saves local buffer ownership when setting the buffer. This means that the SW assumes it has ownership of the buffer after the command is set. If setting the buffer fails and we remain in FW ownership, the local buffer ownership state incorrectly remains as SW-owned. This leads to incorrect behavior in subsequent PFC commands, causing failures. Instead of saving local buffer ownership in SW, query the FW for buffer ownership when setting the buffer. This ensures that the buffer ownership state is accurately reflected, avoiding the issues caused by incorrect ownership states. Fixes: ecdf2dadee8e ("net/mlx5e: Receive buffer support for DCBX") Signed-off-by: Alexei Lazar Reviewed-by: Shahar Shitrit Reviewed-by: Dragos Tatulea Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250820133209.389065-8-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en/dcbnl.h | 1 - drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c | 12 +++++++++--- drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h | 2 ++ drivers/net/ethernet/mellanox/mlx5/core/port.c | 20 ++++++++++++++++++++ 4 files changed, 31 insertions(+), 4 deletions(-) (limited to 'drivers/net') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/dcbnl.h b/drivers/net/ethernet/mellanox/mlx5/core/en/dcbnl.h index b59aee75de94..2c98a5299df3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/dcbnl.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/dcbnl.h @@ -26,7 +26,6 @@ struct mlx5e_dcbx { u8 cap; /* Buffer configuration */ - bool manual_buffer; u32 cable_len; u32 xoff; u16 port_buff_cell_sz; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c index 5fe016e477b3..d166c0d5189e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c @@ -362,6 +362,7 @@ static int mlx5e_dcbnl_ieee_getpfc(struct net_device *dev, static int mlx5e_dcbnl_ieee_setpfc(struct net_device *dev, struct ieee_pfc *pfc) { + u8 buffer_ownership = MLX5_BUF_OWNERSHIP_UNKNOWN; struct mlx5e_priv *priv = netdev_priv(dev); struct mlx5_core_dev *mdev = priv->mdev; u32 old_cable_len = priv->dcbx.cable_len; @@ -389,7 +390,14 @@ static int mlx5e_dcbnl_ieee_setpfc(struct net_device *dev, if (MLX5_BUFFER_SUPPORTED(mdev)) { pfc_new.pfc_en = (changed & MLX5E_PORT_BUFFER_PFC) ? pfc->pfc_en : curr_pfc_en; - if (priv->dcbx.manual_buffer) + ret = mlx5_query_port_buffer_ownership(mdev, + &buffer_ownership); + if (ret) + netdev_err(dev, + "%s, Failed to get buffer ownership: %d\n", + __func__, ret); + + if (buffer_ownership == MLX5_BUF_OWNERSHIP_SW_OWNED) ret = mlx5e_port_manual_buffer_config(priv, changed, dev->mtu, &pfc_new, NULL, NULL); @@ -982,7 +990,6 @@ static int mlx5e_dcbnl_setbuffer(struct net_device *dev, if (!changed) return 0; - priv->dcbx.manual_buffer = true; err = mlx5e_port_manual_buffer_config(priv, changed, dev->mtu, NULL, buffer_size, prio2buffer); return err; @@ -1252,7 +1259,6 @@ void mlx5e_dcbnl_initialize(struct mlx5e_priv *priv) priv->dcbx.cap |= DCB_CAP_DCBX_HOST; priv->dcbx.port_buff_cell_sz = mlx5e_query_port_buffers_cell_size(priv); - priv->dcbx.manual_buffer = false; priv->dcbx.cable_len = MLX5E_DEFAULT_CABLE_LEN; mlx5e_ets_init(priv); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index b6d53db27cd5..9d3504f5abfa 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -367,6 +367,8 @@ int mlx5_query_port_dcbx_param(struct mlx5_core_dev *mdev, u32 *out); int mlx5_set_port_dcbx_param(struct mlx5_core_dev *mdev, u32 *in); int mlx5_set_trust_state(struct mlx5_core_dev *mdev, u8 trust_state); int mlx5_query_trust_state(struct mlx5_core_dev *mdev, u8 *trust_state); +int mlx5_query_port_buffer_ownership(struct mlx5_core_dev *mdev, + u8 *buffer_ownership); int mlx5_set_dscp2prio(struct mlx5_core_dev *mdev, u8 dscp, u8 prio); int mlx5_query_dscp2prio(struct mlx5_core_dev *mdev, u8 *dscp2prio); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c index 549f1066d2a5..2d7adf7444ba 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c @@ -968,6 +968,26 @@ int mlx5_query_trust_state(struct mlx5_core_dev *mdev, u8 *trust_state) return err; } +int mlx5_query_port_buffer_ownership(struct mlx5_core_dev *mdev, + u8 *buffer_ownership) +{ + u32 out[MLX5_ST_SZ_DW(pfcc_reg)] = {}; + int err; + + if (!MLX5_CAP_PCAM_FEATURE(mdev, buffer_ownership)) { + *buffer_ownership = MLX5_BUF_OWNERSHIP_UNKNOWN; + return 0; + } + + err = mlx5_query_pfcc_reg(mdev, out, sizeof(out)); + if (err) + return err; + + *buffer_ownership = MLX5_GET(pfcc_reg, out, buf_ownership); + + return 0; +} + int mlx5_set_dscp2prio(struct mlx5_core_dev *mdev, u8 dscp, u8 prio) { int sz = MLX5_ST_SZ_BYTES(qpdpm_reg); -- cgit v1.2.3 From 8b0587a885fdb34fd6090a3f8625cb7ac1444826 Mon Sep 17 00:00:00 2001 From: Armen Ratner Date: Wed, 20 Aug 2025 16:32:09 +0300 Subject: net/mlx5e: Preserve shared buffer capacity during headroom updates When port buffer headroom changes, port_update_shared_buffer() recalculates the shared buffer size and splits it in a 3:1 ratio (lossy:lossless) - Currently, the calculation is: lossless = shared / 4; lossy = (shared / 4) * 3; Meaning, the calculation dropped the remainder of shared % 4 due to integer division, unintentionally reducing the total shared buffer by up to three cells on each update. Over time, this could shrink the buffer below usable size. Fix it by changing the calculation to: lossless = shared / 4; lossy = shared - lossless; This retains all buffer cells while still approximating the intended 3:1 split, preventing capacity loss over time. While at it, perform headroom calculations in units of cells rather than in bytes for more accurate calculations avoiding extra divisions. Fixes: a440030d8946 ("net/mlx5e: Update shared buffer along with device buffer changes") Signed-off-by: Armen Ratner Signed-off-by: Maher Sanalla Reviewed-by: Tariq Toukan Signed-off-by: Alexei Lazar Signed-off-by: Mark Bloch Reviewed-by: Przemek Kitszel Link: https://patch.msgid.link/20250820133209.389065-9-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/en/port_buffer.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'drivers/net') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c index 5ae787656a7c..3efa8bf1d14e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/port_buffer.c @@ -272,8 +272,8 @@ static int port_update_shared_buffer(struct mlx5_core_dev *mdev, /* Total shared buffer size is split in a ratio of 3:1 between * lossy and lossless pools respectively. */ - lossy_epool_size = (shared_buffer_size / 4) * 3; lossless_ipool_size = shared_buffer_size / 4; + lossy_epool_size = shared_buffer_size - lossless_ipool_size; mlx5e_port_set_sbpr(mdev, 0, MLX5_EGRESS_DIR, MLX5_LOSSY_POOL, 0, lossy_epool_size); @@ -288,14 +288,12 @@ static int port_set_buffer(struct mlx5e_priv *priv, u16 port_buff_cell_sz = priv->dcbx.port_buff_cell_sz; struct mlx5_core_dev *mdev = priv->mdev; int sz = MLX5_ST_SZ_BYTES(pbmc_reg); - u32 new_headroom_size = 0; - u32 current_headroom_size; + u32 current_headroom_cells = 0; + u32 new_headroom_cells = 0; void *in; int err; int i; - current_headroom_size = port_buffer->headroom_size; - in = kzalloc(sz, GFP_KERNEL); if (!in) return -ENOMEM; @@ -306,12 +304,14 @@ static int port_set_buffer(struct mlx5e_priv *priv, for (i = 0; i < MLX5E_MAX_NETWORK_BUFFER; i++) { void *buffer = MLX5_ADDR_OF(pbmc_reg, in, buffer[i]); + current_headroom_cells += MLX5_GET(bufferx_reg, buffer, size); + u64 size = port_buffer->buffer[i].size; u64 xoff = port_buffer->buffer[i].xoff; u64 xon = port_buffer->buffer[i].xon; - new_headroom_size += size; do_div(size, port_buff_cell_sz); + new_headroom_cells += size; do_div(xoff, port_buff_cell_sz); do_div(xon, port_buff_cell_sz); MLX5_SET(bufferx_reg, buffer, size, size); @@ -320,10 +320,8 @@ static int port_set_buffer(struct mlx5e_priv *priv, MLX5_SET(bufferx_reg, buffer, xon_threshold, xon); } - new_headroom_size /= port_buff_cell_sz; - current_headroom_size /= port_buff_cell_sz; - err = port_update_shared_buffer(priv->mdev, current_headroom_size, - new_headroom_size); + err = port_update_shared_buffer(priv->mdev, current_headroom_cells, + new_headroom_cells); if (err) goto out; -- cgit v1.2.3