summaryrefslogtreecommitdiff
path: root/net/core/netdev_rx_queue.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2026-04-14 18:36:10 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2026-04-14 18:36:10 -0700
commit91a4855d6c03e770e42f17c798a36a3c46e63de2 (patch)
tree5103bfe3aea2aab7e8b358c5c9329539508f648d /net/core/netdev_rx_queue.c
parentf5ad4101009e7f5f5984ffea6923d4fcd470932a (diff)
parent35c2c39832e569449b9192fa1afbbc4c66227af7 (diff)
Merge tag 'net-next-7.1' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Jakub Kicinski: "Core & protocols: - Support HW queue leasing, allowing containers to be granted access to HW queues for zero-copy operations and AF_XDP - Number of code moves to help the compiler with inlining. Avoid output arguments for returning drop reason where possible - Rework drop handling within qdiscs to include more metadata about the reason and dropping qdisc in the tracepoints - Remove the rtnl_lock use from IP Multicast Routing - Pack size information into the Rx Flow Steering table pointer itself. This allows making the table itself a flat array of u32s, thus making the table allocation size a power of two - Report TCP delayed ack timer information via socket diag - Add ip_local_port_step_width sysctl to allow distributing the randomly selected ports more evenly throughout the allowed space - Add support for per-route tunsrc in IPv6 segment routing - Start work of switching sockopt handling to iov_iter - Improve dynamic recvbuf sizing in MPTCP, limit burstiness and avoid buffer size drifting up - Support MSG_EOR in MPTCP - Add stp_mode attribute to the bridge driver for STP mode selection. This addresses concerns about call_usermodehelper() usage - Remove UDP-Lite support (as announced in 2023) - Remove support for building IPv6 as a module. Remove the now unnecessary function calling indirection Cross-tree stuff: - Move Michael MIC code from generic crypto into wireless, it's considered insecure but some WiFi networks still need it Netfilter: - Switch nft_fib_ipv6 module to no longer need temporary dst_entry object allocations by using fib6_lookup() + RCU. Florian W reports this gets us ~13% higher packet rate - Convert IPVS's global __ip_vs_mutex to per-net service_mutex and switch the service tables to be per-net. Convert some code that walks the service lists to use RCU instead of the service_mutex - Add more opinionated input validation to lower security exposure - Make IPVS hash tables to be per-netns and resizable Wireless: - Finished assoc frame encryption/EPPKE/802.1X-over-auth - Radar detection improvements - Add 6 GHz incumbent signal detection APIs - Multi-link support for FILS, probe response templates and client probing - New APIs and mac80211 support for NAN (Neighbor Aware Networking, aka Wi-Fi Aware) so less work must be in firmware Driver API: - Add numerical ID for devlink instances (to avoid having to create fake bus/device pairs just to have an ID). Support shared devlink instances which span multiple PFs - Add standard counters for reporting pause storm events (implement in mlx5 and fbnic) - Add configuration API for completion writeback buffering (implement in mana) - Support driver-initiated change of RSS context sizes - Support DPLL monitoring input frequency (implement in zl3073x) - Support per-port resources in devlink (implement in mlx5) Misc: - Expand the YAML spec for Netfilter Drivers - Software: - macvlan: support multicast rx for bridge ports with shared source MAC address - team: decouple receive and transmit enablement for IEEE 802.3ad LACP "independent control" - Ethernet high-speed NICs: - nVidia/Mellanox: - support high order pages in zero-copy mode (for payload coalescing) - support multiple packets in a page (for systems with 64kB pages) - Broadcom 25-400GE (bnxt): - implement XDP RSS hash metadata extraction - add software fallback for UDP GSO, lowering the IOMMU cost - Broadcom 800GE (bnge): - add link status and configuration handling - add various HW and SW statistics - Marvell/Cavium: - NPC HW block support for cn20k - Huawei (hinic3): - add mailbox / control queue - add rx VLAN offload - add driver info and link management - Ethernet NICs: - Marvell/Aquantia: - support reading SFP module info on some AQC100 cards - Realtek PCI (r8169): - add support for RTL8125cp - Realtek USB (r8152): - support for the RTL8157 5Gbit chip - add 2500baseT EEE status/configuration support - Ethernet NICs embedded and off-the-shelf IP: - Synopsys (stmmac): - cleanup and reorganize SerDes handling and PCS support - cleanup descriptor handling and per-platform data - cleanup and consolidate MDIO defines and handling - shrink driver memory use for internal structures - improve Tx IRQ coalescing - improve TCP segmentation handling - add support for Spacemit K3 - Cadence (macb): - support PHYs that have inband autoneg disabled with GEM - support IEEE 802.3az EEE - rework usrio capabilities and handling - AMD (xgbe): - improve power management for S0i3 - improve TX resilience for link-down handling - Virtual: - Google cloud vNIC: - support larger ring sizes in DQO-QPL mode - improve HW-GRO handling - support UDP GSO for DQO format - PCIe NTB: - support queue count configuration - Ethernet PHYs: - automatically disable PHY autonomous EEE if MAC is in charge - Broadcom: - add BCM84891/BCM84892 support - Micrel: - support for LAN9645X internal PHY - Realtek: - add RTL8224 pair order support - support PHY LEDs on RTL8211F-VD - support spread spectrum clocking (SSC) - Maxlinear: - add PHY-level statistics via ethtool - Ethernet switches: - Maxlinear (mxl862xx): - support for bridge offloading - support for VLANs - support driver statistics - Bluetooth: - large number of fixes and new device IDs - Mediatek: - support MT6639 (MT7927) - support MT7902 SDIO - WiFi: - Intel (iwlwifi): - UNII-9 and continuing UHR work - MediaTek (mt76): - mt7996/mt7925 MLO fixes/improvements - mt7996 NPU support (HW eth/wifi traffic offload) - Qualcomm (ath12k): - monitor mode support on IPQ5332 - basic hwmon temperature reporting - support IPQ5424 - Realtek: - add USB RX aggregation to improve performance - add USB TX flow control by tracking in-flight URBs - Cellular: - IPA v5.2 support" * tag 'net-next-7.1' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1561 commits) net: pse-pd: fix kernel-doc function name for pse_control_find_by_id() wireguard: device: use exit_rtnl callback instead of manual rtnl_lock in pre_exit wireguard: allowedips: remove redundant space tools: ynl: add sample for wireguard wireguard: allowedips: Use kfree_rcu() instead of call_rcu() MAINTAINERS: Add netkit selftest files selftests/net: Add additional test coverage in nk_qlease selftests/net: Split netdevsim tests from HW tests in nk_qlease tools/ynl: Make YnlFamily closeable as a context manager net: airoha: Add missing PPE configurations in airoha_ppe_hw_init() net: airoha: Fix VIP configuration for AN7583 SoC net: caif: clear client service pointer on teardown net: strparser: fix skb_head leak in strp_abort_strp() net: usb: cdc-phonet: fix skb frags[] overflow in rx_complete() selftests/bpf: add test for xdp_master_redirect with bond not up net, bpf: fix null-ptr-deref in xdp_master_redirect() for down master net: airoha: Remove PCE_MC_EN_MASK bit in REG_FE_PCE_CFG configuration sctp: disable BH before calling udp_tunnel_xmit_skb() sctp: fix missing encap_port propagation for GSO fragments net: airoha: Rely on net_device pointer in ETS callbacks ...
Diffstat (limited to 'net/core/netdev_rx_queue.c')
-rw-r--r--net/core/netdev_rx_queue.c182
1 files changed, 154 insertions, 28 deletions
diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c
index 05fd2875d725..de4dac4c88b3 100644
--- a/net/core/netdev_rx_queue.c
+++ b/net/core/netdev_rx_queue.c
@@ -10,15 +10,91 @@
#include "dev.h"
#include "page_pool_priv.h"
-/* See also page_pool_is_unreadable() */
-bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx)
+void netdev_rx_queue_lease(struct netdev_rx_queue *rxq_dst,
+ struct netdev_rx_queue *rxq_src)
+{
+ netdev_assert_locked(rxq_src->dev);
+ netdev_assert_locked(rxq_dst->dev);
+
+ netdev_hold(rxq_src->dev, &rxq_src->lease_tracker, GFP_KERNEL);
+
+ WRITE_ONCE(rxq_src->lease, rxq_dst);
+ WRITE_ONCE(rxq_dst->lease, rxq_src);
+}
+
+void netdev_rx_queue_unlease(struct netdev_rx_queue *rxq_dst,
+ struct netdev_rx_queue *rxq_src)
+{
+ netdev_assert_locked(rxq_dst->dev);
+ netdev_assert_locked(rxq_src->dev);
+
+ netif_rxq_cleanup_unlease(rxq_src, rxq_dst);
+
+ WRITE_ONCE(rxq_src->lease, NULL);
+ WRITE_ONCE(rxq_dst->lease, NULL);
+
+ netdev_put(rxq_src->dev, &rxq_src->lease_tracker);
+}
+
+bool netif_rxq_is_leased(struct net_device *dev, unsigned int rxq_idx)
+{
+ if (rxq_idx < dev->real_num_rx_queues)
+ return READ_ONCE(__netif_get_rx_queue(dev, rxq_idx)->lease);
+ return false;
+}
+
+/* Virtual devices eligible for leasing have no dev->dev.parent, while
+ * physical devices always have one. Use this to enforce the correct
+ * lease traversal direction.
+ */
+static bool netif_lease_dir_ok(const struct net_device *dev,
+ enum netif_lease_dir dir)
+{
+ if (dir == NETIF_VIRT_TO_PHYS && !dev->dev.parent)
+ return true;
+ if (dir == NETIF_PHYS_TO_VIRT && dev->dev.parent)
+ return true;
+ return false;
+}
+
+bool netif_is_queue_leasee(const struct net_device *dev)
{
- struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, idx);
+ return netif_lease_dir_ok(dev, NETIF_VIRT_TO_PHYS);
+}
- return !!rxq->mp_params.mp_ops;
+struct netdev_rx_queue *
+__netif_get_rx_queue_lease(struct net_device **dev, unsigned int *rxq_idx,
+ enum netif_lease_dir dir)
+{
+ struct net_device *orig_dev = *dev;
+ struct netdev_rx_queue *rxq = __netif_get_rx_queue(orig_dev, *rxq_idx);
+
+ if (rxq->lease) {
+ if (!netif_lease_dir_ok(orig_dev, dir))
+ return NULL;
+ rxq = rxq->lease;
+ *rxq_idx = get_netdev_rx_queue_index(rxq);
+ *dev = rxq->dev;
+ }
+ return rxq;
+}
+
+/* See also page_pool_is_unreadable() */
+bool netif_rxq_has_unreadable_mp(struct net_device *dev, unsigned int rxq_idx)
+{
+ if (rxq_idx < dev->real_num_rx_queues)
+ return __netif_get_rx_queue(dev, rxq_idx)->mp_params.mp_ops;
+ return false;
}
EXPORT_SYMBOL(netif_rxq_has_unreadable_mp);
+bool netif_rxq_has_mp(struct net_device *dev, unsigned int rxq_idx)
+{
+ if (rxq_idx < dev->real_num_rx_queues)
+ return __netif_get_rx_queue(dev, rxq_idx)->mp_params.mp_priv;
+ return false;
+}
+
static int netdev_rx_queue_reconfig(struct net_device *dev,
unsigned int rxq_idx,
struct netdev_queue_config *qcfg_old,
@@ -108,9 +184,9 @@ int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx)
}
EXPORT_SYMBOL_NS_GPL(netdev_rx_queue_restart, "NETDEV_INTERNAL");
-int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
- const struct pp_memory_provider_params *p,
- struct netlink_ext_ack *extack)
+static int __netif_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
+ const struct pp_memory_provider_params *p,
+ struct netlink_ext_ack *extack)
{
const struct netdev_queue_mgmt_ops *qops = dev->queue_mgmt_ops;
struct netdev_queue_config qcfg[2];
@@ -120,12 +196,6 @@ int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
if (!qops)
return -EOPNOTSUPP;
- if (rxq_idx >= dev->real_num_rx_queues) {
- NL_SET_ERR_MSG(extack, "rx queue index out of range");
- return -ERANGE;
- }
- rxq_idx = array_index_nospec(rxq_idx, dev->real_num_rx_queues);
-
if (dev->cfg->hds_config != ETHTOOL_TCP_DATA_SPLIT_ENABLED) {
NL_SET_ERR_MSG(extack, "tcp-data-split is disabled");
return -EINVAL;
@@ -172,28 +242,47 @@ err_clear_mp:
return ret;
}
-int net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
- struct pp_memory_provider_params *p)
+int netif_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
+ const struct pp_memory_provider_params *p,
+ struct netlink_ext_ack *extack)
{
int ret;
+ if (!netdev_need_ops_lock(dev))
+ return -EOPNOTSUPP;
+
+ if (rxq_idx >= dev->real_num_rx_queues) {
+ NL_SET_ERR_MSG(extack, "rx queue index out of range");
+ return -ERANGE;
+ }
+ rxq_idx = array_index_nospec(rxq_idx, dev->real_num_rx_queues);
+
+ if (!netif_rxq_is_leased(dev, rxq_idx))
+ return __netif_mp_open_rxq(dev, rxq_idx, p, extack);
+
+ if (!__netif_get_rx_queue_lease(&dev, &rxq_idx, NETIF_VIRT_TO_PHYS)) {
+ NL_SET_ERR_MSG(extack, "rx queue leased to a virtual netdev");
+ return -EBUSY;
+ }
+ if (!dev->dev.parent) {
+ NL_SET_ERR_MSG(extack, "rx queue belongs to a virtual netdev");
+ return -EOPNOTSUPP;
+ }
+
netdev_lock(dev);
- ret = __net_mp_open_rxq(dev, rxq_idx, p, NULL);
+ ret = __netif_mp_open_rxq(dev, rxq_idx, p, extack);
netdev_unlock(dev);
return ret;
}
-void __net_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx,
- const struct pp_memory_provider_params *old_p)
+static void __netif_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx,
+ const struct pp_memory_provider_params *old_p)
{
struct netdev_queue_config qcfg[2];
struct netdev_rx_queue *rxq;
int err;
- if (WARN_ON_ONCE(ifq_idx >= dev->real_num_rx_queues))
- return;
-
- rxq = __netif_get_rx_queue(dev, ifq_idx);
+ rxq = __netif_get_rx_queue(dev, rxq_idx);
/* Callers holding a netdev ref may get here after we already
* went thru shutdown via dev_memory_provider_uninstall().
@@ -206,18 +295,55 @@ void __net_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx,
rxq->mp_params.mp_priv != old_p->mp_priv))
return;
- netdev_queue_config(dev, ifq_idx, &qcfg[0]);
+ netdev_queue_config(dev, rxq_idx, &qcfg[0]);
memset(&rxq->mp_params, 0, sizeof(rxq->mp_params));
- netdev_queue_config(dev, ifq_idx, &qcfg[1]);
+ netdev_queue_config(dev, rxq_idx, &qcfg[1]);
- err = netdev_rx_queue_reconfig(dev, ifq_idx, &qcfg[0], &qcfg[1]);
+ err = netdev_rx_queue_reconfig(dev, rxq_idx, &qcfg[0], &qcfg[1]);
WARN_ON(err && err != -ENETDOWN);
}
-void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx,
- struct pp_memory_provider_params *old_p)
+void netif_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx,
+ const struct pp_memory_provider_params *old_p)
{
+ if (WARN_ON_ONCE(rxq_idx >= dev->real_num_rx_queues))
+ return;
+ if (!netif_rxq_is_leased(dev, rxq_idx))
+ return __netif_mp_close_rxq(dev, rxq_idx, old_p);
+
+ if (!__netif_get_rx_queue_lease(&dev, &rxq_idx, NETIF_VIRT_TO_PHYS)) {
+ WARN_ON_ONCE(1);
+ return;
+ }
netdev_lock(dev);
- __net_mp_close_rxq(dev, ifq_idx, old_p);
+ __netif_mp_close_rxq(dev, rxq_idx, old_p);
netdev_unlock(dev);
}
+
+void __netif_mp_uninstall_rxq(struct netdev_rx_queue *rxq,
+ const struct pp_memory_provider_params *p)
+{
+ if (p->mp_ops && p->mp_ops->uninstall)
+ p->mp_ops->uninstall(p->mp_priv, rxq);
+}
+
+/* Clean up memory provider state when a queue lease is torn down. If
+ * a memory provider was installed on the physical queue via the lease,
+ * close it now. The memory provider is a property of the queue itself,
+ * and it was _guaranteed_ to be installed on the physical queue via
+ * the lease redirection. The extra __netif_mp_close_rxq is needed
+ * since the physical queue can outlive the virtual queue in the lease
+ * case, so it needs to be reconfigured to clear the memory provider.
+ */
+void netif_rxq_cleanup_unlease(struct netdev_rx_queue *phys_rxq,
+ struct netdev_rx_queue *virt_rxq)
+{
+ struct pp_memory_provider_params *p = &phys_rxq->mp_params;
+ unsigned int rxq_idx = get_netdev_rx_queue_index(phys_rxq);
+
+ if (!p->mp_ops)
+ return;
+
+ __netif_mp_uninstall_rxq(virt_rxq, p);
+ __netif_mp_close_rxq(phys_rxq->dev, rxq_idx, p);
+}