summary | refs | log | tree | commit | diff
path: root/include
diff options
context:
space:
mode:
author: Paolo Abeni <pabeni@redhat.com> 2026-01-20 11:58:52 +0100
committer: Paolo Abeni <pabeni@redhat.com> 2026-01-20 12:25:29 +0100
commit: 77b9c4a438fc66e2ab004c411056b3fb71a54f2c (patch)
tree: 1a19c67570e38e8c1754b06bbc75bf3a8d0ebbf4 /include
parent: 4515ec4ad58a37e70a9e1256c0b993958c9b7497 (diff)
parent: 931420a2fc363817c92990fa14eb1bdec024ce04 (diff)
Merge branch 'netkit-support-for-io_uring-zero-copy-and-af_xdp'
Daniel Borkmann says:

====================
netkit: Support for io_uring zero-copy and AF_XDP

Containers use virtual netdevs to route traffic from a physical netdev in the host namespace. They do not have access to the physical netdev in the host and thus can't use memory providers or AF_XDP that require reconfiguring/restarting queues in the physical netdev.

This patchset adds the concept of queue leasing to virtual netdevs that allow containers to use memory providers and AF_XDP at native speed. Leased queues are bound to a real queue in a physical netdev and act as a proxy.

Memory providers and AF_XDP operations take an ifindex and queue id, so containers would pass in an ifindex for a virtual netdev and a queue id of a leased queue, which then gets proxied to the underlying real queue.

We have implemented support for this concept in netkit and tested the latter against Nvidia ConnectX-6 (mlx5) as well as Broadcom BCM957504 (bnxt_en) 100G NICs. For more details see the individual patches.
====================

Link: https://patch.msgid.link/20260115082603.219152-1-daniel@iogearbox.net
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Diffstat (limited to 'include')
-rw-r--r--  include/linux/netdevice.h                 6
-rw-r--r--  include/net/netdev_queues.h              19
-rw-r--r--  include/net/netdev_rx_queue.h            21
-rw-r--r--  include/net/page_pool/memory_provider.h   4
-rw-r--r--  include/net/xdp_sock_drv.h                2
-rw-r--r--  include/uapi/linux/if_link.h              6
-rw-r--r--  include/uapi/linux/netdev.h              11
7 files changed, 62 insertions, 7 deletions
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d99b0fbc1942..4d146c000e21 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3400,11 +3400,17 @@ static inline int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
int register_netdevice(struct net_device *dev);
void unregister_netdevice_queue(struct net_device *dev, struct list_head *head);
void unregister_netdevice_many(struct list_head *head);
+
static inline void unregister_netdevice(struct net_device *dev)
{
unregister_netdevice_queue(dev, NULL);
}
+static inline bool unregister_netdevice_queued(const struct net_device *dev)
+{
+ return !list_empty(&dev->unreg_list);
+}
+
int netdev_refcnt_read(const struct net_device *dev);
void free_netdev(struct net_device *dev);
diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h
index b55d3b9cb9c2..81dc7cb2360c 100644
--- a/include/net/netdev_queues.h
+++ b/include/net/netdev_queues.h
@@ -130,6 +130,11 @@ void netdev_stat_queue_sum(struct net_device *netdev,
* @ndo_queue_get_dma_dev: Get dma device for zero-copy operations to be used
* for this queue. Return NULL on error.
*
+ * @ndo_queue_create: Create a new RX queue which can be leased to another queue.
+ * Ops on this queue are redirected to the leased queue e.g.
+ * when opening a memory provider. Return the new queue id on
+ * success. Return negative error code on failure.
+ *
* Note that @ndo_queue_mem_alloc and @ndo_queue_mem_free may be called while
* the interface is closed. @ndo_queue_start and @ndo_queue_stop will only
* be called for an interface which is open.
@@ -149,9 +154,12 @@ struct netdev_queue_mgmt_ops {
int idx);
struct device * (*ndo_queue_get_dma_dev)(struct net_device *dev,
int idx);
+ int (*ndo_queue_create)(struct net_device *dev);
};
-bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx);
+bool netif_rxq_has_unreadable_mp(struct net_device *dev, unsigned int rxq_idx);
+bool netif_rxq_has_mp(struct net_device *dev, unsigned int rxq_idx);
+bool netif_rxq_is_leased(struct net_device *dev, unsigned int rxq_idx);
/**
* DOC: Lockless queue stopping / waking helpers.
@@ -340,5 +348,10 @@ static inline unsigned int netif_xmit_timeout_ms(struct netdev_queue *txq)
})
struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx);
-
-#endif
+bool netdev_can_create_queue(const struct net_device *dev,
+ struct netlink_ext_ack *extack);
+bool netdev_can_lease_queue(const struct net_device *dev,
+ struct netlink_ext_ack *extack);
+bool netdev_queue_busy(struct net_device *dev, int idx,
+ struct netlink_ext_ack *extack);
+#endif /* _LINUX_NET_QUEUES_H */
diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h
index 8cdcd138b33f..508d11afaecb 100644
--- a/include/net/netdev_rx_queue.h
+++ b/include/net/netdev_rx_queue.h
@@ -28,6 +28,8 @@ struct netdev_rx_queue {
#endif
struct napi_struct *napi;
struct pp_memory_provider_params mp_params;
+ struct netdev_rx_queue *lease;
+ netdevice_tracker lease_tracker;
} ____cacheline_aligned_in_smp;
/*
@@ -57,5 +59,22 @@ get_netdev_rx_queue_index(struct netdev_rx_queue *queue)
}
int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq);
+void netdev_rx_queue_lease(struct netdev_rx_queue *rxq_dst,
+ struct netdev_rx_queue *rxq_src);
+void netdev_rx_queue_unlease(struct netdev_rx_queue *rxq_dst,
+ struct netdev_rx_queue *rxq_src);
+bool netif_rx_queue_lease_get_owner(struct net_device **dev, unsigned int *rxq);
-#endif
+enum netif_lease_dir {
+ NETIF_VIRT_TO_PHYS,
+ NETIF_PHYS_TO_VIRT,
+};
+
+struct netdev_rx_queue *
+__netif_get_rx_queue_lease(struct net_device **dev, unsigned int *rxq,
+ enum netif_lease_dir dir);
+struct netdev_rx_queue *
+netif_get_rx_queue_lease_locked(struct net_device **dev, unsigned int *rxq);
+void netif_put_rx_queue_lease_locked(struct net_device *orig_dev,
+ struct net_device *dev);
+#endif /* _LINUX_NETDEV_RX_QUEUE_H */
diff --git a/include/net/page_pool/memory_provider.h b/include/net/page_pool/memory_provider.h
index ada4f968960a..b6f811c3416b 100644
--- a/include/net/page_pool/memory_provider.h
+++ b/include/net/page_pool/memory_provider.h
@@ -23,12 +23,12 @@ bool net_mp_niov_set_dma_addr(struct net_iov *niov, dma_addr_t addr);
void net_mp_niov_set_page_pool(struct page_pool *pool, struct net_iov *niov);
void net_mp_niov_clear_page_pool(struct net_iov *niov);
-int net_mp_open_rxq(struct net_device *dev, unsigned ifq_idx,
+int net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
struct pp_memory_provider_params *p);
int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
const struct pp_memory_provider_params *p,
struct netlink_ext_ack *extack);
-void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx,
+void net_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx,
struct pp_memory_provider_params *old_p);
void __net_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx,
const struct pp_memory_provider_params *old_p);
diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
index 242e34f771cc..c07cfb431eac 100644
--- a/include/net/xdp_sock_drv.h
+++ b/include/net/xdp_sock_drv.h
@@ -28,7 +28,7 @@ void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries);
bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc);
u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max);
void xsk_tx_release(struct xsk_buff_pool *pool);
-struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev,
+struct xsk_buff_pool *xsk_get_pool_from_qid(const struct net_device *dev,
u16 queue_id);
void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool);
void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool);
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 3b491d96e52e..bbd565757298 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -1296,6 +1296,11 @@ enum netkit_mode {
NETKIT_L3,
};
+enum netkit_pairing {
+ NETKIT_DEVICE_PAIR,
+ NETKIT_DEVICE_SINGLE,
+};
+
/* NETKIT_SCRUB_NONE leaves clearing skb->{mark,priority} up to
* the BPF program if attached. This also means the latter can
* consume the two fields if they were populated earlier.
@@ -1320,6 +1325,7 @@ enum {
IFLA_NETKIT_PEER_SCRUB,
IFLA_NETKIT_HEADROOM,
IFLA_NETKIT_TAILROOM,
+ IFLA_NETKIT_PAIRING,
__IFLA_NETKIT_MAX,
};
#define IFLA_NETKIT_MAX (__IFLA_NETKIT_MAX - 1)
diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h
index e0b579a1df4f..7df1056a35fd 100644
--- a/include/uapi/linux/netdev.h
+++ b/include/uapi/linux/netdev.h
@@ -160,6 +160,7 @@ enum {
NETDEV_A_QUEUE_DMABUF,
NETDEV_A_QUEUE_IO_URING,
NETDEV_A_QUEUE_XSK,
+ NETDEV_A_QUEUE_LEASE,
__NETDEV_A_QUEUE_MAX,
NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1)
@@ -203,6 +204,15 @@ enum {
};
enum {
+ NETDEV_A_LEASE_IFINDEX = 1,
+ NETDEV_A_LEASE_QUEUE,
+ NETDEV_A_LEASE_NETNS_ID,
+
+ __NETDEV_A_LEASE_MAX,
+ NETDEV_A_LEASE_MAX = (__NETDEV_A_LEASE_MAX - 1)
+};
+
+enum {
NETDEV_A_DMABUF_IFINDEX = 1,
NETDEV_A_DMABUF_QUEUES,
NETDEV_A_DMABUF_FD,
@@ -228,6 +238,7 @@ enum {
NETDEV_CMD_BIND_RX,
NETDEV_CMD_NAPI_SET,
NETDEV_CMD_BIND_TX,
+ NETDEV_CMD_QUEUE_CREATE,
__NETDEV_CMD_MAX,
NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1)