summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Jason Xing <kernelxing@tencent.com> 2026-05-02 23:07:22 +0300
committer: Jakub Kicinski <kuba@kernel.org> 2026-05-05 19:27:51 -0700
commit: 203cee647f551abc87b992045cd920b117ff990a (patch)
tree: e5a45083eb2049055b5ae4a675fac831b49b42a5
parent: e0f229025a8e774a695017a376c4a01279c0e66e (diff)
xsk: fix u64 descriptor address truncation on 32-bit architectures
In copy mode TX, xsk_skb_destructor_set_addr() stores the 64-bit
descriptor address into skb_shinfo(skb)->destructor_arg (void *) via a
uintptr_t cast:

    skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t)addr | 0x1UL);

On 32-bit architectures uintptr_t is 32 bits, so the upper 32 bits of the
descriptor address are silently dropped. In XDP_ZEROCOPY unaligned mode
the chunk offset is encoded in bits 48-63 of the descriptor address
(XSK_UNALIGNED_BUF_OFFSET_SHIFT = 48), meaning the offset is lost
entirely. The completion queue then returns a truncated address to
userspace, making buffer recycling impossible.

Fix this by handling the 32-bit case directly in
xsk_skb_destructor_set_addr(): when !CONFIG_64BIT, allocate an xsk_addrs
struct (the same path already used for multi-descriptor SKBs) to store the
full u64 address. The existing tagged-pointer logic in
xsk_skb_destructor_is_addr() stays unchanged: slab pointers returned from
kmem_cache_zalloc() are always word-aligned and therefore have bit 0
clear, which correctly identifies them as a struct pointer rather than an
inline tagged address on every architecture.

Factor the shared kmem_cache_zalloc + destructor_arg assignment into
__xsk_addrs_alloc() and add a wrapper xsk_addrs_alloc() that handles the
inline-to-list upgrade (is_addr check + get_addr + num_descs = 1). The
three former open-coded kmem_cache_zalloc call sites now reduce to a
single call each.

Propagate the -ENOMEM from xsk_skb_destructor_set_addr() through
xsk_skb_init_misc() so the caller can clean up the skb via kfree_skb()
before skb->destructor is installed.

The overhead is one extra kmem_cache_zalloc per first descriptor on 32-bit
only; 64-bit builds are completely unchanged.
Closes: https://lore.kernel.org/all/20260419045824.D9E5EC2BCAF@smtp.kernel.org/
Fixes: 0ebc27a4c67d ("xsk: avoid data corruption on cq descriptor number")
Signed-off-by: Jason Xing <kernelxing@tencent.com>
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
Reviewed-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Link: https://patch.msgid.link/20260502200722.53960-9-kerneljasonxing@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
-rw-r--r-- net/xdp/xsk.c 88
1 files changed, 56 insertions, 32 deletions
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 079abd4bcb69..5e5786cd9af5 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -646,9 +646,42 @@ static u64 xsk_skb_destructor_get_addr(struct sk_buff *skb)
return (u64)((uintptr_t)skb_shinfo(skb)->destructor_arg & ~0x1UL);
}
-static void xsk_skb_destructor_set_addr(struct sk_buff *skb, u64 addr)
+static struct xsk_addrs *__xsk_addrs_alloc(struct sk_buff *skb, u64 addr)
{
- skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t)addr | 0x1UL);
+ struct xsk_addrs *xsk_addr;
+
+ xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
+ if (unlikely(!xsk_addr))
+ return NULL;
+
+ xsk_addr->addrs[0] = addr;
+ skb_shinfo(skb)->destructor_arg = (void *)xsk_addr;
+ return xsk_addr;
+}
+
+static struct xsk_addrs *xsk_addrs_alloc(struct sk_buff *skb)
+{
+ struct xsk_addrs *xsk_addr;
+
+ if (!xsk_skb_destructor_is_addr(skb))
+ return (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+
+ xsk_addr = __xsk_addrs_alloc(skb, xsk_skb_destructor_get_addr(skb));
+ if (likely(xsk_addr))
+ xsk_addr->num_descs = 1;
+ return xsk_addr;
+}
+
+static int xsk_skb_destructor_set_addr(struct sk_buff *skb, u64 addr)
+{
+ if (IS_ENABLED(CONFIG_64BIT)) {
+ skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t)addr | 0x1UL);
+ return 0;
+ }
+
+ if (unlikely(!__xsk_addrs_alloc(skb, addr)))
+ return -ENOMEM;
+ return 0;
}
static void xsk_inc_num_desc(struct sk_buff *skb)
@@ -724,14 +757,20 @@ void xsk_destruct_skb(struct sk_buff *skb)
sock_wfree(skb);
}
-static void xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs,
- u64 addr)
+static int xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs,
+ u64 addr)
{
+ int err;
+
+ err = xsk_skb_destructor_set_addr(skb, addr);
+ if (unlikely(err))
+ return err;
+
skb->dev = xs->dev;
skb->priority = READ_ONCE(xs->sk.sk_priority);
skb->mark = READ_ONCE(xs->sk.sk_mark);
skb->destructor = xsk_destruct_skb;
- xsk_skb_destructor_set_addr(skb, addr);
+ return 0;
}
static void xsk_consume_skb(struct sk_buff *skb)
@@ -829,18 +868,9 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
} else {
struct xsk_addrs *xsk_addr;
- if (xsk_skb_destructor_is_addr(skb)) {
- xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache,
- GFP_KERNEL);
- if (!xsk_addr)
- return ERR_PTR(-ENOMEM);
-
- xsk_addr->num_descs = 1;
- xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb);
- skb_shinfo(skb)->destructor_arg = (void *)xsk_addr;
- } else {
- xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
- }
+ xsk_addr = xsk_addrs_alloc(skb);
+ if (!xsk_addr)
+ return ERR_PTR(-ENOMEM);
/* in case of -EOVERFLOW that could happen below,
* xsk_consume_skb() will release this node as whole skb
@@ -929,19 +959,10 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
struct page *page;
u8 *vaddr;
- if (xsk_skb_destructor_is_addr(skb)) {
- xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache,
- GFP_KERNEL);
- if (!xsk_addr) {
- err = -ENOMEM;
- goto free_err;
- }
-
- xsk_addr->num_descs = 1;
- xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb);
- skb_shinfo(skb)->destructor_arg = (void *)xsk_addr;
- } else {
- xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+ xsk_addr = xsk_addrs_alloc(skb);
+ if (!xsk_addr) {
+ err = -ENOMEM;
+ goto free_err;
}
if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) {
@@ -966,8 +987,11 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
}
}
- if (!xs->skb)
- xsk_skb_init_misc(skb, xs, desc->addr);
+ if (!xs->skb) {
+ err = xsk_skb_init_misc(skb, xs, desc->addr);
+ if (unlikely(err))
+ goto free_err;
+ }
xsk_inc_num_desc(skb);
return skb;