summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJakub Kicinski <kuba@kernel.org>2026-05-25 10:37:27 -0700
committerJakub Kicinski <kuba@kernel.org>2026-05-25 10:37:28 -0700
commitf6f1bfc1980a08a8d1eb8eaf161b79c7c67095ba (patch)
tree9ae7d76510ebfbc26aae408ee615eeb77237879a
parent86f1d0f063e423a5c1982db1e5e7a8eac511e603 (diff)
parent18014147d3ee7831dce53fe65d7fc8d428b02552 (diff)
Merge tag 'nf-26-05-22' of https://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf
Florian Westphal says: ==================== netfilter: updates for net Patches 7+8 fix a regression from 7.1-rc1. Everything else is from 2.6.x to 5.3 releases. There are additional known issues with these patches (drive-by-findings in related code). There are many old bugs all over netfilter and our ability to review feature patches has come to a complete halt due to lack of time. There are further security bugs that we cannot address due to lack of time, maintainers and reviewers. Other remarks: The xtables 32bit compat interface is already off in many vendor kernels, the plan is to remove it soon. 1) Prevent RST packets with invalid sequence numbers from forcing TCP connections into the CLOSE state without a direction check. From Hamza Mahfooz. 2) Re-derive the TCP header pointer after skb_ensure_writable in synproxy_tstamp_adjust. Prevent use-after-free and invalid checksum updates caused by stale pointers during buffer expansion. From Chris Mason. 3) Fix a race condition causing keymap list corruption in conntracks gre/pptp helper. 4) Use raw_smp_processor_id() in xt_cpu to prevent splats under PREEMPT_RCU. 5) Disable netfilter payload mangling in user namespaces (nft_payload.c and nf_queue). TCP option mangling via nft_exthdr.c remains enabled. There will be followups here to restrict resp. revalidate headers. 6) Fix an out-of-bounds read in ebtables's compat_mtw_from_user function. 7) Use list_for_each_entry_rcu() to traverse fib6_siblings in nft_fib6_info_nh_uses_dev(). Ensure safe list walking under RCU. 8) Fix an out-of-bounds read in nft_fib_ipv6 caused by incorrect list traversal. 9) Add nft_fib_nexthop selftest to netfilter. Cover nexthop enumeration for single, group, and multipath route shapes. All three nft_fib6 fixes from Jiayuan Chen. 10) Fix destination corruption in shift operations when source and destination registers overlap. Reject partial register overlap for all operations from control plane. From Fernando Fernandez Mancera. * tag 'nf-26-05-22' of https://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf: netfilter: nf_tables: fix dst corruption in same register operation selftests: netfilter: add nft_fib_nexthop test netfilter: nft_fib_ipv6: handle routes via external nexthop netfilter: nft_fib_ipv6: walk fib6_siblings under RCU netfilter: ebtables: fix OOB read in compat_mtw_from_user netfilter: disable payload mangling in userns netfilter: xt_cpu: prefer raw_smp_processor_id netfilter: nf_conntrack_gre: fix gre keymap list corruption netfilter: synproxy: refresh tcphdr after skb_ensure_writable netfilter: conntrack: tcp: do not force CLOSE on invalid-seq RST without direction check ==================== Link: https://patch.msgid.link/20260522104257.2008-1-fw@strlen.de Signed-off-by: Jakub Kicinski <kuba@kernel.org>
-rw-r--r--include/linux/netfilter/nf_conntrack_proto_gre.h7
-rw-r--r--include/net/netfilter/nf_tables.h7
-rw-r--r--net/bridge/netfilter/ebtables.c30
-rw-r--r--net/ipv6/netfilter/nft_fib_ipv6.c18
-rw-r--r--net/netfilter/nf_conntrack_core.c8
-rw-r--r--net/netfilter/nf_conntrack_pptp.c8
-rw-r--r--net/netfilter/nf_conntrack_proto_gre.c106
-rw-r--r--net/netfilter/nf_conntrack_proto_tcp.c3
-rw-r--r--net/netfilter/nf_synproxy_core.c2
-rw-r--r--net/netfilter/nfnetlink_queue.c6
-rw-r--r--net/netfilter/nft_bitwise.c18
-rw-r--r--net/netfilter/nft_byteorder.c13
-rw-r--r--net/netfilter/nft_payload.c3
-rw-r--r--net/netfilter/xt_cpu.c2
-rw-r--r--tools/testing/selftests/net/netfilter/Makefile1
-rwxr-xr-xtools/testing/selftests/net/netfilter/nft_fib_nexthop.sh152
16 files changed, 338 insertions, 46 deletions
diff --git a/include/linux/netfilter/nf_conntrack_proto_gre.h b/include/linux/netfilter/nf_conntrack_proto_gre.h
index 9ee7014400e8..ad5563f0f864 100644
--- a/include/linux/netfilter/nf_conntrack_proto_gre.h
+++ b/include/linux/netfilter/nf_conntrack_proto_gre.h
@@ -18,9 +18,10 @@ struct nf_ct_gre_keymap {
struct rcu_head rcu;
};
-/* add new tuple->key_reply pair to keymap */
-int nf_ct_gre_keymap_add(struct nf_conn *ct, enum ip_conntrack_dir dir,
- struct nf_conntrack_tuple *t);
+/* add tuple->key_reply pairs to keymap */
+bool nf_ct_gre_keymap_add(struct nf_conn *ct,
+ const struct nf_conntrack_tuple *orig,
+ const struct nf_conntrack_tuple *repl);
/* delete keymap entries */
void nf_ct_gre_keymap_destroy(struct nf_conn *ct);
diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index cff7b773e972..9d844354c4d9 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -180,6 +180,13 @@ static inline u64 nft_reg_load64(const u32 *sreg)
return get_unaligned((u64 *)sreg);
}
+static inline bool nft_reg_overlap(u8 src, u8 dst, u32 len)
+{
+ unsigned int n = DIV_ROUND_UP(len, sizeof(u32));
+
+ return src != dst && src < dst + n && dst < src + n;
+}
+
static inline void nft_data_copy(u32 *dst, const struct nft_data *src,
unsigned int len)
{
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index b9f4daac09af..8a6a069329d2 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -1956,6 +1956,25 @@ enum compat_mwt {
EBT_COMPAT_TARGET,
};
+static bool match_size_ok(const struct xt_match *match, unsigned int match_size)
+{
+ u16 csize;
+
+ if (match->matchsize == -1) /* cannot validate ebt_among */
+ return true;
+
+ csize = match->compatsize ? : match->matchsize;
+
+ return match_size >= csize;
+}
+
+static bool tgt_size_ok(const struct xt_target *tgt, unsigned int tgt_size)
+{
+ u16 csize = tgt->compatsize ? : tgt->targetsize;
+
+ return tgt_size >= csize;
+}
+
static int compat_mtw_from_user(const struct compat_ebt_entry_mwt *mwt,
enum compat_mwt compat_mwt,
struct ebt_entries_buf_state *state,
@@ -1981,6 +2000,11 @@ static int compat_mtw_from_user(const struct compat_ebt_entry_mwt *mwt,
if (IS_ERR(match))
return PTR_ERR(match);
+ if (!match_size_ok(match, match_size)) {
+ module_put(match->me);
+ return -EINVAL;
+ }
+
off = ebt_compat_match_offset(match, match_size);
if (dst) {
if (match->compat_from_user)
@@ -2000,6 +2024,12 @@ static int compat_mtw_from_user(const struct compat_ebt_entry_mwt *mwt,
mwt->u.revision);
if (IS_ERR(wt))
return PTR_ERR(wt);
+
+ if (!tgt_size_ok(wt, match_size)) {
+ module_put(wt->me);
+ return -EINVAL;
+ }
+
off = xt_compat_target_offset(wt);
if (dst) {
diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c
index 8b2dba88ee96..c0a0075e2590 100644
--- a/net/ipv6/netfilter/nft_fib_ipv6.c
+++ b/net/ipv6/netfilter/nft_fib_ipv6.c
@@ -160,17 +160,33 @@ static bool nft_fib6_info_nh_dev_match(const struct net_device *nh_dev,
l3mdev_master_ifindex_rcu(nh_dev) == dev->ifindex;
}
+static int nft_fib6_nh_match_dev_cb(struct fib6_nh *nh, void *arg)
+{
+ const struct net_device *dev = arg;
+
+ return nft_fib6_info_nh_dev_match(nh->fib_nh_dev, dev);
+}
+
static bool nft_fib6_info_nh_uses_dev(struct fib6_info *rt,
const struct net_device *dev)
{
const struct net_device *nh_dev;
struct fib6_info *iter;
+ /* External nexthop: fib6_siblings slot aliases nh_list, walk via nh. */
+ if (rt->nh)
+ return nexthop_for_each_fib6_nh(rt->nh,
+ nft_fib6_nh_match_dev_cb,
+ (void *)dev);
+
nh_dev = fib6_info_nh_dev(rt);
if (nft_fib6_info_nh_dev_match(nh_dev, dev))
return true;
- list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
+ if (!READ_ONCE(rt->fib6_nsiblings))
+ return false;
+
+ list_for_each_entry_rcu(iter, &rt->fib6_siblings, fib6_siblings) {
nh_dev = fib6_info_nh_dev(iter);
if (nft_fib6_info_nh_dev_match(nh_dev, dev))
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 8ba5b22a1eef..b521b5ebd664 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -568,6 +568,13 @@ static void destroy_gre_conntrack(struct nf_conn *ct)
#endif
}
+static void warn_on_keymap_list_leak(const struct net *net)
+{
+#ifdef CONFIG_NF_CT_PROTO_GRE
+ WARN_ON_ONCE(!list_empty(&net->ct.nf_ct_proto.gre.keymap_list));
+#endif
+}
+
void nf_ct_destroy(struct nf_conntrack *nfct)
{
struct nf_conn *ct = (struct nf_conn *)nfct;
@@ -2510,6 +2517,7 @@ i_see_dead_people:
}
list_for_each_entry(net, net_exit_list, exit_list) {
+ warn_on_keymap_list_leak(net);
nf_conntrack_ecache_pernet_fini(net);
nf_conntrack_expect_pernet_fini(net);
free_percpu(net->ct.stat);
diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c
index 4c679638df06..dc23e4181618 100644
--- a/net/netfilter/nf_conntrack_pptp.c
+++ b/net/netfilter/nf_conntrack_pptp.c
@@ -225,13 +225,9 @@ static int exp_gre(struct nf_conn *ct, __be16 callid, __be16 peer_callid)
if (nf_ct_expect_related(exp_reply, 0) != 0)
goto out_unexpect_orig;
- /* Add GRE keymap entries */
- if (nf_ct_gre_keymap_add(ct, IP_CT_DIR_ORIGINAL, &exp_orig->tuple) != 0)
+ if (!nf_ct_gre_keymap_add(ct, &exp_orig->tuple,
+ &exp_reply->tuple))
goto out_unexpect_both;
- if (nf_ct_gre_keymap_add(ct, IP_CT_DIR_REPLY, &exp_reply->tuple) != 0) {
- nf_ct_gre_keymap_destroy(ct);
- goto out_unexpect_both;
- }
ret = 0;
out_put_both:
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index 94c19bc4edc5..35e22082d65a 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -87,41 +87,97 @@ static __be16 gre_keymap_lookup(struct net *net, struct nf_conntrack_tuple *t)
return key;
}
-/* add a single keymap entry, associate with specified master ct */
-int nf_ct_gre_keymap_add(struct nf_conn *ct, enum ip_conntrack_dir dir,
- struct nf_conntrack_tuple *t)
+enum nf_ct_gre_km_act {
+ NF_CT_GRE_KM_NEW,
+ NF_CT_GRE_KM_BAD,
+ NF_CT_GRE_KM_DUP
+};
+
+static enum nf_ct_gre_km_act
+nf_ct_gre_km_acceptable(const struct nf_ct_pptp_master *ct_pptp_info,
+ const struct nf_conntrack_tuple *orig,
+ const struct nf_conntrack_tuple *repl)
+{
+ struct nf_ct_gre_keymap *km_orig, *km_repl;
+
+ lockdep_assert_held(&keymap_lock);
+
+ km_orig = ct_pptp_info->keymap[IP_CT_DIR_ORIGINAL];
+ km_repl = ct_pptp_info->keymap[IP_CT_DIR_REPLY];
+
+ if (km_orig && km_repl) {
+ if (!gre_key_cmpfn(km_orig, orig))
+ return NF_CT_GRE_KM_BAD;
+
+ if (!gre_key_cmpfn(km_repl, repl))
+ return NF_CT_GRE_KM_BAD;
+
+ return NF_CT_GRE_KM_DUP;
+ }
+
+ DEBUG_NET_WARN_ON_ONCE(km_orig);
+ DEBUG_NET_WARN_ON_ONCE(km_repl);
+ return NF_CT_GRE_KM_NEW;
+}
+
+/* add keymap entries, associate with specified master ct */
+bool nf_ct_gre_keymap_add(struct nf_conn *ct,
+ const struct nf_conntrack_tuple *orig,
+ const struct nf_conntrack_tuple *repl)
{
struct net *net = nf_ct_net(ct);
struct nf_gre_net *net_gre = gre_pernet(net);
struct nf_ct_pptp_master *ct_pptp_info = nfct_help_data(ct);
- struct nf_ct_gre_keymap **kmp, *km;
-
- kmp = &ct_pptp_info->keymap[dir];
- if (*kmp) {
- /* check whether it's a retransmission */
- list_for_each_entry_rcu(km, &net_gre->keymap_list, list) {
- if (gre_key_cmpfn(km, t) && km == *kmp)
- return 0;
- }
- pr_debug("trying to override keymap_%s for ct %p\n",
- dir == IP_CT_DIR_REPLY ? "reply" : "orig", ct);
- return -EEXIST;
- }
+ struct nf_ct_gre_keymap *km_orig, *km_repl;
+ bool ret = false;
- km = kmalloc_obj(*km, GFP_ATOMIC);
- if (!km)
- return -ENOMEM;
- memcpy(&km->tuple, t, sizeof(*t));
- *kmp = km;
+ km_orig = kmalloc_obj(*km_orig, GFP_ATOMIC);
+ if (!km_orig)
+ return false;
+ km_repl = kmalloc_obj(*km_repl, GFP_ATOMIC);
+ if (!km_repl)
+ goto km_free;
- pr_debug("adding new entry %p: ", km);
- nf_ct_dump_tuple(&km->tuple);
+ memcpy(&km_orig->tuple, orig, sizeof(*orig));
+ memcpy(&km_repl->tuple, repl, sizeof(*repl));
spin_lock_bh(&keymap_lock);
- list_add_tail(&km->list, &net_gre->keymap_list);
+ if (nf_ct_is_dying(ct))
+ goto unlock_free;
+
+ switch (nf_ct_gre_km_acceptable(ct_pptp_info, orig, repl)) {
+ case NF_CT_GRE_KM_NEW:
+ break;
+ case NF_CT_GRE_KM_DUP:
+ ret = true;
+ goto unlock_free;
+ case NF_CT_GRE_KM_BAD:
+ pr_debug("trying to override keymap for ct %p\n", ct);
+ goto unlock_free;
+ }
+
+ if (ct_pptp_info->keymap[IP_CT_DIR_ORIGINAL] ||
+ ct_pptp_info->keymap[IP_CT_DIR_REPLY])
+ goto unlock_free;
+
+ pr_debug("adding new entries %p,%p: ", km_orig, km_repl);
+ nf_ct_dump_tuple(&km_orig->tuple);
+ nf_ct_dump_tuple(&km_repl->tuple);
+
+ list_add_tail_rcu(&km_orig->list, &net_gre->keymap_list);
+ list_add_tail_rcu(&km_repl->list, &net_gre->keymap_list);
+ ct_pptp_info->keymap[IP_CT_DIR_ORIGINAL] = km_orig;
+ ct_pptp_info->keymap[IP_CT_DIR_REPLY] = km_repl;
spin_unlock_bh(&keymap_lock);
- return 0;
+ return true;
+
+unlock_free:
+ spin_unlock_bh(&keymap_lock);
+km_free:
+ kfree(km_orig);
+ kfree(km_repl);
+ return ret;
}
EXPORT_SYMBOL_GPL(nf_ct_gre_keymap_add);
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index b67426c2189b..e99ab1e88e9f 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -1221,7 +1221,8 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct,
new_state = old_state;
}
if (((test_bit(IPS_SEEN_REPLY_BIT, &ct->status)
- && ct->proto.tcp.last_index == TCP_SYN_SET)
+ && ct->proto.tcp.last_index == TCP_SYN_SET
+ && ct->proto.tcp.last_dir != dir)
|| (!test_bit(IPS_ASSURED_BIT, &ct->status)
&& ct->proto.tcp.last_index == TCP_ACK_SET))
&& ntohl(th->ack_seq) == ct->proto.tcp.last_end) {
diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
index 57f57e2fc80a..036c8586f49b 100644
--- a/net/netfilter/nf_synproxy_core.c
+++ b/net/netfilter/nf_synproxy_core.c
@@ -200,6 +200,8 @@ synproxy_tstamp_adjust(struct sk_buff *skb, unsigned int protoff,
if (skb_ensure_writable(skb, optend))
return 0;
+ th = (struct tcphdr *)(skb->data + protoff);
+
while (optoff < optend) {
unsigned char *op = skb->data + optoff;
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 984a0eb9e149..60ab88d45096 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -1141,6 +1141,9 @@ nfqnl_mangle(void *data, unsigned int data_len, struct nf_queue_entry *e, int di
{
struct sk_buff *nskb;
+ if (e->state.net->user_ns != &init_user_ns)
+ return -EPERM;
+
if (diff < 0) {
unsigned int min_len = skb_transport_offset(e->skb);
@@ -1537,8 +1540,7 @@ static int nfqnl_recv_verdict(struct sk_buff *skb, const struct nfnl_info *info,
if (nfqnl_mangle(nla_data(nfqa[NFQA_PAYLOAD]),
payload_len, entry, diff) < 0)
verdict = NF_DROP;
-
- if (ct && diff)
+ else if (ct && diff)
nfnl_ct->seq_adjust(entry->skb, ct, ctinfo, diff);
}
diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c
index 94dccdcfa06b..785b8e9731d1 100644
--- a/net/netfilter/nft_bitwise.c
+++ b/net/netfilter/nft_bitwise.c
@@ -43,8 +43,10 @@ static void nft_bitwise_eval_lshift(u32 *dst, const u32 *src,
u32 carry = 0;
for (i = DIV_ROUND_UP(priv->len, sizeof(u32)); i > 0; i--) {
- dst[i - 1] = (src[i - 1] << shift) | carry;
- carry = src[i - 1] >> (BITS_PER_TYPE(u32) - shift);
+ u32 tmp_src = src[i - 1];
+
+ dst[i - 1] = (tmp_src << shift) | carry;
+ carry = tmp_src >> (BITS_PER_TYPE(u32) - shift);
}
}
@@ -56,8 +58,10 @@ static void nft_bitwise_eval_rshift(u32 *dst, const u32 *src,
u32 carry = 0;
for (i = 0; i < DIV_ROUND_UP(priv->len, sizeof(u32)); i++) {
- dst[i] = carry | (src[i] >> shift);
- carry = src[i] << (BITS_PER_TYPE(u32) - shift);
+ u32 tmp_src = src[i];
+
+ dst[i] = carry | (tmp_src >> shift);
+ carry = tmp_src << (BITS_PER_TYPE(u32) - shift);
}
}
@@ -235,6 +239,9 @@ static int nft_bitwise_init_bool(const struct nft_ctx *ctx,
&priv->sreg2, priv->len);
if (err < 0)
return err;
+
+ if (nft_reg_overlap(priv->sreg2, priv->dreg, priv->len))
+ return -EINVAL;
}
return 0;
@@ -265,6 +272,9 @@ static int nft_bitwise_init(const struct nft_ctx *ctx,
if (err < 0)
return err;
+ if (nft_reg_overlap(priv->sreg, priv->dreg, priv->len))
+ return -EINVAL;
+
if (tb[NFTA_BITWISE_OP]) {
priv->op = ntohl(nla_get_be32(tb[NFTA_BITWISE_OP]));
switch (priv->op) {
diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c
index e00dddfa2fc0..2316c77f4228 100644
--- a/net/netfilter/nft_byteorder.c
+++ b/net/netfilter/nft_byteorder.c
@@ -144,9 +144,16 @@ static int nft_byteorder_init(const struct nft_ctx *ctx,
if (err < 0)
return err;
- return nft_parse_register_store(ctx, tb[NFTA_BYTEORDER_DREG],
- &priv->dreg, NULL, NFT_DATA_VALUE,
- priv->len);
+ err = nft_parse_register_store(ctx, tb[NFTA_BYTEORDER_DREG],
+ &priv->dreg, NULL, NFT_DATA_VALUE,
+ priv->len);
+ if (err < 0)
+ return err;
+
+ if (nft_reg_overlap(priv->sreg, priv->dreg, priv->len))
+ return -EINVAL;
+
+ return 0;
}
static int nft_byteorder_dump(struct sk_buff *skb,
diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
index 01e13e5255a9..484a5490832e 100644
--- a/net/netfilter/nft_payload.c
+++ b/net/netfilter/nft_payload.c
@@ -917,6 +917,9 @@ static int nft_payload_set_init(const struct nft_ctx *ctx,
struct nft_payload_set *priv = nft_expr_priv(expr);
int err;
+ if (ctx->net->user_ns != &init_user_ns)
+ return -EPERM;
+
priv->base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE]));
priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN]));
diff --git a/net/netfilter/xt_cpu.c b/net/netfilter/xt_cpu.c
index 3bdc302a0f91..9cb259902a58 100644
--- a/net/netfilter/xt_cpu.c
+++ b/net/netfilter/xt_cpu.c
@@ -34,7 +34,7 @@ static bool cpu_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_cpu_info *info = par->matchinfo;
- return (info->cpu == smp_processor_id()) ^ info->invert;
+ return (info->cpu == raw_smp_processor_id()) ^ info->invert;
}
static struct xt_match cpu_mt_reg __read_mostly = {
diff --git a/tools/testing/selftests/net/netfilter/Makefile b/tools/testing/selftests/net/netfilter/Makefile
index ee2d1a5254f8..d953ee218c0f 100644
--- a/tools/testing/selftests/net/netfilter/Makefile
+++ b/tools/testing/selftests/net/netfilter/Makefile
@@ -26,6 +26,7 @@ TEST_PROGS := \
nft_concat_range.sh \
nft_conntrack_helper.sh \
nft_fib.sh \
+ nft_fib_nexthop.sh \
nft_flowtable.sh \
nft_interface_stress.sh \
nft_meta.sh \
diff --git a/tools/testing/selftests/net/netfilter/nft_fib_nexthop.sh b/tools/testing/selftests/net/netfilter/nft_fib_nexthop.sh
new file mode 100755
index 000000000000..c4f203057382
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/nft_fib_nexthop.sh
@@ -0,0 +1,152 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# shellcheck disable=SC2154
+#
+# Exercise nft_fib6_eval()'s sibling/nh enumeration on three route shapes:
+# 1) route via a single external nexthop (nhid)
+# 2) route via an external nexthop group (nhid -> group, two members)
+# 3) route via old-style multipath (nexthop ... nexthop ...)
+#
+# In each scenario the route's nexthop set contains veth0 (the iif of the
+# test packet). nft_fib6_info_nh_uses_dev() must walk the set and report
+# veth0 as a valid oif. For (2) and (3) the matching nexthop is the second
+# member, so the walk has to traverse beyond the primary nh.
+#
+# After sending $PKTS ICMPv6 echo requests from ns1, check two counters on
+# nsrouter:
+# nf_ok -- `fib daddr . iif oif eq "veth0"` must equal $PKTS
+# nf_bad -- `fib daddr . iif oif missing` must stay at 0
+# Both rules also match on iif veth0 and ip6 daddr dead:dead::/64 so that
+# kernel-generated ND/MLD/RA traffic cannot pollute the counters.
+#
+# Topology similar to nft_fib.sh, without ns2; two dummy interfaces on
+# nsrouter host extra nh devices:
+#
+# dead:1::99 dead:1::1
+# ns1 <----veth----> nsrouter --- dummy0 dead:2::1
+# \-- dummy1 dead:9::1
+
+source lib.sh
+
+ret=0
+PKTS=3
+
+checktool "nft --version" "run test without nft"
+checktool "ip -V" "run test without iproute2"
+
+setup_ns nsrouter ns1
+trap cleanup_all_ns EXIT
+
+if ! ip link add veth0 netns "$nsrouter" type veth peer name eth0 netns "$ns1" \
+ > /dev/null 2>&1; then
+ echo "SKIP: No virtual ethernet pair device support in kernel"
+ exit $ksft_skip
+fi
+
+ip -net "$ns1" link set lo up
+ip -net "$ns1" link set eth0 up
+ip -net "$ns1" -6 addr add dead:1::99/64 dev eth0 nodad
+ip -net "$ns1" -6 route add default via dead:1::1
+
+ip -net "$nsrouter" link set lo up
+ip -net "$nsrouter" link set veth0 up
+ip -net "$nsrouter" -6 addr add dead:1::1/64 dev veth0 nodad
+
+if ! ip -net "$nsrouter" link add dummy0 type dummy 2>/dev/null; then
+ echo "SKIP: dummy netdev not available"
+ exit $ksft_skip
+fi
+ip -net "$nsrouter" link set dummy0 up
+ip -net "$nsrouter" -6 addr add dead:2::1/64 dev dummy0 nodad
+
+ip -net "$nsrouter" link add dummy1 type dummy
+ip -net "$nsrouter" link set dummy1 up
+ip -net "$nsrouter" -6 addr add dead:9::1/64 dev dummy1 nodad
+
+ip netns exec "$nsrouter" sysctl -q net.ipv6.conf.all.forwarding=1
+
+load_fib_rule() {
+ # filter on iif + daddr so the counters only see our test packets
+ ip netns exec "$nsrouter" nft -f /dev/stdin <<EOF
+flush ruleset
+table ip6 t {
+ counter nf_ok { }
+ counter nf_bad { }
+ chain c {
+ type filter hook prerouting priority 0; policy accept;
+ iif "veth0" ip6 daddr dead:dead::/64 fib daddr . iif oif eq "veth0" counter name nf_ok
+ iif "veth0" ip6 daddr dead:dead::/64 fib daddr . iif oif missing counter name nf_bad
+ }
+}
+EOF
+}
+
+bad_counter() {
+ local counter=$1
+ local expect=$2
+ local tag=$3
+
+ echo "FAIL ($tag): counter $counter has unexpected value (expected \"$expect\")" 1>&2
+ ip netns exec "$nsrouter" nft list counter ip6 t "$counter" 1>&2
+}
+
+run_scenario() {
+ local what="$1"; shift
+ # counter output format is "packets PACKET_NUM bytes BYTES_NUM";
+ # we only care about the packet count
+ local expect_ok="packets $PKTS bytes"
+ local expect_bad="packets 0 bytes"
+ local lret=0
+
+ # reset route + nexthop state between scenarios
+ ip -net "$nsrouter" -6 route del dead:dead::/64 > /dev/null 2>&1 || true
+ ip -net "$nsrouter" nexthop flush > /dev/null 2>&1 || true
+
+ # run the scenario function passed by the caller
+ "$@" || echo "WARN ($what): scenario setup returned non-zero"
+
+ load_fib_rule || { echo "FAIL ($what): nft load"; ret=1; return; }
+
+ # ping a daddr inside dead:dead::/64 so fib has to walk the nh set
+ ip netns exec "$ns1" ping -6 -c "$PKTS" -i 0.1 -W 1 dead:dead::1 \
+ > /dev/null 2>&1 || true
+
+ # verify the packets went through the expected fib path
+ if ! ip netns exec "$nsrouter" nft list counter ip6 t nf_ok | grep -q "$expect_ok"; then
+ bad_counter nf_ok "$expect_ok" "$what"
+ lret=1
+ fi
+ if ! ip netns exec "$nsrouter" nft list counter ip6 t nf_bad | grep -q "$expect_bad"; then
+ bad_counter nf_bad "$expect_bad" "$what"
+ lret=1
+ fi
+
+ if [ $lret -eq 0 ]; then
+ echo "PASS: $what"
+ else
+ ret=1
+ fi
+}
+
+scenario_single_nh() {
+ ip -net "$nsrouter" nexthop add id 1 via dead:1::99 dev veth0
+ ip -net "$nsrouter" -6 route add dead:dead::/64 nhid 1
+}
+run_scenario "single external nexthop (nhid -> veth0)" scenario_single_nh
+
+scenario_nh_group() {
+ ip -net "$nsrouter" nexthop add id 1 via dead:2::2 dev dummy0
+ ip -net "$nsrouter" nexthop add id 2 via dead:1::99 dev veth0
+ ip -net "$nsrouter" nexthop add id 100 group 1/2
+ ip -net "$nsrouter" -6 route add dead:dead::/64 nhid 100
+}
+run_scenario "nexthop group (dummy0 + veth0)" scenario_nh_group
+
+scenario_old_multipath() {
+ ip -net "$nsrouter" -6 route add dead:dead::/64 \
+ nexthop via dead:2::2 dev dummy0 \
+ nexthop via dead:1::99 dev veth0
+}
+run_scenario "old-style multipath (sibling on veth0)" scenario_old_multipath
+
+exit $ret