From 134121bfd99a06d44ef5ba15a9beb075297c0821 Mon Sep 17 00:00:00 2001 From: Slavin Liu Date: Fri, 12 Sep 2025 01:57:59 +0800 Subject: ipvs: Defer ip_vs_ftp unregister during netns cleanup On the netns cleanup path, __ip_vs_ftp_exit() may unregister ip_vs_ftp before connections with valid cp->app pointers are flushed, leading to a use-after-free. Fix this by introducing a global `exiting_module` flag, set to true in ip_vs_ftp_exit() before unregistering the pernet subsystem. In __ip_vs_ftp_exit(), skip ip_vs_ftp unregister if called during netns cleanup (when exiting_module is false) and defer it to __ip_vs_cleanup_batch(), which unregisters all apps after all connections are flushed. If called during module exit, unregister ip_vs_ftp immediately. Fixes: 61b1ab4583e2 ("IPVS: netns, add basic init per netns.") Suggested-by: Julian Anastasov Signed-off-by: Slavin Liu Signed-off-by: Julian Anastasov Signed-off-by: Florian Westphal --- net/netfilter/ipvs/ip_vs_ftp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index d8a284999544..206c6700e200 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -53,6 +53,7 @@ enum { IP_VS_FTP_EPSV, }; +static bool exiting_module; /* * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper * First port is set to the default port. @@ -605,7 +606,7 @@ static void __ip_vs_ftp_exit(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); - if (!ipvs) + if (!ipvs || !exiting_module) return; unregister_ip_vs_app(ipvs, &ip_vs_ftp); @@ -627,6 +628,7 @@ static int __init ip_vs_ftp_init(void) */ static void __exit ip_vs_ftp_exit(void) { + exiting_module = true; unregister_pernet_subsys(&ip_vs_ftp_ops); /* rcu_barrier() is called by netns */ } -- cgit v1.2.3 From 09efbac953f6f076a07735f9ba885148d4796235 Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Fri, 19 Sep 2025 14:40:43 +0200 Subject: netfilter: nfnetlink: reset nlh pointer during batch replay During a batch replay, the nlh pointer is not reset until the parsing of the commands. Since commit bf2ac490d28c ("netfilter: nfnetlink: Handle ACK flags for batch messages") that is problematic as the condition to add an ACK for batch begin will evaluate to true even if NLM_F_ACK wasn't used for batch begin message. If there is an error during the command processing, netlink is sending an ACK despite that. This misleads userspace tools which think that the return code was 0. Reset the nlh pointer to the original one when a replay is triggered. Fixes: bf2ac490d28c ("netfilter: nfnetlink: Handle ACK flags for batch messages") Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Florian Westphal --- net/netfilter/nfnetlink.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index e598a2a252b0..811d02b4c4f7 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -376,6 +376,7 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh, const struct nfnetlink_subsystem *ss; const struct nfnl_callback *nc; struct netlink_ext_ack extack; + struct nlmsghdr *onlh = nlh; LIST_HEAD(err_list); u32 status; int err; @@ -386,6 +387,7 @@ replay: status = 0; replay_abort: skb = netlink_skb_clone(oskb, GFP_KERNEL); + nlh = onlh; if (!skb) return netlink_ack(oskb, nlh, -ENOMEM, NULL); -- cgit v1.2.3 From 4dbac7db17f1e973fbbd6aea07a00181cd4cd162 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 16 Sep 2025 18:34:01 +0200 Subject: netfilter: nft_set_pipapo: use 0 genmask for packetpath lookups In commit c4eaca2e1052 ("netfilter: nft_set_pipapo: don't check genbit from packetpath lookups") I replaced genmask_cur() with NFT_GENMASK_ANY, but this change has no effect in the pipapo set type. New entries are unreachable from the active copy, so NFT_GENMASK_ANY has same result as genmask_cur(): current-gen elements are disabled and the new-generation elements cannot be found. Tests did not catch this incomplete fix because the change also dropped the genmask test from the AVX2 version of the algorithm, so test only fails if host cpu lacks AVX2 support. Use genmask test only from the control plane (inserts, deletions, ..). Packet path has to skip the check, use of 0 is enough for this because ext->genmask has a the relevant bit set when the element is INACTIVE in that generation: using a 0 genmask thus makes nft_set_elem_active() always return true. Fix the comment and replace NFT_GENMASK_ANY with 0. Fixes: c4eaca2e1052 ("netfilter: nft_set_pipapo: don't check genbit from packetpath lookups") Signed-off-by: Florian Westphal --- net/netfilter/nft_set_pipapo.c | 9 ++++----- net/netfilter/nft_set_pipapo_avx2.c | 2 +- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index a7b8fa8cab7c..112fe46788b6 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -549,8 +549,7 @@ static struct nft_pipapo_elem *pipapo_get(const struct nft_pipapo_match *m, * * This function is called from the data path. It will search for * an element matching the given key in the current active copy. - * Unlike other set types, this uses NFT_GENMASK_ANY instead of - * nft_genmask_cur(). + * Unlike other set types, this uses 0 instead of nft_genmask_cur(). * * This is because new (future) elements are not reachable from * priv->match, they get added to priv->clone instead. @@ -560,8 +559,8 @@ static struct nft_pipapo_elem *pipapo_get(const struct nft_pipapo_match *m, * inconsistent state: matching old entries get skipped but thew * newly matching entries are unreachable. * - * GENMASK will still find the 'now old' entries which ensures consistent - * priv->match view. + * GENMASK_ANY doesn't work for the same reason: old-gen entries get + * skipped, new-gen entries are only reachable from priv->clone. * * nft_pipapo_commit swaps ->clone and ->match shortly after the * genbit flip. As ->clone doesn't contain the old entries in the first @@ -578,7 +577,7 @@ nft_pipapo_lookup(const struct net *net, const struct nft_set *set, const struct nft_pipapo_elem *e; m = rcu_dereference(priv->match); - e = pipapo_get_slow(m, (const u8 *)key, NFT_GENMASK_ANY, get_jiffies_64()); + e = pipapo_get_slow(m, (const u8 *)key, 0, get_jiffies_64()); return e ? &e->ext : NULL; } diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c index 27dab3667548..e72fd045d037 100644 --- a/net/netfilter/nft_set_pipapo_avx2.c +++ b/net/netfilter/nft_set_pipapo_avx2.c @@ -1292,7 +1292,7 @@ nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, m = rcu_dereference(priv->match); - e = pipapo_get_avx2(m, rp, NFT_GENMASK_ANY, get_jiffies_64()); + e = pipapo_get_avx2(m, rp, 0, get_jiffies_64()); local_bh_enable(); return e ? &e->ext : NULL; -- cgit v1.2.3 From 5823699a11cf3c05d751b473c66920d2d3cac0a5 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 9 Sep 2025 14:11:48 +0200 Subject: netfilter: nft_set_pipapo_avx2: fix skip of expired entries KASAN reports following splat: BUG: KASAN: slab-out-of-bounds in pipapo_get_avx2+0x941/0x25d0 Read of size 1 at addr ffff88814c561be0 by task nft/3944 Call Trace: pipapo_get_avx2+0x941/0x25d0 nft_pipapo_insert+0x440/0x11b0 nf_tables_newsetelem+0x220a/0x3a00 .. This bisects to commit 84c1da7b38d9 ("netfilter: nft_set_pipapo: use AVX2 algorithm for insertions too"). However, that change merely uncovers this bug. When we find a match but that match has expired or timed out, the AVX2 implementation restarts the full match loop. At that point, the pointer to the key data has already been changed and points to the keys last field. This will then result in out-of-bounds read once its incremented again for the next field. The restart logic in AVX2 is different compared to the plain C implementation, but both should follow the same logic. The C implementation just calls pipapo_refill() again do check the next entry. Do the same in the AVX2 implementation. Note that with this change, due to implementation differences of pipapo_refill vs. nft_pipapo_avx2_refill, the refill call will return the same element again. Then, on the next call, it will move to the next entry as expected. This is because avx2_refill doesn't clear the bitmap in the 'last' conditional. This is harmless. Expired/timed out elements are also not expected to be frequent. selftest is added in a followup commit. Fixes: 7400b063969b ("nft_set_pipapo: Introduce AVX2-based lookup implementation") Reviewed-by: Stefano Brivio Signed-off-by: Florian Westphal --- net/netfilter/nft_set_pipapo_avx2.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c index e72fd045d037..7ff90325c97f 100644 --- a/net/netfilter/nft_set_pipapo_avx2.c +++ b/net/netfilter/nft_set_pipapo_avx2.c @@ -1179,7 +1179,6 @@ struct nft_pipapo_elem *pipapo_get_avx2(const struct nft_pipapo_match *m, nft_pipapo_avx2_prepare(); -next_match: nft_pipapo_for_each_field(f, i, m) { bool last = i == m->field_count - 1, first = !i; int ret = 0; @@ -1226,6 +1225,7 @@ next_match: #undef NFT_SET_PIPAPO_AVX2_LOOKUP +next_match: if (ret < 0) { scratch->map_index = map_index; kernel_fpu_end(); @@ -1238,8 +1238,11 @@ next_match: e = f->mt[ret].e; if (unlikely(__nft_set_elem_expired(&e->ext, tstamp) || - !nft_set_elem_active(&e->ext, genmask))) + !nft_set_elem_active(&e->ext, genmask))) { + ret = pipapo_refill(res, f->bsize, f->rules, + fill, f->mt, last); goto next_match; + } scratch->map_index = map_index; kernel_fpu_end(); -- cgit v1.2.3 From 94bd247bc25b7f1560f96e9c912db3ec1fc878ea Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 10 Sep 2025 01:39:48 +0200 Subject: selftests: netfilter: nft_concat_range.sh: add check for double-create bug Add a test case for bug resolved with: 'netfilter: nft_set_pipapo_avx2: fix skip of expired entries'. It passes on nf.git (it uses the generic/C version for insertion duplicate check) but fails on unpatched nf-next if AVX2 is supported: cannot create same element twice 0s [FAIL] Could create element twice in same transaction table inet filter { # handle 8 [..] elements = { 1.2.3.4 . 1.2.4.1 counter packets 0 bytes 0, 1.2.4.1 . 1.2.3.4 counter packets 0 bytes 0, 1.2.3.4 . 1.2.4.1 counter packets 0 bytes 0, 1.2.4.1 . 1.2.3.4 counter packets 0 bytes 0 } Reviewed-by: Stefano Brivio Signed-off-by: Florian Westphal --- .../selftests/net/netfilter/nft_concat_range.sh | 56 +++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/netfilter/nft_concat_range.sh b/tools/testing/selftests/net/netfilter/nft_concat_range.sh index 20e76b395c85..ad97c6227f35 100755 --- a/tools/testing/selftests/net/netfilter/nft_concat_range.sh +++ b/tools/testing/selftests/net/netfilter/nft_concat_range.sh @@ -29,7 +29,7 @@ TYPES="net_port port_net net6_port port_proto net6_port_mac net6_port_mac_proto net6_port_net6_port net_port_mac_proto_net" # Reported bugs, also described by TYPE_ variables below -BUGS="flush_remove_add reload net_port_proto_match avx2_mismatch" +BUGS="flush_remove_add reload net_port_proto_match avx2_mismatch doublecreate" # List of possible paths to pktgen script from kernel tree for performance tests PKTGEN_SCRIPT_PATHS=" @@ -408,6 +408,18 @@ perf_duration 0 " +TYPE_doublecreate=" +display cannot create same element twice +type_spec ipv4_addr . ipv4_addr +chain_spec ip saddr . ip daddr +dst addr4 +proto icmp + +race_repeat 0 + +perf_duration 0 +" + # Set template for all tests, types and rules are filled in depending on test set_template=' flush ruleset @@ -1900,6 +1912,48 @@ test_bug_avx2_mismatch() fi } +test_bug_doublecreate() +{ + local elements="1.2.3.4 . 1.2.4.1, 1.2.4.1 . 1.2.3.4" + local ret=1 + local i + + setup veth send_"${proto}" set || return ${ksft_skip} + + add "{ $elements }" || return 1 + # expected to work: 'add' on existing should be no-op. + add "{ $elements }" || return 1 + + # 'create' should return an error. + if nft create element inet filter test "{ $elements }" 2>/dev/null; then + err "Could create an existing element" + return 1 + fi +nft -f - </dev/null +flush set inet filter test +create element inet filter test { $elements } +create element inet filter test { $elements } +EOF + ret=$? + if [ $ret -eq 0 ]; then + err "Could create element twice in one transaction" + err "$(nft -a list ruleset)" + return 1 + fi + +nft -f - </dev/null +flush set inet filter test +create element inet filter test { $elements } +EOF + ret=$? + if [ $ret -ne 0 ]; then + err "Could not flush and re-create element in one transaction" + return 1 + fi + + return 0 +} + test_reported_issues() { eval test_bug_"${subtest}" } -- cgit v1.2.3 From c5ba345b2d358b07cc4f07253ba1ada73e77d586 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 24 Sep 2025 07:27:09 +0000 Subject: netfilter: nf_conntrack: do not skip entries in /proc/net/nf_conntrack ct_seq_show() has an opportunistic garbage collector : if (nf_ct_should_gc(ct)) { nf_ct_kill(ct); goto release; } So if one nf_conn is killed there, next time ct_get_next() runs, we skip the following item in the bucket, even if it should have been displayed if gc did not take place. We can decrement st->skip_elems to tell ct_get_next() one of the items was removed from the chain. Fixes: 58e207e4983d ("netfilter: evict stale entries when user reads /proc/net/nf_conntrack") Signed-off-by: Eric Dumazet Signed-off-by: Florian Westphal --- net/netfilter/nf_conntrack_standalone.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 1f14ef0436c6..708b79380f04 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -317,6 +317,9 @@ static int ct_seq_show(struct seq_file *s, void *v) smp_acquire__after_ctrl_dep(); if (nf_ct_should_gc(ct)) { + struct ct_iter_state *st = s->private; + + st->skip_elems--; nf_ct_kill(ct); goto release; } -- cgit v1.2.3