From ddf8dec6db31df3eafb00b6a22864067fd49c5a7 Mon Sep 17 00:00:00 2001
From: Denis Kirjanov
Date: Sun, 26 Jan 2025 08:19:24 -0500
Subject: netfilter: xt_hashlimit: replace vmalloc calls with kvmalloc

Replace vmalloc allocations with kvmalloc since kvmalloc is more
flexible in memory allocation.

Signed-off-by: Denis Kirjanov
Reviewed-by: Florian Westphal
Signed-off-by: Pablo Neira Ayuso
---
 net/netfilter/xt_hashlimit.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index fa02aab56724..3b507694e81e 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -15,7 +15,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
@@ -294,8 +293,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg,
         if (size < 16)
             size = 16;
     }
-    /* FIXME: don't use vmalloc() here or anywhere else -HW */
-    hinfo = vmalloc(struct_size(hinfo, hash, size));
+    hinfo = kvmalloc(struct_size(hinfo, hash, size), GFP_KERNEL);
     if (hinfo == NULL)
         return -ENOMEM;
     *out_hinfo = hinfo;
@@ -303,7 +301,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg,
     /* copy match config into hashtable config */
     ret = cfg_copy(&hinfo->cfg, (void *)cfg, 3);
     if (ret) {
-        vfree(hinfo);
+        kvfree(hinfo);
         return ret;
     }
 
@@ -322,7 +320,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg,
     hinfo->rnd_initialized = false;
     hinfo->name = kstrdup(name, GFP_KERNEL);
     if (!hinfo->name) {
-        vfree(hinfo);
+        kvfree(hinfo);
         return -ENOMEM;
     }
     spin_lock_init(&hinfo->lock);
@@ -344,7 +342,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg,
                       ops, hinfo);
     if (hinfo->pde == NULL) {
         kfree(hinfo->name);
-        vfree(hinfo);
+        kvfree(hinfo);
         return -ENOMEM;
     }
     hinfo->net = net;
@@ -433,7 +431,7 @@ static void htable_put(struct xt_hashlimit_htable *hinfo)
         cancel_delayed_work_sync(&hinfo->gc_work);
         htable_selective_cleanup(hinfo, true);
         kfree(hinfo->name);
-        vfree(hinfo);
+        kvfree(hinfo);
     }
 }
--
cgit v1.2.3

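As a point of reference for the conversion above, here is a minimal sketch of the kvmalloc()/kvfree() pattern. It is not taken from the patch and the struct and helper names are hypothetical: kvmalloc() first attempts a kmalloc() and falls back to vmalloc() when the request is large or physically contiguous memory is unavailable, and kvfree() releases memory obtained from either allocator.

```
/* Illustrative sketch only -- hypothetical names, not from xt_hashlimit. */
#include <linux/overflow.h>
#include <linux/slab.h>
#include <linux/types.h>

struct demo_htable {
        unsigned int size;
        struct hlist_head hash[];
};

static struct demo_htable *demo_htable_alloc(unsigned int size)
{
        struct demo_htable *ht;

        /* Small tables come from kmalloc(); big ones fall back to vmalloc(). */
        ht = kvmalloc(struct_size(ht, hash, size), GFP_KERNEL);
        if (!ht)
                return NULL;

        ht->size = size;
        return ht;
}

static void demo_htable_free(struct demo_htable *ht)
{
        kvfree(ht); /* correct for both kmalloc()- and vmalloc()-backed memory */
}
```
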
From 8b6861390ffee6b8ed78b9395e3776c16fec6579 Mon Sep 17 00:00:00 2001
From: Nicolas Bouchinet
Date: Wed, 29 Jan 2025 18:06:30 +0100
Subject: netfilter: conntrack: Bound nf_conntrack sysctl writes

The nf_conntrack_max and nf_conntrack_expect_max sysctls could be
written any negative value, which was then stored in the unsigned int
variables nf_conntrack_max and nf_ct_expect_max. While the
do_proc_dointvec_conv function is supposed to limit writes handled by
the proc_dointvec proc_handler to INT_MAX, a negative value written
into an unsigned int becomes a very large value that exceeds this
limit. Moreover, the nf_conntrack_expect_max sysctl documentation
specifies that the minimum value is 1.

The proc_handlers have thus been updated to proc_dointvec_minmax in
order to enforce the following write bounds:

* Bound nf_conntrack_max sysctl writes between SYSCTL_ZERO and
  SYSCTL_INT_MAX.
* Bound nf_conntrack_expect_max sysctl writes between SYSCTL_ONE and
  SYSCTL_INT_MAX, as defined in the sysctl documentation.

With this patch applied, sysctl writes outside the defined bounds will
thus lead to a write error:

```
sysctl -w net.netfilter.nf_conntrack_expect_max=-1
sysctl: setting key "net.netfilter.nf_conntrack_expect_max": Invalid argument
```

Signed-off-by: Nicolas Bouchinet
Signed-off-by: Pablo Neira Ayuso
---
 net/netfilter/nf_conntrack_standalone.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 502cf10aab41..2f666751c7e7 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -618,7 +618,9 @@ static struct ctl_table nf_ct_sysctl_table[] = {
         .data = &nf_conntrack_max,
         .maxlen = sizeof(int),
         .mode = 0644,
-        .proc_handler = proc_dointvec,
+        .proc_handler = proc_dointvec_minmax,
+        .extra1 = SYSCTL_ZERO,
+        .extra2 = SYSCTL_INT_MAX,
     },
     [NF_SYSCTL_CT_COUNT] = {
         .procname = "nf_conntrack_count",
@@ -654,7 +656,9 @@ static struct ctl_table nf_ct_sysctl_table[] = {
         .data = &nf_ct_expect_max,
         .maxlen = sizeof(int),
         .mode = 0644,
-        .proc_handler = proc_dointvec,
+        .proc_handler = proc_dointvec_minmax,
+        .extra1 = SYSCTL_ONE,
+        .extra2 = SYSCTL_INT_MAX,
     },
     [NF_SYSCTL_CT_ACCT] = {
         .procname = "nf_conntrack_acct",
@@ -947,7 +951,9 @@ static struct ctl_table nf_ct_netfilter_table[] = {
         .data = &nf_conntrack_max,
         .maxlen = sizeof(int),
         .mode = 0644,
-        .proc_handler = proc_dointvec,
+        .proc_handler = proc_dointvec_minmax,
+        .extra1 = SYSCTL_ZERO,
+        .extra2 = SYSCTL_INT_MAX,
     },
 };
--
cgit v1.2.3

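To make the failure mode above concrete, here is a small standalone sketch (not kernel code, purely illustrative): assigning a negative int to an unsigned int converts it modulo 2^32, which is how a write of -1 previously turned into an enormous conntrack limit despite the intended INT_MAX cap.

```
#include <stdio.h>

int main(void)
{
        unsigned int conntrack_max_demo;    /* stand-in for the unsigned sysctl variable */
        int written = -1;                   /* value the old proc_dointvec handler accepted */

        conntrack_max_demo = written;       /* converts modulo 2^32 */
        printf("%u\n", conntrack_max_demo); /* prints 4294967295 with a 32-bit unsigned int */
        return 0;
}
```
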
From eaaff9b6702e99be5d79135f2afa9fc48a0d59e0 Mon Sep 17 00:00:00 2001
From: Florian Westphal
Date: Thu, 20 Feb 2025 14:07:01 +0100
Subject: netfilter: fib: avoid lookup if socket is available

In case the fib match is used from the input hook, we can avoid the fib
lookup if early demux already assigned a socket for us: check that the
input interface matches the one cached in the socket.

Rework the existing 'lo bypass' logic to first check the socket, then
the loopback interface type, to elide the fib lookup.

This speeds up fib matching a little. Before:

93.08 GBit/s (no rules at all)
75.1  GBit/s ("fib saddr . iif oif missing drop" in prerouting)
75.62 GBit/s ("fib saddr . iif oif missing drop" in input)

After:

92.48 GBit/s (no rules at all)
75.62 GBit/s (fib rule in prerouting)
90.37 GBit/s (fib rule in input)

Numbers for the 'no rules' and 'prerouting' cases are expected to
closely match between runs; the third (input) test case exercises the
'avoid lookup if the cached ifindex in sk matches' path.

The test used iperf3 over a veth interface; lo can't be used due to the
existing loopback test.

Signed-off-by: Florian Westphal
Signed-off-by: Pablo Neira Ayuso
---
 include/net/netfilter/nft_fib.h   | 21 +++++++++++++++++++++
 net/ipv4/netfilter/nft_fib_ipv4.c | 11 +++++------
 net/ipv6/netfilter/nft_fib_ipv6.c | 19 ++++++++++---------
 3 files changed, 36 insertions(+), 15 deletions(-)

diff --git a/include/net/netfilter/nft_fib.h b/include/net/netfilter/nft_fib.h
index 38cae7113de4..6e202ed5e63f 100644
--- a/include/net/netfilter/nft_fib.h
+++ b/include/net/netfilter/nft_fib.h
@@ -18,6 +18,27 @@ nft_fib_is_loopback(const struct sk_buff *skb, const struct net_device *in)
     return skb->pkt_type == PACKET_LOOPBACK || in->flags & IFF_LOOPBACK;
 }
 
+static inline bool nft_fib_can_skip(const struct nft_pktinfo *pkt)
+{
+    const struct net_device *indev = nft_in(pkt);
+    const struct sock *sk;
+
+    switch (nft_hook(pkt)) {
+    case NF_INET_PRE_ROUTING:
+    case NF_INET_INGRESS:
+    case NF_INET_LOCAL_IN:
+        break;
+    default:
+        return false;
+    }
+
+    sk = pkt->skb->sk;
+    if (sk && sk_fullsock(sk))
+        return sk->sk_rx_dst_ifindex == indev->ifindex;
+
+    return nft_fib_is_loopback(pkt->skb, indev);
+}
+
 int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset);
 int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
          const struct nlattr * const tb[]);
diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c
index 625adbc42037..9082ca17e845 100644
--- a/net/ipv4/netfilter/nft_fib_ipv4.c
+++ b/net/ipv4/netfilter/nft_fib_ipv4.c
@@ -71,6 +71,11 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,
     const struct net_device *oif;
     const struct net_device *found;
 
+    if (nft_fib_can_skip(pkt)) {
+        nft_fib_store_result(dest, priv, nft_in(pkt));
+        return;
+    }
+
     /*
      * Do not set flowi4_oif, it restricts results (for example, asking
      * for oif 3 will get RTN_UNICAST result even if the daddr exits
@@ -85,12 +90,6 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,
     else
         oif = NULL;
 
-    if (nft_hook(pkt) == NF_INET_PRE_ROUTING &&
-        nft_fib_is_loopback(pkt->skb, nft_in(pkt))) {
-        nft_fib_store_result(dest, priv, nft_in(pkt));
-        return;
-    }
-
     iph = skb_header_pointer(pkt->skb, noff, sizeof(_iph), &_iph);
     if (!iph) {
         regs->verdict.code = NFT_BREAK;
diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c
index c9f1634b3838..7fd9d7b21cd4 100644
--- a/net/ipv6/netfilter/nft_fib_ipv6.c
+++ b/net/ipv6/netfilter/nft_fib_ipv6.c
@@ -170,6 +170,11 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs,
     struct rt6_info *rt;
     int lookup_flags;
 
+    if (nft_fib_can_skip(pkt)) {
+        nft_fib_store_result(dest, priv, nft_in(pkt));
+        return;
+    }
+
     if (priv->flags & NFTA_FIB_F_IIF)
         oif = nft_in(pkt);
     else if (priv->flags & NFTA_FIB_F_OIF)
@@ -181,17 +186,13 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs,
         return;
     }
 
-    lookup_flags = nft_fib6_flowi_init(&fl6, priv, pkt, oif, iph);
-
-    if (nft_hook(pkt) == NF_INET_PRE_ROUTING ||
-        nft_hook(pkt) == NF_INET_INGRESS) {
-        if (nft_fib_is_loopback(pkt->skb, nft_in(pkt)) ||
-            nft_fib_v6_skip_icmpv6(pkt->skb, pkt->tprot, iph)) {
-            nft_fib_store_result(dest, priv, nft_in(pkt));
-            return;
-        }
+    if (nft_fib_v6_skip_icmpv6(pkt->skb, pkt->tprot, iph)) {
+        nft_fib_store_result(dest, priv, nft_in(pkt));
+        return;
     }
 
+    lookup_flags = nft_fib6_flowi_init(&fl6, priv, pkt, oif, iph);
+
     *dest = 0;
     rt = (void *)ip6_route_lookup(nft_net(pkt), &fl6, pkt->skb,
                       lookup_flags);
--
cgit v1.2.3

From 778b09d91baafb13408470c721d034d6515cfa5a Mon Sep 17 00:00:00 2001
From: Chenyuan Yang
Date: Thu, 13 Mar 2025 14:54:41 -0500
Subject: netfilter: nfnetlink_queue: Initialize ctx to avoid memory allocation error

It is possible that ctx in nfqnl_build_packet_message() is used before
it is properly initialized, since it is only initialized by
nfqnl_get_sk_secctx().

This patch corrects the problem by initializing ctx to a safe value at
its declaration. This is similar to commit 35fcac7a7c25 ("audit:
Initialize lsmctx to avoid memory allocation error").

Fixes: 2d470c778120 ("lsm: replace context+len with lsm_context")
Signed-off-by: Chenyuan Yang
Signed-off-by: Pablo Neira Ayuso
---
 net/netfilter/nfnetlink_queue.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 5c913987901a..8b7b39d8a109 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -567,7 +567,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
     enum ip_conntrack_info ctinfo = 0;
     const struct nfnl_ct_hook *nfnl_ct;
     bool csum_verify;
-    struct lsm_context ctx;
+    struct lsm_context ctx = { NULL, 0, 0 };
     int seclen = 0;
     ktime_t tstamp;
--
cgit v1.2.3

From 3b4aff61ca5dd696856d2db98e9268d94504eff3 Mon Sep 17 00:00:00 2001
From: Thorsten Blum
Date: Tue, 18 Mar 2025 19:55:19 +0100
Subject: netfilter: xtables: Use strscpy() instead of strscpy_pad()

kzalloc() already zero-initializes the destination buffer, making
strscpy() sufficient for safely copying the name. The additional
NUL-padding performed by strscpy_pad() is unnecessary.

The size parameter is optional, and strscpy() automatically determines
the size of the destination buffer using sizeof() if the argument is
omitted. This makes the explicit sizeof() call unnecessary; remove it.

No functional changes intended.

Signed-off-by: Thorsten Blum
Reviewed-by: Florian Westphal
Signed-off-by: Pablo Neira Ayuso
---
 net/netfilter/xt_repldata.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/xt_repldata.h b/net/netfilter/xt_repldata.h
index 5d1fb7018dba..600060ca940a 100644
--- a/net/netfilter/xt_repldata.h
+++ b/net/netfilter/xt_repldata.h
@@ -29,7 +29,7 @@
     if (tbl == NULL) \
         return NULL; \
     term = (struct type##_error *)&(((char *)tbl)[term_offset]); \
-    strscpy_pad(tbl->repl.name, info->name, sizeof(tbl->repl.name)); \
+    strscpy(tbl->repl.name, info->name); \
     *term = (struct type##_error)typ2##_ERROR_INIT; \
     tbl->repl.valid_hooks = hook_mask; \
     tbl->repl.num_entries = nhooks + 1; \
--
cgit v1.2.3

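A small sketch of the two-argument strscpy() form used above (illustrative only; the struct and function names are made up): when the destination is a fixed-size array, strscpy() can derive the bound via sizeof() itself, and a kzalloc()'d buffer needs no extra NUL-padding.

```
/* Illustrative sketch -- hypothetical names, not from xt_repldata.h. */
#include <linux/slab.h>
#include <linux/string.h>

struct demo_table {
        char name[32];
        unsigned int valid_hooks;
};

static struct demo_table *demo_table_new(const char *name)
{
        struct demo_table *tbl = kzalloc(sizeof(*tbl), GFP_KERNEL);

        if (!tbl)
                return NULL;

        /* The buffer is already zeroed by kzalloc(), and the destination size
         * is inferred from the array type; this is equivalent to
         * strscpy(tbl->name, name, sizeof(tbl->name)).
         */
        strscpy(tbl->name, name);
        return tbl;
}
```
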
From 932b32ffd7604fb00b5c57e239a3cc4d901ccf6e Mon Sep 17 00:00:00 2001
From: Maxim Mikityanskiy
Date: Tue, 18 Mar 2025 18:15:16 +0200
Subject: netfilter: socket: Lookup orig tuple for IPv6 SNAT

nf_sk_lookup_slow_v4 does the conntrack lookup for IPv4 packets to
restore the original 5-tuple in case of SNAT, to be able to find the
right socket (if any). Then socket_match() can correctly check whether
the socket was transparent.

However, the IPv6 counterpart (nf_sk_lookup_slow_v6) lacks this
conntrack lookup, making xt_socket fail to match on the socket when the
packet was SNATed. Add the same logic to nf_sk_lookup_slow_v6.

IPv6 SNAT is used in Kubernetes clusters for pod-to-world packets, as
pods' addresses are in the fd00::/8 ULA subnet and need to be replaced
with the node's external address. Cilium leverages Envoy to enforce L7
policies, and Envoy uses transparent sockets. Cilium inserts an
iptables prerouting rule that matches on `-m socket --transparent` and
redirects the packets to localhost, but it fails to match SNATed IPv6
packets due to that missing conntrack lookup.

Closes: https://github.com/cilium/cilium/issues/37932
Fixes: eb31628e37a0 ("netfilter: nf_tables: Add support for IPv6 NAT")
Signed-off-by: Maxim Mikityanskiy
Reviewed-by: Florian Westphal
Signed-off-by: Pablo Neira Ayuso
---
 net/ipv6/netfilter/nf_socket_ipv6.c | 23 +++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/net/ipv6/netfilter/nf_socket_ipv6.c b/net/ipv6/netfilter/nf_socket_ipv6.c
index a7690ec62325..9ea5ef56cb27 100644
--- a/net/ipv6/netfilter/nf_socket_ipv6.c
+++ b/net/ipv6/netfilter/nf_socket_ipv6.c
@@ -103,6 +103,10 @@ struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb,
     struct sk_buff *data_skb = NULL;
     int doff = 0;
     int thoff = 0, tproto;
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+    enum ip_conntrack_info ctinfo;
+    struct nf_conn const *ct;
+#endif
 
     tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL);
     if (tproto < 0) {
@@ -136,6 +140,25 @@ struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb,
         return NULL;
     }
 
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+    /* Do the lookup with the original socket address in
+     * case this is a reply packet of an established
+     * SNAT-ted connection.
+     */
+    ct = nf_ct_get(skb, &ctinfo);
+    if (ct &&
+        ((tproto != IPPROTO_ICMPV6 &&
+          ctinfo == IP_CT_ESTABLISHED_REPLY) ||
+         (tproto == IPPROTO_ICMPV6 &&
+          ctinfo == IP_CT_RELATED_REPLY)) &&
+        (ct->status & IPS_SRC_NAT_DONE)) {
+        daddr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.in6;
+        dport = (tproto == IPPROTO_TCP) ?
+            ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.tcp.port :
+            ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port;
+    }
+#endif
+
     return nf_socket_get_sock_v6(net, data_skb, doff, tproto, saddr, daddr,
                      sport, dport, indev);
 }
--
cgit v1.2.3

From e3a4182edd1ae60e7e3539ff3b3784af9830d223 Mon Sep 17 00:00:00 2001
From: WangYuli
Date: Wed, 19 Mar 2025 22:01:47 +0800
Subject: netfilter: nf_tables: Only use nf_skip_indirect_calls() when MITIGATION_RETPOLINE

1. MITIGATION_RETPOLINE is x86-only (defined in arch/x86/Kconfig), so
   there is no need to AND with CONFIG_X86 when checking if it is
   enabled.

2. Remove the unused declaration of nf_skip_indirect_calls() when
   MITIGATION_RETPOLINE is disabled to avoid warnings.

3. Declare nf_skip_indirect_calls() and nf_skip_indirect_calls_enable()
   as inline when MITIGATION_RETPOLINE is enabled, as they are called
   only once and have simple logic.

Fix the following error with clang-21 when W=1e:

net/netfilter/nf_tables_core.c:39:20: error: unused function 'nf_skip_indirect_calls' [-Werror,-Wunused-function]
   39 | static inline bool nf_skip_indirect_calls(void) { return false; }
      |                    ^~~~~~~~~~~~~~~~~~~~~~
1 error generated.
make[4]: *** [scripts/Makefile.build:207: net/netfilter/nf_tables_core.o] Error 1
make[3]: *** [scripts/Makefile.build:465: net/netfilter] Error 2
make[3]: *** Waiting for unfinished jobs....

Fixes: d8d760627855 ("netfilter: nf_tables: add static key to skip retpoline workarounds")
Co-developed-by: Wentao Guan
Signed-off-by: Wentao Guan
Signed-off-by: WangYuli
Acked-by: Florian Westphal
Signed-off-by: Pablo Neira Ayuso
---
 net/netfilter/nf_tables_core.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index 75598520b0fa..6557a4018c09 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -21,25 +21,22 @@
 #include
 #include
 
-#if defined(CONFIG_MITIGATION_RETPOLINE) && defined(CONFIG_X86)
-
+#ifdef CONFIG_MITIGATION_RETPOLINE
 static struct static_key_false nf_tables_skip_direct_calls;
 
-static bool nf_skip_indirect_calls(void)
+static inline bool nf_skip_indirect_calls(void)
 {
     return static_branch_likely(&nf_tables_skip_direct_calls);
 }
 
-static void __init nf_skip_indirect_calls_enable(void)
+static inline void __init nf_skip_indirect_calls_enable(void)
 {
     if (!cpu_feature_enabled(X86_FEATURE_RETPOLINE))
         static_branch_enable(&nf_tables_skip_direct_calls);
 }
 #else
-static inline bool nf_skip_indirect_calls(void) { return false; }
-
 static inline void nf_skip_indirect_calls_enable(void) { }
-#endif
+#endif /* CONFIG_MITIGATION_RETPOLINE */
 
 static noinline void __nft_trace_packet(const struct nft_pktinfo *pkt,
                     const struct nft_verdict *verdict,
--
cgit v1.2.3