From 74455a5b4326add2499cb4a1f9706154b3a1eab4 Mon Sep 17 00:00:00 2001 From: Jiejian Wu Date: Tue, 24 Feb 2026 21:50:40 +0100 Subject: ipvs: make ip_vs_svc_table and ip_vs_svc_fwm_table per netns Current ipvs uses one global mutex "__ip_vs_mutex" to keep the global "ip_vs_svc_table" and "ip_vs_svc_fwm_table" safe. But when there are tens of thousands of services from different netns in the table, it takes a long time to look up the table, for example, using "ipvsadm -ln" from different netns simultaneously. We make "ip_vs_svc_table" and "ip_vs_svc_fwm_table" per netns, and we add "service_mutex" per netns to keep these two tables safe instead of the global "__ip_vs_mutex" in current version. To this end, looking up services from different netns simultaneously will not get stuck, shortening the time consumption in large-scale deployment. It can be reproduced using the simple scripts below. init.sh: #!/bin/bash for((i=1;i<=4;i++));do ip netns add ns$i ip netns exec ns$i ip link set dev lo up ip netns exec ns$i sh add-services.sh done add-services.sh: #!/bin/bash for((i=0;i<30000;i++)); do ipvsadm -A -t 10.10.10.10:$((80+$i)) -s rr done runtest.sh: #!/bin/bash for((i=1;i<4;i++));do ip netns exec ns$i ipvsadm -ln > /dev/null & done ip netns exec ns4 ipvsadm -ln > /dev/null Run "sh init.sh" to initiate the network environment. Then run "time ./runtest.sh" to evaluate the time consumption. Our testbed is a 4-core Intel Xeon ECS. The result of the original version is around 8 seconds, while the result of the modified version is only 0.8 seconds. Signed-off-by: Jiejian Wu Co-developed-by: Dust Li Signed-off-by: Dust Li Signed-off-by: Julian Anastasov Signed-off-by: Florian Westphal Link: https://patch.msgid.link/20260224205048.4718-2-fw@strlen.de Signed-off-by: Jakub Kicinski --- include/net/ip_vs.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 29a36709e7f3..074a204ec6db 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -33,6 +33,12 @@ #define IP_VS_HDR_INVERSE 1 #define IP_VS_HDR_ICMP 2 +/* + * Hash table: for virtual service lookups + */ +#define IP_VS_SVC_TAB_BITS 8 +#define IP_VS_SVC_TAB_SIZE BIT(IP_VS_SVC_TAB_BITS) +#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1) /* Generic access of ipvs struct */ static inline struct netns_ipvs *net_ipvs(struct net* net) @@ -1041,6 +1047,13 @@ struct netns_ipvs { */ unsigned int mixed_address_family_dests; unsigned int hooks_afmask; /* &1=AF_INET, &2=AF_INET6 */ + + /* the service mutex that protect svc_table and svc_fwm_table */ + struct mutex service_mutex; + /* the service table hashed by */ + struct hlist_head svc_table[IP_VS_SVC_TAB_SIZE]; + /* the service table hashed by fwmark */ + struct hlist_head svc_fwm_table[IP_VS_SVC_TAB_SIZE]; }; #define DEFAULT_SYNC_THRESHOLD 3 -- cgit v1.2.3 From b24ae1a387e404e832385448ccad30cb03520e45 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 24 Feb 2026 21:50:42 +0100 Subject: ipvs: use single svc table fwmark based services and non-fwmark based services can be hashed in same service table. This reduces the burden of working with two tables. Signed-off-by: Julian Anastasov Signed-off-by: Florian Westphal Link: https://patch.msgid.link/20260224205048.4718-4-fw@strlen.de Signed-off-by: Jakub Kicinski --- include/net/ip_vs.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 074a204ec6db..b5a5a5efe3cc 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -679,8 +679,7 @@ struct ip_vs_dest_user_kern { * forwarding entries. */ struct ip_vs_service { - struct hlist_node s_list; /* for normal service table */ - struct hlist_node f_list; /* for fwmark-based service table */ + struct hlist_node s_list; /* node in service table */ atomic_t refcnt; /* reference counter */ u16 af; /* address family */ @@ -1050,10 +1049,7 @@ struct netns_ipvs { /* the service mutex that protect svc_table and svc_fwm_table */ struct mutex service_mutex; - /* the service table hashed by */ - struct hlist_head svc_table[IP_VS_SVC_TAB_SIZE]; - /* the service table hashed by fwmark */ - struct hlist_head svc_fwm_table[IP_VS_SVC_TAB_SIZE]; + struct hlist_head svc_table[IP_VS_SVC_TAB_SIZE]; /* Services */ }; #define DEFAULT_SYNC_THRESHOLD 3 -- cgit v1.2.3 From c59bd9e62e060bb5cd4d697b73bbe6f23a723345 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 24 Feb 2026 21:50:44 +0100 Subject: ipvs: use more counters to avoid service lookups When new connection is created we can lookup for services multiple times to support fallback options. We already have some counters to skip specific lookups because it costs CPU cycles for hash calculation, etc. Add more counters for fwmark/non-fwmark services (fwm_services and nonfwm_services) and make all counters per address family. Signed-off-by: Julian Anastasov Signed-off-by: Florian Westphal Link: https://patch.msgid.link/20260224205048.4718-6-fw@strlen.de Signed-off-by: Jakub Kicinski --- include/net/ip_vs.h | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index b5a5a5efe3cc..f2291be36409 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -271,6 +271,18 @@ static inline const char *ip_vs_dbg_addr(int af, char *buf, size_t buf_len, pr_err(msg, ##__VA_ARGS__); \ } while (0) +/* For arrays per family */ +enum { + IP_VS_AF_INET, + IP_VS_AF_INET6, + IP_VS_AF_MAX +}; + +static inline int ip_vs_af_index(int af) +{ + return af == AF_INET6 ? IP_VS_AF_INET6 : IP_VS_AF_INET; +} + /* The port number of FTP service (in network order). */ #define FTPPORT cpu_to_be16(21) #define FTPDATA cpu_to_be16(20) @@ -940,17 +952,17 @@ struct netns_ipvs { /* ip_vs_ctl */ struct ip_vs_stats_rcu *tot_stats; /* Statistics & est. */ - int num_services; /* no of virtual services */ - int num_services6; /* IPv6 virtual services */ - /* Trash for destinations */ struct list_head dest_trash; spinlock_t dest_trash_lock; struct timer_list dest_trash_timer; /* expiration timer */ /* Service counters */ - atomic_t ftpsvc_counter; - atomic_t nullsvc_counter; - atomic_t conn_out_counter; + atomic_t num_services[IP_VS_AF_MAX]; /* Services */ + atomic_t fwm_services[IP_VS_AF_MAX]; /* Services */ + atomic_t nonfwm_services[IP_VS_AF_MAX];/* Services */ + atomic_t ftpsvc_counter[IP_VS_AF_MAX]; /* FTPPORT */ + atomic_t nullsvc_counter[IP_VS_AF_MAX];/* Zero port */ + atomic_t conn_out_counter[IP_VS_AF_MAX];/* out conn */ #ifdef CONFIG_SYSCTL /* delayed work for expiring no dest connections */ -- cgit v1.2.3 From 09b71fb459468b408f3fa3e2b75d20113374f057 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 24 Feb 2026 21:50:45 +0100 Subject: ipvs: no_cport and dropentry counters can be per-net Change the no_cport counters to be per-net and address family. This should reduce the extra conn lookups done during present NO_CPORT connections. By changing from global to per-net dropentry counters, one net will not affect the drop rate of another net. Signed-off-by: Julian Anastasov Signed-off-by: Florian Westphal Link: https://patch.msgid.link/20260224205048.4718-7-fw@strlen.de Signed-off-by: Jakub Kicinski --- include/net/ip_vs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index f2291be36409..ad8a16146ac5 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -948,6 +948,7 @@ struct netns_ipvs { #endif /* ip_vs_conn */ atomic_t conn_count; /* connection counter */ + atomic_t no_cport_conns[IP_VS_AF_MAX]; /* ip_vs_ctl */ struct ip_vs_stats_rcu *tot_stats; /* Statistics & est. */ @@ -973,6 +974,7 @@ struct netns_ipvs { int drop_counter; int old_secure_tcp; atomic_t dropentry; + s8 dropentry_counters[8]; /* locks in ctl.c */ spinlock_t dropentry_lock; /* drop entry handling */ spinlock_t droppacket_lock; /* drop packet handling */ -- cgit v1.2.3 From 6b94d081f81dd524626f7aab2b98a9de335edb72 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 24 Feb 2026 21:50:48 +0100 Subject: netfilter: nf_tables: remove register tracking infrastructure This facility was disabled in commit 9e539c5b6d9c ("netfilter: nf_tables: disable expression reduction infra"), because not all nft_exprs guarantee they will update the destination register: some may set NFT_BREAK instead to cancel evaluation of the rule. This has been dead code ever since. There are no plans to salvage this at this time, so remove this. Signed-off-by: Florian Westphal Link: https://patch.msgid.link/20260224205048.4718-10-fw@strlen.de Signed-off-by: Jakub Kicinski --- include/net/netfilter/nf_tables.h | 32 -------------------------------- include/net/netfilter/nft_fib.h | 2 -- include/net/netfilter/nft_meta.h | 3 --- 3 files changed, 37 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 426534a711b0..40e8106e71f0 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -122,17 +122,6 @@ struct nft_regs { }; }; -struct nft_regs_track { - struct { - const struct nft_expr *selector; - const struct nft_expr *bitwise; - u8 num_reg; - } regs[NFT_REG32_NUM]; - - const struct nft_expr *cur; - const struct nft_expr *last; -}; - /* Store/load an u8, u16 or u64 integer to/from the u32 data register. * * Note, when using concatenations, register allocation happens at 32-bit @@ -427,8 +416,6 @@ int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src, gfp_t gfp); void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr); int nft_expr_dump(struct sk_buff *skb, unsigned int attr, const struct nft_expr *expr, bool reset); -bool nft_expr_reduce_bitwise(struct nft_regs_track *track, - const struct nft_expr *expr); struct nft_set_ext; @@ -941,7 +928,6 @@ struct nft_offload_ctx; * @destroy_clone: destruction clone function * @dump: function to dump parameters * @validate: validate expression, called during loop detection - * @reduce: reduce expression * @gc: garbage collection expression * @offload: hardware offload expression * @offload_action: function to report true/false to allocate one slot or not in the flow @@ -975,8 +961,6 @@ struct nft_expr_ops { bool reset); int (*validate)(const struct nft_ctx *ctx, const struct nft_expr *expr); - bool (*reduce)(struct nft_regs_track *track, - const struct nft_expr *expr); bool (*gc)(struct net *net, const struct nft_expr *expr); int (*offload)(struct nft_offload_ctx *ctx, @@ -1954,20 +1938,4 @@ static inline u64 nft_net_tstamp(const struct net *net) return nft_pernet(net)->tstamp; } -#define __NFT_REDUCE_READONLY 1UL -#define NFT_REDUCE_READONLY (void *)__NFT_REDUCE_READONLY - -void nft_reg_track_update(struct nft_regs_track *track, - const struct nft_expr *expr, u8 dreg, u8 len); -void nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg, u8 len); -void __nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg); - -static inline bool nft_reg_track_cmp(struct nft_regs_track *track, - const struct nft_expr *expr, u8 dreg) -{ - return track->regs[dreg].selector && - track->regs[dreg].selector->ops == expr->ops && - track->regs[dreg].num_reg == 0; -} - #endif /* _NET_NF_TABLES_H */ diff --git a/include/net/netfilter/nft_fib.h b/include/net/netfilter/nft_fib.h index 7370fba844ef..e0422456f27b 100644 --- a/include/net/netfilter/nft_fib.h +++ b/include/net/netfilter/nft_fib.h @@ -66,6 +66,4 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, void nft_fib_store_result(void *reg, const struct nft_fib *priv, const struct net_device *dev); -bool nft_fib_reduce(struct nft_regs_track *track, - const struct nft_expr *expr); #endif diff --git a/include/net/netfilter/nft_meta.h b/include/net/netfilter/nft_meta.h index d602263590fe..f74e63290603 100644 --- a/include/net/netfilter/nft_meta.h +++ b/include/net/netfilter/nft_meta.h @@ -43,9 +43,6 @@ void nft_meta_set_destroy(const struct nft_ctx *ctx, int nft_meta_set_validate(const struct nft_ctx *ctx, const struct nft_expr *expr); -bool nft_meta_get_reduce(struct nft_regs_track *track, - const struct nft_expr *expr); - struct nft_inner_tun_ctx; void nft_meta_inner_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt, -- cgit v1.2.3