From c3025e94daa9ce84ef0e53df983b5bf2fbd76ea6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 7 Apr 2025 16:35:59 +0000 Subject: net: rps: change skb_flow_limit() hash function As explained in commit f3483c8e1da6 ("net: rfs: hash function change"), masking low order bits of skb_get_hash(skb) has low entropy. A NIC with 32 RX queues uses the 5 low order bits of rss key to select a queue. This means all packets landing to a given queue share the same 5 low order bits. Switch to hash_32() to reduce hash collisions. Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250407163602.170356-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 2 +- net/core/dev.h | 2 +- net/core/sysctl_net_core.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 0608605cfc24..f674236f34be 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5038,7 +5038,7 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) rcu_read_lock(); fl = rcu_dereference(sd->flow_limit); if (fl) { - new_flow = skb_get_hash(skb) & (fl->num_buckets - 1); + new_flow = hash_32(skb_get_hash(skb), fl->log_buckets); old_flow = fl->history[fl->history_head]; fl->history[fl->history_head] = new_flow; diff --git a/net/core/dev.h b/net/core/dev.h index 7ee203395d8e..b1cd44b5f009 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -16,7 +16,7 @@ struct cpumask; #define FLOW_LIMIT_HISTORY (1 << 7) /* must be ^2 and !overflow buckets */ struct sd_flow_limit { u64 count; - unsigned int num_buckets; + u8 log_buckets; unsigned int history_head; u16 history[FLOW_LIMIT_HISTORY]; u8 buckets[]; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index c7769ee0d9c5..5cfe76ede523 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -248,7 +248,7 @@ static int flow_limit_cpu_sysctl(const struct ctl_table *table, int write, ret = -ENOMEM; goto write_unlock; } - cur->num_buckets = netdev_flow_limit_table_len; + cur->log_buckets = ilog2(netdev_flow_limit_table_len); rcu_assign_pointer(sd->flow_limit, cur); } } -- cgit v1.2.3 From 7b6f0a852da34379a304d7020d70049ba5d1f0f3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 7 Apr 2025 16:36:00 +0000 Subject: net: rps: annotate data-races around (struct sd_flow_limit)->count softnet_seq_show() can read fl->count while another cpu updates this field from skb_flow_limit(). Make this field an 'unsigned int', as its only consumer only deals with 32 bit. Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250407163602.170356-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 3 ++- net/core/dev.h | 2 +- net/core/net-procfs.c | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index f674236f34be..969883173182 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5049,7 +5049,8 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) fl->buckets[old_flow]--; if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) { - fl->count++; + /* Pairs with READ_ONCE() in softnet_seq_show() */ + WRITE_ONCE(fl->count, fl->count + 1); rcu_read_unlock(); return true; } diff --git a/net/core/dev.h b/net/core/dev.h index b1cd44b5f009..e855e1cb43fd 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -15,7 +15,7 @@ struct cpumask; /* Random bits of netdevice that don't need to be exposed */ #define FLOW_LIMIT_HISTORY (1 << 7) /* must be ^2 and !overflow buckets */ struct sd_flow_limit { - u64 count; + unsigned int count; u8 log_buckets; unsigned int history_head; u16 history[FLOW_LIMIT_HISTORY]; diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 3e92bf0f9060..69782d62fbe1 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -132,8 +132,9 @@ static int softnet_seq_show(struct seq_file *seq, void *v) rcu_read_lock(); fl = rcu_dereference(sd->flow_limit); + /* Pairs with WRITE_ONCE() in skb_flow_limit() */ if (fl) - flow_limit_count = fl->count; + flow_limit_count = READ_ONCE(fl->count); rcu_read_unlock(); #endif -- cgit v1.2.3 From 22d046a778e4344437fd49bb0995e315ec3fddcf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 7 Apr 2025 16:36:01 +0000 Subject: net: add data-race annotations in softnet_seq_show() softnet_seq_show() reads several fields that might be updated concurrently. Add READ_ONCE() and WRITE_ONCE() annotations. Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250407163602.170356-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 6 ++++-- net/core/net-procfs.c | 6 +++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 969883173182..4ccc6dc5303e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4953,7 +4953,8 @@ static void rps_trigger_softirq(void *data) struct softnet_data *sd = data; ____napi_schedule(sd, &sd->backlog); - sd->received_rps++; + /* Pairs with READ_ONCE() in softnet_seq_show() */ + WRITE_ONCE(sd->received_rps, sd->received_rps + 1); } #endif /* CONFIG_RPS */ @@ -7523,7 +7524,8 @@ start: */ if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit))) { - sd->time_squeeze++; + /* Pairs with READ_ONCE() in softnet_seq_show() */ + WRITE_ONCE(sd->time_squeeze, sd->time_squeeze + 1); break; } } diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 69782d62fbe1..4f0f0709a1cb 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -145,11 +145,11 @@ static int softnet_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x " "%08x %08x\n", - sd->processed, atomic_read(&sd->dropped), - sd->time_squeeze, 0, + READ_ONCE(sd->processed), atomic_read(&sd->dropped), + READ_ONCE(sd->time_squeeze), 0, 0, 0, 0, 0, /* was fastroute */ 0, /* was cpu_collision */ - sd->received_rps, flow_limit_count, + READ_ONCE(sd->received_rps), flow_limit_count, input_qlen + process_qlen, (int)seq->index, input_qlen, process_qlen); return 0; -- cgit v1.2.3 From 0a7de4a8f898c480ffafe024c4a0a8b8819597f1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 7 Apr 2025 16:36:02 +0000 Subject: net: rps: remove kfree_rcu_mightsleep() use Add an rcu_head to sd_flow_limit and rps_sock_flow_table structs to use the more conventional and predictable k[v]free_rcu(). Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250407163602.170356-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/rps.h | 5 +++-- net/core/dev.h | 1 + net/core/sysctl_net_core.c | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/include/net/rps.h b/include/net/rps.h index e358e9711f27..507f4aa5d39b 100644 --- a/include/net/rps.h +++ b/include/net/rps.h @@ -57,9 +57,10 @@ struct rps_dev_flow_table { * meaning we use 32-6=26 bits for the hash. */ struct rps_sock_flow_table { - u32 mask; + struct rcu_head rcu; + u32 mask; - u32 ents[] ____cacheline_aligned_in_smp; + u32 ents[] ____cacheline_aligned_in_smp; }; #define RPS_SOCK_FLOW_TABLE_SIZE(_num) (offsetof(struct rps_sock_flow_table, ents[_num])) diff --git a/net/core/dev.h b/net/core/dev.h index e855e1cb43fd..710abc05ebdb 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -15,6 +15,7 @@ struct cpumask; /* Random bits of netdevice that don't need to be exposed */ #define FLOW_LIMIT_HISTORY (1 << 7) /* must be ^2 and !overflow buckets */ struct sd_flow_limit { + struct rcu_head rcu; unsigned int count; u8 log_buckets; unsigned int history_head; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 5cfe76ede523..5dbb2c6f371d 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -201,7 +201,7 @@ static int rps_sock_flow_sysctl(const struct ctl_table *table, int write, if (orig_sock_table) { static_branch_dec(&rps_needed); static_branch_dec(&rfs_needed); - kvfree_rcu_mightsleep(orig_sock_table); + kvfree_rcu(orig_sock_table, rcu); } } } @@ -239,7 +239,7 @@ static int flow_limit_cpu_sysctl(const struct ctl_table *table, int write, lockdep_is_held(&flow_limit_update_mutex)); if (cur && !cpumask_test_cpu(i, mask)) { RCU_INIT_POINTER(sd->flow_limit, NULL); - kfree_rcu_mightsleep(cur); + kfree_rcu(cur, rcu); } else if (!cur && cpumask_test_cpu(i, mask)) { cur = kzalloc_node(len, GFP_KERNEL, cpu_to_node(i)); -- cgit v1.2.3