summaryrefslogtreecommitdiff
path: root/tools
diff options
context:
space:
mode:
authorAlexei Starovoitov <ast@kernel.org>2026-02-27 15:39:01 -0800
committerAlexei Starovoitov <ast@kernel.org>2026-02-27 15:39:01 -0800
commit5263e30fffbcc7934671f0421eafb87b690a01d2 (patch)
tree9d89fe95ffee4efc17eef99084b39ba515d3ae80 /tools
parent6881af27f9ea0f5ca8f606f573ef5cc25ca31fe4 (diff)
parent2939d7b3b0e5f35359ce8f69dbbad0bfc4e920b6 (diff)
Merge branch 'close-race-in-freeing-special-fields-and-map-value'
Kumar Kartikeya Dwivedi says:

====================
Close race in freeing special fields and map value

There exists a race across various map types where the freeing of special
fields (tw, timer, wq, kptr, etc.) can be done eagerly when a logical delete
operation is done on a map value, such that the program which continues to
have access to such a map value can recreate the fields and cause them to
leak. The set contains fixes for this case. It is a continuation of Mykyta's
previous attempt in [0], but applies to all fields. A test is included which
reproduces the bug reliably in absence of the fixes.

Local Storage Benchmarks
------------------------

Evaluation Setup:

Benchmarked on a dual-socket Intel Xeon Gold 6348 (Ice Lake) @ 2.60GHz
(56 cores / 112 threads), with the CPU governor set to performance. Bench was
pinned to a single NUMA node throughout the test.

Benchmark comes from [1] using the following command:
./bench -p 1 local-storage-create --storage-type <socket,task> --batch-size <16,32,64>

Before the test, 10 runs of all cases ([socket|task] x 3 batch sizes x
7 iterations per batch size) are done to warm up and prime the machine.
Then, 3 runs of all cases are done (with and without the patch, across
reboots). For each comparison, we have 21 samples, i.e. per batch size
(e.g. socket 16) of a given local storage, we have 3 runs x 7 iterations.
The statistics (mean, median, stddev) and t-test is done for each scenario
(local storage and batch size pair) individually (21 samples for either
case). All values are for local storage creations in thousand creations /
sec (k/s).

              Baseline (without patch)        With patch                  Delta
Case       Median    Mean    Std. Dev.  Median    Mean    Std. Dev.  Median    %
---------------------------------------------------------------------------------------------------
socket 16  432.026   431.941  1.047     431.347   431.953  1.635     -0.679   -0.16%
socket 32  432.641   432.818  1.535     432.488   432.302  1.508     -0.153   -0.04%
socket 64  431.504   431.996  1.337     429.145   430.326  2.469     -2.359   -0.55%
task 16     38.816    39.382  1.456      39.657    39.337  1.831     +0.841   +2.17%
task 32     38.815    39.644  2.690      38.721    39.122  1.636     -0.094   -0.24%
task 64     37.562    38.080  1.701      39.554    38.563  1.689     +1.992   +5.30%

The cases for socket are within the range of noise, and improvements in task
local storage are due to high variance (CV ~4%-6% across batch sizes). The
only statistically significant case worth mentioning is socket with batch
size 64 with p-value from t-test < 0.05, but the absolute difference is small
(~2k/s). TL;DR there doesn't appear to be any significant regression or
improvement.

[0]: https://lore.kernel.org/bpf/20260216131341.1285427-1-mykyta.yatsenko5@gmail.com
[1]: https://lore.kernel.org/bpf/20260205222916.1788211-1-ameryhung@gmail.com

Changelog:
----------
v2 -> v3
v2: https://lore.kernel.org/bpf/20260227052031.3988575-1-memxor@gmail.com
 * Add syzbot Tested-by.
 * Add Amery's Reviewed-by.
 * Fix missing rcu_dereference_check() in __bpf_selem_free_rcu. (BPF CI Bot)
 * Remove migrate_disable() in bpf_selem_free_rcu. (Alexei)

v1 -> v2
v1: https://lore.kernel.org/bpf/20260225185121.2057388-1-memxor@gmail.com
 * Add Paul's Reviewed-by.
 * Fix use-after-free in accessing bpf_mem_alloc embedded in map. (syzbot CI)
 * Add benchmark numbers for local storage.
 * Add extra test case for per-cpu hashmap coverage with up to 16 refcount leaks.
 * Target bpf tree.
====================

Link: https://patch.msgid.link/20260227224806.646888-1-memxor@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Diffstat (limited to 'tools')
-rw-r--r--tools/testing/selftests/bpf/prog_tests/map_kptr_race.c218
-rw-r--r--tools/testing/selftests/bpf/progs/map_kptr_race.c197
2 files changed, 415 insertions, 0 deletions
diff --git a/tools/testing/selftests/bpf/prog_tests/map_kptr_race.c b/tools/testing/selftests/bpf/prog_tests/map_kptr_race.c
new file mode 100644
index 000000000000..506ed55e8528
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/map_kptr_race.c
@@ -0,0 +1,218 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+#include <test_progs.h>
+#include <network_helpers.h>
+
+#include "map_kptr_race.skel.h"
+
+static int get_map_id(int map_fd)
+{
+ struct bpf_map_info info = {};
+ __u32 len = sizeof(info);
+
+ if (!ASSERT_OK(bpf_map_get_info_by_fd(map_fd, &info, &len), "get_map_info"))
+ return -1;
+ return info.id;
+}
+
+static int read_refs(struct map_kptr_race *skel)
+{
+ LIBBPF_OPTS(bpf_test_run_opts, opts);
+ int ret;
+
+ ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.count_ref), &opts);
+ if (!ASSERT_OK(ret, "count_ref run"))
+ return -1;
+ if (!ASSERT_OK(opts.retval, "count_ref retval"))
+ return -1;
+ return skel->bss->num_of_refs;
+}
+
+static void test_htab_leak(void)
+{
+ LIBBPF_OPTS(bpf_test_run_opts, opts,
+ .data_in = &pkt_v4,
+ .data_size_in = sizeof(pkt_v4),
+ .repeat = 1,
+ );
+ struct map_kptr_race *skel, *watcher;
+ int ret, map_id;
+
+ skel = map_kptr_race__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "open_and_load"))
+ return;
+
+ ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_htab_leak), &opts);
+ if (!ASSERT_OK(ret, "test_htab_leak run"))
+ goto out_skel;
+ if (!ASSERT_OK(opts.retval, "test_htab_leak retval"))
+ goto out_skel;
+
+ map_id = get_map_id(bpf_map__fd(skel->maps.race_hash_map));
+ if (!ASSERT_GE(map_id, 0, "map_id"))
+ goto out_skel;
+
+ watcher = map_kptr_race__open_and_load();
+ if (!ASSERT_OK_PTR(watcher, "watcher open_and_load"))
+ goto out_skel;
+
+ watcher->bss->target_map_id = map_id;
+ watcher->links.map_put = bpf_program__attach(watcher->progs.map_put);
+ if (!ASSERT_OK_PTR(watcher->links.map_put, "attach fentry"))
+ goto out_watcher;
+ watcher->links.htab_map_free = bpf_program__attach(watcher->progs.htab_map_free);
+ if (!ASSERT_OK_PTR(watcher->links.htab_map_free, "attach fexit"))
+ goto out_watcher;
+
+ map_kptr_race__destroy(skel);
+ skel = NULL;
+
+ kern_sync_rcu();
+
+ while (!READ_ONCE(watcher->bss->map_freed))
+ sched_yield();
+
+ ASSERT_EQ(watcher->bss->map_freed, 1, "map_freed");
+ ASSERT_EQ(read_refs(watcher), 2, "htab refcount");
+
+out_watcher:
+ map_kptr_race__destroy(watcher);
+out_skel:
+ map_kptr_race__destroy(skel);
+}
+
+static void test_percpu_htab_leak(void)
+{
+ LIBBPF_OPTS(bpf_test_run_opts, opts,
+ .data_in = &pkt_v4,
+ .data_size_in = sizeof(pkt_v4),
+ .repeat = 1,
+ );
+ struct map_kptr_race *skel, *watcher;
+ int ret, map_id;
+
+ skel = map_kptr_race__open();
+ if (!ASSERT_OK_PTR(skel, "open"))
+ return;
+
+ skel->rodata->nr_cpus = libbpf_num_possible_cpus();
+ if (skel->rodata->nr_cpus > 16)
+ skel->rodata->nr_cpus = 16;
+
+ ret = map_kptr_race__load(skel);
+ if (!ASSERT_OK(ret, "load"))
+ goto out_skel;
+
+ ret = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_percpu_htab_leak), &opts);
+ if (!ASSERT_OK(ret, "test_percpu_htab_leak run"))
+ goto out_skel;
+ if (!ASSERT_OK(opts.retval, "test_percpu_htab_leak retval"))
+ goto out_skel;
+
+ map_id = get_map_id(bpf_map__fd(skel->maps.race_percpu_hash_map));
+ if (!ASSERT_GE(map_id, 0, "map_id"))
+ goto out_skel;
+
+ watcher = map_kptr_race__open_and_load();
+ if (!ASSERT_OK_PTR(watcher, "watcher open_and_load"))
+ goto out_skel;
+
+ watcher->bss->target_map_id = map_id;
+ watcher->links.map_put = bpf_program__attach(watcher->progs.map_put);
+ if (!ASSERT_OK_PTR(watcher->links.map_put, "attach fentry"))
+ goto out_watcher;
+ watcher->links.htab_map_free = bpf_program__attach(watcher->progs.htab_map_free);
+ if (!ASSERT_OK_PTR(watcher->links.htab_map_free, "attach fexit"))
+ goto out_watcher;
+
+ map_kptr_race__destroy(skel);
+ skel = NULL;
+
+ kern_sync_rcu();
+
+ while (!READ_ONCE(watcher->bss->map_freed))
+ sched_yield();
+
+ ASSERT_EQ(watcher->bss->map_freed, 1, "map_freed");
+ ASSERT_EQ(read_refs(watcher), 2, "percpu_htab refcount");
+
+out_watcher:
+ map_kptr_race__destroy(watcher);
+out_skel:
+ map_kptr_race__destroy(skel);
+}
+
+static void test_sk_ls_leak(void)
+{
+ struct map_kptr_race *skel, *watcher;
+ int listen_fd = -1, client_fd = -1, map_id;
+
+ skel = map_kptr_race__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "open_and_load"))
+ return;
+
+ if (!ASSERT_OK(map_kptr_race__attach(skel), "attach"))
+ goto out_skel;
+
+ listen_fd = start_server(AF_INET6, SOCK_STREAM, "::1", 0, 0);
+ if (!ASSERT_GE(listen_fd, 0, "start_server"))
+ goto out_skel;
+
+ client_fd = connect_to_fd(listen_fd, 0);
+ if (!ASSERT_GE(client_fd, 0, "connect_to_fd"))
+ goto out_skel;
+
+ if (!ASSERT_EQ(skel->bss->sk_ls_leak_done, 1, "sk_ls_leak_done"))
+ goto out_skel;
+
+ close(client_fd);
+ client_fd = -1;
+ close(listen_fd);
+ listen_fd = -1;
+
+ map_id = get_map_id(bpf_map__fd(skel->maps.race_sk_ls_map));
+ if (!ASSERT_GE(map_id, 0, "map_id"))
+ goto out_skel;
+
+ watcher = map_kptr_race__open_and_load();
+ if (!ASSERT_OK_PTR(watcher, "watcher open_and_load"))
+ goto out_skel;
+
+ watcher->bss->target_map_id = map_id;
+ watcher->links.map_put = bpf_program__attach(watcher->progs.map_put);
+ if (!ASSERT_OK_PTR(watcher->links.map_put, "attach fentry"))
+ goto out_watcher;
+ watcher->links.sk_map_free = bpf_program__attach(watcher->progs.sk_map_free);
+ if (!ASSERT_OK_PTR(watcher->links.sk_map_free, "attach fexit"))
+ goto out_watcher;
+
+ map_kptr_race__destroy(skel);
+ skel = NULL;
+
+ kern_sync_rcu();
+
+ while (!READ_ONCE(watcher->bss->map_freed))
+ sched_yield();
+
+ ASSERT_EQ(watcher->bss->map_freed, 1, "map_freed");
+ ASSERT_EQ(read_refs(watcher), 2, "sk_ls refcount");
+
+out_watcher:
+ map_kptr_race__destroy(watcher);
+out_skel:
+ if (client_fd >= 0)
+ close(client_fd);
+ if (listen_fd >= 0)
+ close(listen_fd);
+ map_kptr_race__destroy(skel);
+}
+
+void serial_test_map_kptr_race(void)
+{
+ if (test__start_subtest("htab_leak"))
+ test_htab_leak();
+ if (test__start_subtest("percpu_htab_leak"))
+ test_percpu_htab_leak();
+ if (test__start_subtest("sk_ls_leak"))
+ test_sk_ls_leak();
+}
diff --git a/tools/testing/selftests/bpf/progs/map_kptr_race.c b/tools/testing/selftests/bpf/progs/map_kptr_race.c
new file mode 100644
index 000000000000..f6f136cd8f60
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/map_kptr_race.c
@@ -0,0 +1,197 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Meta Platforms, Inc. and affiliates. */
+#include <vmlinux.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "../test_kmods/bpf_testmod_kfunc.h"
+
+struct map_value {
+ struct prog_test_ref_kfunc __kptr *ref_ptr;
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+ __type(key, int);
+ __type(value, struct map_value);
+ __uint(max_entries, 1);
+} race_hash_map SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_HASH);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+ __type(key, int);
+ __type(value, struct map_value);
+ __uint(max_entries, 1);
+} race_percpu_hash_map SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_SK_STORAGE);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+ __type(key, int);
+ __type(value, struct map_value);
+} race_sk_ls_map SEC(".maps");
+
+int num_of_refs;
+int sk_ls_leak_done;
+int target_map_id;
+int map_freed;
+const volatile int nr_cpus;
+
+SEC("tc")
+int test_htab_leak(struct __sk_buff *skb)
+{
+ struct prog_test_ref_kfunc *p, *old;
+ struct map_value val = {};
+ struct map_value *v;
+ int key = 0;
+
+ if (bpf_map_update_elem(&race_hash_map, &key, &val, BPF_ANY))
+ return 1;
+
+ v = bpf_map_lookup_elem(&race_hash_map, &key);
+ if (!v)
+ return 2;
+
+ p = bpf_kfunc_call_test_acquire(&(unsigned long){0});
+ if (!p)
+ return 3;
+ old = bpf_kptr_xchg(&v->ref_ptr, p);
+ if (old)
+ bpf_kfunc_call_test_release(old);
+
+ bpf_map_delete_elem(&race_hash_map, &key);
+
+ p = bpf_kfunc_call_test_acquire(&(unsigned long){0});
+ if (!p)
+ return 4;
+ old = bpf_kptr_xchg(&v->ref_ptr, p);
+ if (old)
+ bpf_kfunc_call_test_release(old);
+
+ return 0;
+}
+
+static int fill_percpu_kptr(struct map_value *v)
+{
+ struct prog_test_ref_kfunc *p, *old;
+
+ p = bpf_kfunc_call_test_acquire(&(unsigned long){0});
+ if (!p)
+ return 1;
+ old = bpf_kptr_xchg(&v->ref_ptr, p);
+ if (old)
+ bpf_kfunc_call_test_release(old);
+ return 0;
+}
+
+SEC("tc")
+int test_percpu_htab_leak(struct __sk_buff *skb)
+{
+ struct map_value *v, *arr[16] = {};
+ struct map_value val = {};
+ int key = 0;
+ int err = 0;
+
+ if (bpf_map_update_elem(&race_percpu_hash_map, &key, &val, BPF_ANY))
+ return 1;
+
+ for (int i = 0; i < nr_cpus; i++) {
+ v = bpf_map_lookup_percpu_elem(&race_percpu_hash_map, &key, i);
+ if (!v)
+ return 2;
+ arr[i] = v;
+ }
+
+ bpf_map_delete_elem(&race_percpu_hash_map, &key);
+
+ for (int i = 0; i < nr_cpus; i++) {
+ v = arr[i];
+ err = fill_percpu_kptr(v);
+ if (err)
+ return 3;
+ }
+
+ return 0;
+}
+
+SEC("tp_btf/inet_sock_set_state")
+int BPF_PROG(test_sk_ls_leak, struct sock *sk, int oldstate, int newstate)
+{
+ struct prog_test_ref_kfunc *p, *old;
+ struct map_value *v;
+
+ if (newstate != BPF_TCP_SYN_SENT)
+ return 0;
+
+ if (sk_ls_leak_done)
+ return 0;
+
+ v = bpf_sk_storage_get(&race_sk_ls_map, sk, NULL,
+ BPF_SK_STORAGE_GET_F_CREATE);
+ if (!v)
+ return 0;
+
+ p = bpf_kfunc_call_test_acquire(&(unsigned long){0});
+ if (!p)
+ return 0;
+ old = bpf_kptr_xchg(&v->ref_ptr, p);
+ if (old)
+ bpf_kfunc_call_test_release(old);
+
+ bpf_sk_storage_delete(&race_sk_ls_map, sk);
+
+ p = bpf_kfunc_call_test_acquire(&(unsigned long){0});
+ if (!p)
+ return 0;
+ old = bpf_kptr_xchg(&v->ref_ptr, p);
+ if (old)
+ bpf_kfunc_call_test_release(old);
+
+ sk_ls_leak_done = 1;
+ return 0;
+}
+
+long target_map_ptr;
+
+SEC("fentry/bpf_map_put")
+int BPF_PROG(map_put, struct bpf_map *map)
+{
+ if (target_map_id && map->id == (u32)target_map_id)
+ target_map_ptr = (long)map;
+ return 0;
+}
+
+SEC("fexit/htab_map_free")
+int BPF_PROG(htab_map_free, struct bpf_map *map)
+{
+ if (target_map_ptr && (long)map == target_map_ptr)
+ map_freed = 1;
+ return 0;
+}
+
+SEC("fexit/bpf_sk_storage_map_free")
+int BPF_PROG(sk_map_free, struct bpf_map *map)
+{
+ if (target_map_ptr && (long)map == target_map_ptr)
+ map_freed = 1;
+ return 0;
+}
+
+SEC("syscall")
+int count_ref(void *ctx)
+{
+ struct prog_test_ref_kfunc *p;
+ unsigned long arg = 0;
+
+ p = bpf_kfunc_call_test_acquire(&arg);
+ if (!p)
+ return 1;
+
+ num_of_refs = p->cnt.refs.counter;
+
+ bpf_kfunc_call_test_release(p);
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";