summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Mykyta Yatsenko <yatsenko@meta.com> 2026-06-05 04:41:23 -0700
committer: Alexei Starovoitov <ast@kernel.org> 2026-06-05 08:00:08 -0700
commit: 6905f8601298ecd2d1932a4b4849bf265201118e (patch)
tree: 6dd8a1040d0df38498a70822f8cbd1c815e2bfef
parent: 818e0084822742fc00eacbf5df3476a5e72c7d0e (diff)
bpf: Allow special fields in resizable hashtab
Add support for timers, workqueues, task work, spin locks and kptrs.

Without this, users needing deferred callbacks, BPF_F_LOCK, or refcounted kernel pointers in a dynamically-sized map have no option - fixed-size htab is the only map supporting these field types. Resizable hashtab should offer the same capability.

kptr semantics under in-place updates are identical to array map.

Properly clean up BTF record fields on element delete and map teardown by wiring up bpf_obj_free_fields through a memory allocator destructor, matching the pattern used by htab for non-prealloc maps.

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Link: https://lore.kernel.org/r/20260605-rhash-v7-6-5b8e05f8630d@meta.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
-rw-r--r-- kernel/bpf/hashtab.c 111
-rw-r--r-- kernel/bpf/syscall.c 3
2 files changed, 104 insertions, 10 deletions
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index a149713d0953..7b9408b8320c 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -497,28 +497,26 @@ static void htab_dtor_ctx_free(void *ctx)
kfree(ctx);
}
-static int htab_set_dtor(struct bpf_htab *htab, void (*dtor)(void *, void *))
+static int bpf_ma_set_dtor(struct bpf_map *map, struct bpf_mem_alloc *ma,
+ void (*dtor)(void *, void *))
{
- u32 key_size = htab->map.key_size;
- struct bpf_mem_alloc *ma;
struct htab_btf_record *hrec;
int err;
/* No need for dtors. */
- if (IS_ERR_OR_NULL(htab->map.record))
+ if (IS_ERR_OR_NULL(map->record))
return 0;
hrec = kzalloc(sizeof(*hrec), GFP_KERNEL);
if (!hrec)
return -ENOMEM;
- hrec->key_size = key_size;
- hrec->record = btf_record_dup(htab->map.record);
+ hrec->key_size = map->key_size;
+ hrec->record = btf_record_dup(map->record);
if (IS_ERR(hrec->record)) {
err = PTR_ERR(hrec->record);
kfree(hrec);
return err;
}
- ma = htab_is_percpu(htab) ? &htab->pcpu_ma : &htab->ma;
bpf_mem_alloc_set_dtor(ma, dtor, htab_dtor_ctx_free, hrec);
return 0;
}
@@ -535,9 +533,9 @@ static int htab_map_check_btf(struct bpf_map *map, const struct btf *btf,
* populated in htab_map_alloc(), so it will always appear as NULL.
*/
if (htab_is_percpu(htab))
- return htab_set_dtor(htab, htab_pcpu_mem_dtor);
+ return bpf_ma_set_dtor(map, &htab->pcpu_ma, htab_pcpu_mem_dtor);
else
- return htab_set_dtor(htab, htab_mem_dtor);
+ return bpf_ma_set_dtor(map, &htab->ma, htab_mem_dtor);
}
static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
@@ -2752,6 +2750,7 @@ struct bpf_rhtab {
struct rhashtable ht;
struct bpf_mem_alloc ma;
u32 elem_size;
+ bool freeing_internal;
};
static const struct rhashtable_params rhtab_params = {
@@ -2832,11 +2831,34 @@ static int rhtab_map_alloc_check(union bpf_attr *attr)
return htab_map_alloc_check(attr);
}
+static void rhtab_check_and_free_fields(struct bpf_rhtab *rhtab,
+ struct rhtab_elem *elem)
+{
+ if (IS_ERR_OR_NULL(rhtab->map.record))
+ return;
+
+ bpf_obj_free_fields(rhtab->map.record,
+ rhtab_elem_value(elem, rhtab->map.key_size));
+}
+
+static void rhtab_mem_dtor(void *obj, void *ctx)
+{
+ struct htab_btf_record *hrec = ctx;
+ struct rhtab_elem *elem = obj;
+
+ if (IS_ERR_OR_NULL(hrec->record))
+ return;
+
+ bpf_obj_free_fields(hrec->record,
+ rhtab_elem_value(elem, hrec->key_size));
+}
+
static void rhtab_free_elem(void *ptr, void *arg)
{
struct bpf_rhtab *rhtab = arg;
struct rhtab_elem *elem = ptr;
+ bpf_map_free_internal_structs(&rhtab->map, rhtab_elem_value(elem, rhtab->map.key_size));
bpf_mem_cache_free_rcu(&rhtab->ma, elem);
}
@@ -2900,7 +2922,8 @@ static int rhtab_delete_elem(struct bpf_rhtab *rhtab, struct rhtab_elem *elem, v
rhtab_read_elem_value(&rhtab->map, copy, elem, flags);
check_and_init_map_value(&rhtab->map, copy);
}
-
+ /* Release internal structs: kptr, bpf_timer, task_work, wq */
+ rhtab_check_and_free_fields(rhtab, elem);
bpf_mem_cache_free_rcu(&rhtab->ma, elem);
return 0;
}
@@ -2942,6 +2965,7 @@ static int rhtab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, void
static long rhtab_map_update_existing(struct bpf_map *map, struct rhtab_elem *elem, void *value,
u64 map_flags)
{
+ struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map);
void *old_val = rhtab_elem_value(elem, map->key_size);
if (map_flags & BPF_NOEXIST)
@@ -2951,6 +2975,17 @@ static long rhtab_map_update_existing(struct bpf_map *map, struct rhtab_elem *el
copy_map_value_locked(map, old_val, value, false);
else
copy_map_value(map, old_val, value);
+
+ /*
+ * Torn reads: a concurrent reader without BPF_F_LOCK may observe
+ * the value mid-copy. Callers requiring consistent reads must use
+ * BPF_F_LOCK, matching arraymap semantics.
+ *
+ * copy_map_value() skips special-field offsets, so old timers/
+ * kptrs/etc. still sit in the slot. Cancel them after the copy
+ * to match arraymap's update semantics.
+ */
+ rhtab_check_and_free_fields(rhtab, elem);
return 0;
}
@@ -2973,6 +3008,14 @@ static long rhtab_map_update_elem(struct bpf_map *map, void *key, void *value, u
if (map_flags & BPF_EXIST)
return -ENOENT;
+ /*
+ * Reject new insertions while map_release_uref cleanup walks the
+ * table. Without this, new elements could keep triggering rehash
+ * and prevent the walk from terminating.
+ */
+ if (READ_ONCE(rhtab->freeing_internal))
+ return -EBUSY;
+
/* Check max_entries limit before inserting new element */
if (atomic_read(&rhtab->ht.nelems) >= map->max_entries)
return -E2BIG;
@@ -2983,6 +3026,7 @@ static long rhtab_map_update_elem(struct bpf_map *map, void *key, void *value, u
memcpy(elem->data, key, map->key_size);
copy_map_value(map, rhtab_elem_value(elem, map->key_size), value);
+ check_and_init_map_value(map, rhtab_elem_value(elem, map->key_size));
/* Prevent deadlock for NMI programs attempting to take bucket lock */
bpf_disable_instrumentation();
@@ -3015,8 +3059,54 @@ static int rhtab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
return insn - insn_buf;
}
+static int rhtab_map_check_btf(struct bpf_map *map, const struct btf *btf,
+ const struct btf_type *key_type,
+ const struct btf_type *value_type)
+{
+ struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map);
+
+ return bpf_ma_set_dtor(map, &rhtab->ma, rhtab_mem_dtor);
+}
+
static void rhtab_map_free_internal_structs(struct bpf_map *map)
{
+ struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map);
+ struct rhashtable_iter iter;
+ struct rhtab_elem *elem;
+
+ if (!bpf_map_has_internal_structs(map))
+ return;
+
+ /*
+ * Block new insertions. Once observed, no new growth is triggered,
+ * so any in-flight rehash will drain and the walker is guaranteed
+ * to stop returning -EAGAIN. Treat -EAGAIN as "rehash in progress,
+ * retry"; do not wait for the worker.
+ */
+ WRITE_ONCE(rhtab->freeing_internal, true);
+
+ rhashtable_walk_enter(&rhtab->ht, &iter);
+ rhashtable_walk_start(&iter);
+
+ while ((elem = rhashtable_walk_next(&iter))) {
+ if (IS_ERR(elem)) {
+ if (PTR_ERR(elem) == -EAGAIN)
+ continue;
+ break;
+ }
+
+ bpf_map_free_internal_structs(map, rhtab_elem_value(elem, map->key_size));
+
+ if (need_resched()) { /* Avoid stalls on large maps */
+ rhashtable_walk_stop(&iter);
+ cond_resched();
+ rhashtable_walk_start(&iter);
+ }
+ }
+
+ rhashtable_walk_stop(&iter);
+ rhashtable_walk_exit(&iter);
+ WRITE_ONCE(rhtab->freeing_internal, false);
}
static int rhtab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
@@ -3382,6 +3472,7 @@ const struct bpf_map_ops rhtab_map_ops = {
.map_free = rhtab_map_free,
.map_get_next_key = rhtab_map_get_next_key,
.map_release_uref = rhtab_map_free_internal_structs,
+ .map_check_btf = rhtab_map_check_btf,
.map_lookup_elem = rhtab_map_lookup_elem,
.map_lookup_and_delete_elem = rhtab_map_lookup_and_delete_elem,
.map_update_elem = rhtab_map_update_elem,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 1faae184de48..31a3b70a0b5d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1280,6 +1280,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
case BPF_SPIN_LOCK:
case BPF_RES_SPIN_LOCK:
if (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_RHASH &&
map->map_type != BPF_MAP_TYPE_ARRAY &&
map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
@@ -1294,6 +1295,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
case BPF_WORKQUEUE:
case BPF_TASK_WORK:
if (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_RHASH &&
map->map_type != BPF_MAP_TYPE_LRU_HASH &&
map->map_type != BPF_MAP_TYPE_ARRAY) {
ret = -EOPNOTSUPP;
@@ -1305,6 +1307,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
case BPF_KPTR_PERCPU:
case BPF_REFCOUNT:
if (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_RHASH &&
map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
map->map_type != BPF_MAP_TYPE_LRU_HASH &&
map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH &&