diff options
Diffstat (limited to 'kernel/bpf')
| -rw-r--r-- | kernel/bpf/bpf_local_storage.c | 4 | ||||
| -rw-r--r-- | kernel/bpf/btf.c | 199 | ||||
| -rw-r--r-- | kernel/bpf/core.c | 13 | ||||
| -rw-r--r-- | kernel/bpf/hashtab.c | 4 | ||||
| -rw-r--r-- | kernel/bpf/helpers.c | 94 | ||||
| -rw-r--r-- | kernel/bpf/memalloc.c | 5 | ||||
| -rw-r--r-- | kernel/bpf/syscall.c | 48 | ||||
| -rw-r--r-- | kernel/bpf/verifier.c | 606 |
8 files changed, 795 insertions, 178 deletions
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 373c3c2c75bc..35f4138a54dc 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -568,8 +568,8 @@ static struct bpf_local_storage_map *__bpf_local_storage_map_alloc(union bpf_att nbuckets = max_t(u32, 2, nbuckets); smap->bucket_log = ilog2(nbuckets); - smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets, - GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT); + smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets), + nbuckets, GFP_USER | __GFP_NOWARN); if (!smap->buckets) { bpf_map_area_free(smap); return ERR_PTR(-ENOMEM); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 740bdb045b14..fa22ec79ac0e 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3324,12 +3324,14 @@ static const char *btf_find_decl_tag_value(const struct btf *btf, return NULL; } -static int btf_find_list_head(const struct btf *btf, const struct btf_type *pt, - const struct btf_type *t, int comp_idx, - u32 off, int sz, struct btf_field_info *info) +static int +btf_find_graph_root(const struct btf *btf, const struct btf_type *pt, + const struct btf_type *t, int comp_idx, u32 off, + int sz, struct btf_field_info *info, + enum btf_field_type head_type) { + const char *node_field_name; const char *value_type; - const char *list_node; s32 id; if (!__btf_type_is_struct(t)) @@ -3339,26 +3341,32 @@ static int btf_find_list_head(const struct btf *btf, const struct btf_type *pt, value_type = btf_find_decl_tag_value(btf, pt, comp_idx, "contains:"); if (!value_type) return -EINVAL; - list_node = strstr(value_type, ":"); - if (!list_node) + node_field_name = strstr(value_type, ":"); + if (!node_field_name) return -EINVAL; - value_type = kstrndup(value_type, list_node - value_type, GFP_KERNEL | __GFP_NOWARN); + value_type = kstrndup(value_type, node_field_name - value_type, GFP_KERNEL | __GFP_NOWARN); if (!value_type) return -ENOMEM; id = btf_find_by_name_kind(btf, value_type, BTF_KIND_STRUCT); kfree(value_type); if (id < 0) return id; - list_node++; - if (str_is_empty(list_node)) + node_field_name++; + if (str_is_empty(node_field_name)) return -EINVAL; - info->type = BPF_LIST_HEAD; + info->type = head_type; info->off = off; info->graph_root.value_btf_id = id; - info->graph_root.node_name = list_node; + info->graph_root.node_name = node_field_name; return BTF_FIELD_FOUND; } +#define field_mask_test_name(field_type, field_type_str) \ + if (field_mask & field_type && !strcmp(name, field_type_str)) { \ + type = field_type; \ + goto end; \ + } + static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask, int *align, int *sz) { @@ -3382,18 +3390,11 @@ static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask, goto end; } } - if (field_mask & BPF_LIST_HEAD) { - if (!strcmp(name, "bpf_list_head")) { - type = BPF_LIST_HEAD; - goto end; - } - } - if (field_mask & BPF_LIST_NODE) { - if (!strcmp(name, "bpf_list_node")) { - type = BPF_LIST_NODE; - goto end; - } - } + field_mask_test_name(BPF_LIST_HEAD, "bpf_list_head"); + field_mask_test_name(BPF_LIST_NODE, "bpf_list_node"); + field_mask_test_name(BPF_RB_ROOT, "bpf_rb_root"); + field_mask_test_name(BPF_RB_NODE, "bpf_rb_node"); + /* Only return BPF_KPTR when all other types with matchable names fail */ if (field_mask & BPF_KPTR) { type = BPF_KPTR_REF; @@ -3406,6 +3407,8 @@ end: return type; } +#undef field_mask_test_name + static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t, u32 field_mask, struct btf_field_info *info, int info_cnt) @@ -3438,6 +3441,7 @@ static int btf_find_struct_field(const struct btf *btf, case BPF_SPIN_LOCK: case BPF_TIMER: case BPF_LIST_NODE: + case BPF_RB_NODE: ret = btf_find_struct(btf, member_type, off, sz, field_type, idx < info_cnt ? &info[idx] : &tmp); if (ret < 0) @@ -3451,8 +3455,11 @@ static int btf_find_struct_field(const struct btf *btf, return ret; break; case BPF_LIST_HEAD: - ret = btf_find_list_head(btf, t, member_type, i, off, sz, - idx < info_cnt ? &info[idx] : &tmp); + case BPF_RB_ROOT: + ret = btf_find_graph_root(btf, t, member_type, + i, off, sz, + idx < info_cnt ? &info[idx] : &tmp, + field_type); if (ret < 0) return ret; break; @@ -3499,6 +3506,7 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, case BPF_SPIN_LOCK: case BPF_TIMER: case BPF_LIST_NODE: + case BPF_RB_NODE: ret = btf_find_struct(btf, var_type, off, sz, field_type, idx < info_cnt ? &info[idx] : &tmp); if (ret < 0) @@ -3512,8 +3520,11 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, return ret; break; case BPF_LIST_HEAD: - ret = btf_find_list_head(btf, var, var_type, -1, off, sz, - idx < info_cnt ? &info[idx] : &tmp); + case BPF_RB_ROOT: + ret = btf_find_graph_root(btf, var, var_type, + -1, off, sz, + idx < info_cnt ? &info[idx] : &tmp, + field_type); if (ret < 0) return ret; break; @@ -3615,8 +3626,11 @@ end_btf: return ret; } -static int btf_parse_list_head(const struct btf *btf, struct btf_field *field, - struct btf_field_info *info) +static int btf_parse_graph_root(const struct btf *btf, + struct btf_field *field, + struct btf_field_info *info, + const char *node_type_name, + size_t node_type_align) { const struct btf_type *t, *n = NULL; const struct btf_member *member; @@ -3638,13 +3652,13 @@ static int btf_parse_list_head(const struct btf *btf, struct btf_field *field, n = btf_type_by_id(btf, member->type); if (!__btf_type_is_struct(n)) return -EINVAL; - if (strcmp("bpf_list_node", __btf_name_by_offset(btf, n->name_off))) + if (strcmp(node_type_name, __btf_name_by_offset(btf, n->name_off))) return -EINVAL; offset = __btf_member_bit_offset(n, member); if (offset % 8) return -EINVAL; offset /= 8; - if (offset % __alignof__(struct bpf_list_node)) + if (offset % node_type_align) return -EINVAL; field->graph_root.btf = (struct btf *)btf; @@ -3656,6 +3670,20 @@ static int btf_parse_list_head(const struct btf *btf, struct btf_field *field, return 0; } +static int btf_parse_list_head(const struct btf *btf, struct btf_field *field, + struct btf_field_info *info) +{ + return btf_parse_graph_root(btf, field, info, "bpf_list_node", + __alignof__(struct bpf_list_node)); +} + +static int btf_parse_rb_root(const struct btf *btf, struct btf_field *field, + struct btf_field_info *info) +{ + return btf_parse_graph_root(btf, field, info, "bpf_rb_node", + __alignof__(struct bpf_rb_node)); +} + struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t, u32 field_mask, u32 value_size) { @@ -3718,7 +3746,13 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type if (ret < 0) goto end; break; + case BPF_RB_ROOT: + ret = btf_parse_rb_root(btf, &rec->fields[i], &info_arr[i]); + if (ret < 0) + goto end; + break; case BPF_LIST_NODE: + case BPF_RB_NODE: break; default: ret = -EFAULT; @@ -3727,8 +3761,33 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type rec->cnt++; } - /* bpf_list_head requires bpf_spin_lock */ - if (btf_record_has_field(rec, BPF_LIST_HEAD) && rec->spin_lock_off < 0) { + /* bpf_{list_head, rb_node} require bpf_spin_lock */ + if ((btf_record_has_field(rec, BPF_LIST_HEAD) || + btf_record_has_field(rec, BPF_RB_ROOT)) && rec->spin_lock_off < 0) { + ret = -EINVAL; + goto end; + } + + /* need collection identity for non-owning refs before allowing this + * + * Consider a node type w/ both list and rb_node fields: + * struct node { + * struct bpf_list_node l; + * struct bpf_rb_node r; + * } + * + * Used like so: + * struct node *n = bpf_obj_new(....); + * bpf_list_push_front(&list_head, &n->l); + * bpf_rbtree_remove(&rb_root, &n->r); + * + * It should not be possible to rbtree_remove the node since it hasn't + * been added to a tree. But push_front converts n to a non-owning + * reference, and rbtree_remove accepts the non-owning reference to + * a type w/ bpf_rb_node field. + */ + if (btf_record_has_field(rec, BPF_LIST_NODE) && + btf_record_has_field(rec, BPF_RB_NODE)) { ret = -EINVAL; goto end; } @@ -3739,22 +3798,28 @@ end: return ERR_PTR(ret); } +#define GRAPH_ROOT_MASK (BPF_LIST_HEAD | BPF_RB_ROOT) +#define GRAPH_NODE_MASK (BPF_LIST_NODE | BPF_RB_NODE) + int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec) { int i; - /* There are two owning types, kptr_ref and bpf_list_head. The former - * only supports storing kernel types, which can never store references - * to program allocated local types, atleast not yet. Hence we only need - * to ensure that bpf_list_head ownership does not form cycles. + /* There are three types that signify ownership of some other type: + * kptr_ref, bpf_list_head, bpf_rb_root. + * kptr_ref only supports storing kernel types, which can't store + * references to program allocated local types. + * + * Hence we only need to ensure that bpf_{list_head,rb_root} ownership + * does not form cycles. */ - if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & BPF_LIST_HEAD)) + if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & GRAPH_ROOT_MASK)) return 0; for (i = 0; i < rec->cnt; i++) { struct btf_struct_meta *meta; u32 btf_id; - if (!(rec->fields[i].type & BPF_LIST_HEAD)) + if (!(rec->fields[i].type & GRAPH_ROOT_MASK)) continue; btf_id = rec->fields[i].graph_root.value_btf_id; meta = btf_find_struct_meta(btf, btf_id); @@ -3762,39 +3827,47 @@ int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec) return -EFAULT; rec->fields[i].graph_root.value_rec = meta->record; - if (!(rec->field_mask & BPF_LIST_NODE)) + /* We need to set value_rec for all root types, but no need + * to check ownership cycle for a type unless it's also a + * node type. + */ + if (!(rec->field_mask & GRAPH_NODE_MASK)) continue; /* We need to ensure ownership acyclicity among all types. The * proper way to do it would be to topologically sort all BTF * IDs based on the ownership edges, since there can be multiple - * bpf_list_head in a type. Instead, we use the following - * reasoning: + * bpf_{list_head,rb_node} in a type. Instead, we use the + * following resaoning: * * - A type can only be owned by another type in user BTF if it - * has a bpf_list_node. + * has a bpf_{list,rb}_node. Let's call these node types. * - A type can only _own_ another type in user BTF if it has a - * bpf_list_head. + * bpf_{list_head,rb_root}. Let's call these root types. * - * We ensure that if a type has both bpf_list_head and - * bpf_list_node, its element types cannot be owning types. + * We ensure that if a type is both a root and node, its + * element types cannot be root types. * * To ensure acyclicity: * - * When A only has bpf_list_head, ownership chain can be: + * When A is an root type but not a node, its ownership + * chain can be: * A -> B -> C * Where: - * - B has both bpf_list_head and bpf_list_node. - * - C only has bpf_list_node. + * - A is an root, e.g. has bpf_rb_root. + * - B is both a root and node, e.g. has bpf_rb_node and + * bpf_list_head. + * - C is only an root, e.g. has bpf_list_node * - * When A has both bpf_list_head and bpf_list_node, some other - * type already owns it in the BTF domain, hence it can not own - * another owning type through any of the bpf_list_head edges. + * When A is both a root and node, some other type already + * owns it in the BTF domain, hence it can not own + * another root type through any of the ownership edges. * A -> B * Where: - * - B only has bpf_list_node. + * - A is both an root and node. + * - B is only an node. */ - if (meta->record->field_mask & BPF_LIST_HEAD) + if (meta->record->field_mask & GRAPH_ROOT_MASK) return -ELOOP; } return 0; @@ -5256,6 +5329,8 @@ static const char *alloc_obj_fields[] = { "bpf_spin_lock", "bpf_list_head", "bpf_list_node", + "bpf_rb_root", + "bpf_rb_node", }; static struct btf_struct_metas * @@ -5329,7 +5404,8 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf) type = &tab->types[tab->cnt]; type->btf_id = i; - record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE, t->size); + record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE | + BPF_RB_ROOT | BPF_RB_NODE, t->size); /* The record cannot be unset, treat it as an error if so */ if (IS_ERR_OR_NULL(record)) { ret = PTR_ERR_OR_ZERO(record) ?: -EFAULT; @@ -5593,6 +5669,7 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, if (!ctx_struct) /* should not happen */ return NULL; +again: ctx_tname = btf_name_by_offset(btf_vmlinux, ctx_struct->name_off); if (!ctx_tname) { /* should not happen */ @@ -5606,8 +5683,16 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, * int socket_filter_bpf_prog(struct __sk_buff *skb) * { // no fields of skb are ever used } */ - if (strcmp(ctx_tname, tname)) - return NULL; + if (strcmp(ctx_tname, tname)) { + /* bpf_user_pt_regs_t is a typedef, so resolve it to + * underlying struct and check name again + */ + if (!btf_type_is_modifier(ctx_struct)) + return NULL; + while (btf_type_is_modifier(ctx_struct)) + ctx_struct = btf_type_by_id(btf_vmlinux, ctx_struct->type); + goto again; + } return ctx_type; } diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 16da51093aff..3390961c4e10 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -35,6 +35,7 @@ #include <linux/bpf_verifier.h> #include <linux/nodemask.h> #include <linux/bpf_mem_alloc.h> +#include <linux/memcontrol.h> #include <asm/barrier.h> #include <asm/unaligned.h> @@ -87,7 +88,7 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags) { - gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags; + gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags); struct bpf_prog_aux *aux; struct bpf_prog *fp; @@ -96,12 +97,12 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag if (fp == NULL) return NULL; - aux = kzalloc(sizeof(*aux), GFP_KERNEL_ACCOUNT | gfp_extra_flags); + aux = kzalloc(sizeof(*aux), bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags)); if (aux == NULL) { vfree(fp); return NULL; } - fp->active = alloc_percpu_gfp(int, GFP_KERNEL_ACCOUNT | gfp_extra_flags); + fp->active = alloc_percpu_gfp(int, bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags)); if (!fp->active) { vfree(fp); kfree(aux); @@ -126,7 +127,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) { - gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags; + gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags); struct bpf_prog *prog; int cpu; @@ -159,7 +160,7 @@ int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog) prog->aux->jited_linfo = kvcalloc(prog->aux->nr_linfo, sizeof(*prog->aux->jited_linfo), - GFP_KERNEL_ACCOUNT | __GFP_NOWARN); + bpf_memcg_flags(GFP_KERNEL | __GFP_NOWARN)); if (!prog->aux->jited_linfo) return -ENOMEM; @@ -234,7 +235,7 @@ void bpf_prog_fill_jited_linfo(struct bpf_prog *prog, struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags) { - gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags; + gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags); struct bpf_prog *fp; u32 pages; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 66bded144377..5dfcb5ad0d06 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1004,8 +1004,6 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, l_new = ERR_PTR(-ENOMEM); goto dec_count; } - check_and_init_map_value(&htab->map, - l_new->key + round_up(key_size, 8)); } memcpy(l_new->key, key, key_size); @@ -1592,6 +1590,7 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, else copy_map_value(map, value, l->key + roundup_key_size); + /* Zeroing special fields in the temp buffer */ check_and_init_map_value(map, value); } @@ -1792,6 +1791,7 @@ again_nocopy: true); else copy_map_value(map, dst_val, value); + /* Zeroing special fields in the temp buffer */ check_and_init_map_value(map, dst_val); } if (do_delete) { diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 2dae44581922..5b278a38ae58 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1772,6 +1772,46 @@ unlock: } } +/* Like rbtree_postorder_for_each_entry_safe, but 'pos' and 'n' are + * 'rb_node *', so field name of rb_node within containing struct is not + * needed. + * + * Since bpf_rb_tree's node type has a corresponding struct btf_field with + * graph_root.node_offset, it's not necessary to know field name + * or type of node struct + */ +#define bpf_rbtree_postorder_for_each_entry_safe(pos, n, root) \ + for (pos = rb_first_postorder(root); \ + pos && ({ n = rb_next_postorder(pos); 1; }); \ + pos = n) + +void bpf_rb_root_free(const struct btf_field *field, void *rb_root, + struct bpf_spin_lock *spin_lock) +{ + struct rb_root_cached orig_root, *root = rb_root; + struct rb_node *pos, *n; + void *obj; + + BUILD_BUG_ON(sizeof(struct rb_root_cached) > sizeof(struct bpf_rb_root)); + BUILD_BUG_ON(__alignof__(struct rb_root_cached) > __alignof__(struct bpf_rb_root)); + + __bpf_spin_lock_irqsave(spin_lock); + orig_root = *root; + *root = RB_ROOT_CACHED; + __bpf_spin_unlock_irqrestore(spin_lock); + + bpf_rbtree_postorder_for_each_entry_safe(pos, n, &orig_root.rb_root) { + obj = pos; + obj -= field->graph_root.node_offset; + + bpf_obj_free_fields(field->graph_root.value_rec, obj); + + migrate_disable(); + bpf_mem_free(&bpf_global_ma, obj); + migrate_enable(); + } +} + __diag_push(); __diag_ignore_all("-Wmissing-prototypes", "Global functions as their definitions will be in vmlinux BTF"); @@ -1844,6 +1884,56 @@ __bpf_kfunc struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) return __bpf_list_del(head, true); } +__bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root, + struct bpf_rb_node *node) +{ + struct rb_root_cached *r = (struct rb_root_cached *)root; + struct rb_node *n = (struct rb_node *)node; + + rb_erase_cached(n, r); + RB_CLEAR_NODE(n); + return (struct bpf_rb_node *)n; +} + +/* Need to copy rbtree_add_cached's logic here because our 'less' is a BPF + * program + */ +static void __bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node, + void *less) +{ + struct rb_node **link = &((struct rb_root_cached *)root)->rb_root.rb_node; + bpf_callback_t cb = (bpf_callback_t)less; + struct rb_node *parent = NULL; + bool leftmost = true; + + while (*link) { + parent = *link; + if (cb((uintptr_t)node, (uintptr_t)parent, 0, 0, 0)) { + link = &parent->rb_left; + } else { + link = &parent->rb_right; + leftmost = false; + } + } + + rb_link_node((struct rb_node *)node, parent, link); + rb_insert_color_cached((struct rb_node *)node, + (struct rb_root_cached *)root, leftmost); +} + +__bpf_kfunc void bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node, + bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b)) +{ + __bpf_rbtree_add(root, node, (void *)less); +} + +__bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) +{ + struct rb_root_cached *r = (struct rb_root_cached *)root; + + return (struct bpf_rb_node *)rb_first_cached(r); +} + /** * bpf_task_acquire - Acquire a reference to a task. A task acquired by this * kfunc which is not stored in a map as a kptr, must be released by calling @@ -2068,6 +2158,10 @@ BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_task_acquire_not_zero, KF_ACQUIRE | KF_RCU | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE) +BTF_ID_FLAGS(func, bpf_rbtree_add) +BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL) + #ifdef CONFIG_CGROUPS BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_cgroup_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL) diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 1db156405b68..5fcdacbb8439 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -143,7 +143,7 @@ static void *__alloc(struct bpf_mem_cache *c, int node) return obj; } - return kmalloc_node(c->unit_size, flags, node); + return kmalloc_node(c->unit_size, flags | __GFP_ZERO, node); } static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c) @@ -395,7 +395,8 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) unit_size = size; #ifdef CONFIG_MEMCG_KMEM - objcg = get_obj_cgroup_from_current(); + if (memcg_bpf_enabled()) + objcg = get_obj_cgroup_from_current(); #endif for_each_possible_cpu(cpu) { c = per_cpu_ptr(pc, cpu); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index bcc97613de76..e3fcdc9836a6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -309,7 +309,7 @@ static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable) * __GFP_RETRY_MAYFAIL to avoid such situations. */ - const gfp_t gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_ACCOUNT; + gfp_t gfp = bpf_memcg_flags(__GFP_NOWARN | __GFP_ZERO); unsigned int flags = 0; unsigned long align = 1; void *area; @@ -418,7 +418,8 @@ static void bpf_map_save_memcg(struct bpf_map *map) * So we have to check map->objcg for being NULL each time it's * being used. */ - map->objcg = get_obj_cgroup_from_current(); + if (memcg_bpf_enabled()) + map->objcg = get_obj_cgroup_from_current(); } static void bpf_map_release_memcg(struct bpf_map *map) @@ -464,6 +465,21 @@ void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags) return ptr; } +void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size, + gfp_t flags) +{ + struct mem_cgroup *memcg, *old_memcg; + void *ptr; + + memcg = bpf_map_get_memcg(map); + old_memcg = set_active_memcg(memcg); + ptr = kvcalloc(n, size, flags | __GFP_ACCOUNT); + set_active_memcg(old_memcg); + mem_cgroup_put(memcg); + + return ptr; +} + void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, size_t align, gfp_t flags) { @@ -521,9 +537,6 @@ void btf_record_free(struct btf_record *rec) return; for (i = 0; i < rec->cnt; i++) { switch (rec->fields[i].type) { - case BPF_SPIN_LOCK: - case BPF_TIMER: - break; case BPF_KPTR_UNREF: case BPF_KPTR_REF: if (rec->fields[i].kptr.module) @@ -532,7 +545,11 @@ void btf_record_free(struct btf_record *rec) break; case BPF_LIST_HEAD: case BPF_LIST_NODE: - /* Nothing to release for bpf_list_head */ + case BPF_RB_ROOT: + case BPF_RB_NODE: + case BPF_SPIN_LOCK: + case BPF_TIMER: + /* Nothing to release */ break; default: WARN_ON_ONCE(1); @@ -565,9 +582,6 @@ struct btf_record *btf_record_dup(const struct btf_record *rec) new_rec->cnt = 0; for (i = 0; i < rec->cnt; i++) { switch (fields[i].type) { - case BPF_SPIN_LOCK: - case BPF_TIMER: - break; case BPF_KPTR_UNREF: case BPF_KPTR_REF: btf_get(fields[i].kptr.btf); @@ -578,7 +592,11 @@ struct btf_record *btf_record_dup(const struct btf_record *rec) break; case BPF_LIST_HEAD: case BPF_LIST_NODE: - /* Nothing to acquire for bpf_list_head */ + case BPF_RB_ROOT: + case BPF_RB_NODE: + case BPF_SPIN_LOCK: + case BPF_TIMER: + /* Nothing to acquire */ break; default: ret = -EFAULT; @@ -658,7 +676,13 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) continue; bpf_list_head_free(field, field_ptr, obj + rec->spin_lock_off); break; + case BPF_RB_ROOT: + if (WARN_ON_ONCE(rec->spin_lock_off < 0)) + continue; + bpf_rb_root_free(field, field_ptr, obj + rec->spin_lock_off); + break; case BPF_LIST_NODE: + case BPF_RB_NODE: break; default: WARN_ON_ONCE(1); @@ -994,7 +1018,8 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, return -EINVAL; map->record = btf_parse_fields(btf, value_type, - BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD, + BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD | + BPF_RB_ROOT, map->value_size); if (!IS_ERR_OR_NULL(map->record)) { int i; @@ -1042,6 +1067,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, } break; case BPF_LIST_HEAD: + case BPF_RB_ROOT: if (map->map_type != BPF_MAP_TYPE_HASH && map->map_type != BPF_MAP_TYPE_LRU_HASH && map->map_type != BPF_MAP_TYPE_ARRAY) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 388245e8826e..272563a0b770 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -190,6 +190,10 @@ struct bpf_verifier_stack_elem { static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx); static int release_reference(struct bpf_verifier_env *env, int ref_obj_id); +static void invalidate_non_owning_refs(struct bpf_verifier_env *env); +static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env); +static int ref_set_non_owning(struct bpf_verifier_env *env, + struct bpf_reg_state *reg); static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux) { @@ -457,6 +461,11 @@ static bool type_is_ptr_alloc_obj(u32 type) return base_type(type) == PTR_TO_BTF_ID && type_flag(type) & MEM_ALLOC; } +static bool type_is_non_owning_ref(u32 type) +{ + return type_is_ptr_alloc_obj(type) && type_flag(type) & NON_OWN_REF; +} + static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg) { struct btf_record *rec = NULL; @@ -1073,6 +1082,8 @@ static void print_verifier_state(struct bpf_verifier_env *env, verbose_a("id=%d", reg->id); if (reg->ref_obj_id) verbose_a("ref_obj_id=%d", reg->ref_obj_id); + if (type_is_non_owning_ref(reg->type)) + verbose_a("%s", "non_own_ref"); if (t != SCALAR_VALUE) verbose_a("off=%d", reg->off); if (type_is_pkt_pointer(t)) @@ -1632,6 +1643,16 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) reg->type &= ~PTR_MAYBE_NULL; } +static void mark_reg_graph_node(struct bpf_reg_state *regs, u32 regno, + struct btf_field_graph_root *ds_head) +{ + __mark_reg_known_zero(®s[regno]); + regs[regno].type = PTR_TO_BTF_ID | MEM_ALLOC; + regs[regno].btf = ds_head->btf; + regs[regno].btf_id = ds_head->value_btf_id; + regs[regno].off = ds_head->node_offset; +} + static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg) { return type_is_pkt_pointer(reg->type); @@ -3452,6 +3473,11 @@ static void save_register_state(struct bpf_func_state *state, scrub_spilled_slot(&state->stack[spi].slot_type[i - 1]); } +static bool is_bpf_st_mem(struct bpf_insn *insn) +{ + return BPF_CLASS(insn->code) == BPF_ST && BPF_MODE(insn->code) == BPF_MEM; +} + /* check_stack_{read,write}_fixed_off functions track spill/fill of registers, * stack boundary and alignment are checked in check_mem_access() */ @@ -3463,8 +3489,9 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, { struct bpf_func_state *cur; /* state of the current function */ int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; - u32 dst_reg = env->prog->insnsi[insn_idx].dst_reg; + struct bpf_insn *insn = &env->prog->insnsi[insn_idx]; struct bpf_reg_state *reg = NULL; + u32 dst_reg = insn->dst_reg; err = grow_stack_state(state, round_up(slot + 1, BPF_REG_SIZE)); if (err) @@ -3517,6 +3544,13 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, return err; } save_register_state(state, spi, reg, size); + } else if (!reg && !(off % BPF_REG_SIZE) && is_bpf_st_mem(insn) && + insn->imm != 0 && env->bpf_capable) { + struct bpf_reg_state fake_reg = {}; + + __mark_reg_known(&fake_reg, (u32)insn->imm); + fake_reg.type = SCALAR_VALUE; + save_register_state(state, spi, &fake_reg, size); } else if (reg && is_spillable_regtype(reg->type)) { /* register containing pointer is being spilled into stack */ if (size != BPF_REG_SIZE) { @@ -3551,7 +3585,8 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; /* when we zero initialize stack slots mark them as such */ - if (reg && register_is_null(reg)) { + if ((reg && register_is_null(reg)) || + (!reg && is_bpf_st_mem(insn) && insn->imm == 0)) { /* backtracking doesn't work for STACK_ZERO yet. */ err = mark_chain_precision(env, value_regno); if (err) @@ -3596,6 +3631,7 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env, int min_off, max_off; int i, err; struct bpf_reg_state *ptr_reg = NULL, *value_reg = NULL; + struct bpf_insn *insn = &env->prog->insnsi[insn_idx]; bool writing_zero = false; /* set if the fact that we're writing a zero is used to let any * stack slots remain STACK_ZERO @@ -3608,7 +3644,8 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env, max_off = ptr_reg->smax_value + off + size; if (value_regno >= 0) value_reg = &cur->regs[value_regno]; - if (value_reg && register_is_null(value_reg)) + if ((value_reg && register_is_null(value_reg)) || + (!value_reg && is_bpf_st_mem(insn) && insn->imm == 0)) writing_zero = true; err = grow_stack_state(state, round_up(-min_off, BPF_REG_SIZE)); @@ -5052,7 +5089,8 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, return -EACCES; } - if (type_is_alloc(reg->type) && !reg->ref_obj_id) { + if (type_is_alloc(reg->type) && !type_is_non_owning_ref(reg->type) && + !reg->ref_obj_id) { verbose(env, "verifier internal error: ref_obj_id for allocated object must be non-zero\n"); return -EFAULT; } @@ -6042,9 +6080,7 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, cur->active_lock.ptr = btf; cur->active_lock.id = reg->id; } else { - struct bpf_func_state *fstate = cur_func(env); void *ptr; - int i; if (map) ptr = map; @@ -6060,25 +6096,11 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, verbose(env, "bpf_spin_unlock of different lock\n"); return -EINVAL; } - cur->active_lock.ptr = NULL; - cur->active_lock.id = 0; - for (i = fstate->acquired_refs - 1; i >= 0; i--) { - int err; + invalidate_non_owning_refs(env); - /* Complain on error because this reference state cannot - * be freed before this point, as bpf_spin_lock critical - * section does not allow functions that release the - * allocated object immediately. - */ - if (!fstate->refs[i].release_on_unlock) - continue; - err = release_reference(env, fstate->refs[i].id); - if (err) { - verbose(env, "failed to release release_on_unlock reference"); - return err; - } - } + cur->active_lock.ptr = NULL; + cur->active_lock.id = 0; } return 0; } @@ -6546,6 +6568,23 @@ found: return 0; } +static struct btf_field * +reg_find_field_offset(const struct bpf_reg_state *reg, s32 off, u32 fields) +{ + struct btf_field *field; + struct btf_record *rec; + + rec = reg_btf_record(reg); + if (!rec) + return NULL; + + field = btf_record_find(rec, off, fields); + if (!field) + return NULL; + + return field; +} + int check_func_arg_reg_off(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno, enum bpf_arg_type arg_type) @@ -6567,6 +6606,18 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, */ if (arg_type_is_dynptr(arg_type) && type == PTR_TO_STACK) return 0; + + if ((type_is_ptr_alloc_obj(type) || type_is_non_owning_ref(type)) && reg->off) { + if (reg_find_field_offset(reg, reg->off, BPF_GRAPH_NODE_OR_ROOT)) + return __check_ptr_off_reg(env, reg, regno, true); + + verbose(env, "R%d must have zero offset when passed to release func\n", + regno); + verbose(env, "No graph node or root found at R%d type:%s off:%d\n", regno, + kernel_type_name(reg->btf, reg->btf_id), reg->off); + return -EINVAL; + } + /* Doing check_ptr_off_reg check for the offset will catch this * because fixed_off_ok is false, but checking here allows us * to give the user a better error message. @@ -6601,6 +6652,7 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, case PTR_TO_BTF_ID | PTR_TRUSTED: case PTR_TO_BTF_ID | MEM_RCU: case PTR_TO_BTF_ID | MEM_ALLOC | PTR_TRUSTED: + case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF: /* When referenced PTR_TO_BTF_ID is passed to release function, * its fixed offset must be 0. In the other cases, fixed offset * can be non-zero. This was already checked above. So pass @@ -6812,6 +6864,10 @@ skip_type_check: meta->ret_btf_id = reg->btf_id; break; case ARG_PTR_TO_SPIN_LOCK: + if (in_rbtree_lock_required_cb(env)) { + verbose(env, "can't spin_{lock,unlock} in rbtree cb\n"); + return -EACCES; + } if (meta->func_id == BPF_FUNC_spin_lock) { err = process_spin_lock(env, regno, true); if (err) @@ -7363,6 +7419,17 @@ static int release_reference(struct bpf_verifier_env *env, return 0; } +static void invalidate_non_owning_refs(struct bpf_verifier_env *env) +{ + struct bpf_func_state *unused; + struct bpf_reg_state *reg; + + bpf_for_each_reg_in_vstate(env->cur_state, unused, reg, ({ + if (type_is_non_owning_ref(reg->type)) + __mark_reg_unknown(env, reg); + })); +} + static void clear_caller_saved_regs(struct bpf_verifier_env *env, struct bpf_reg_state *regs) { @@ -7384,6 +7451,8 @@ static int set_callee_state(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee, int insn_idx); +static bool is_callback_calling_kfunc(u32 btf_id); + static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx, int subprog, set_callee_state_fn set_callee_state_cb) @@ -7438,10 +7507,18 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn * interested in validating only BPF helpers that can call subprogs as * callbacks */ - if (set_callee_state_cb != set_callee_state && !is_callback_calling_function(insn->imm)) { - verbose(env, "verifier bug: helper %s#%d is not marked as callback-calling\n", - func_id_name(insn->imm), insn->imm); - return -EFAULT; + if (set_callee_state_cb != set_callee_state) { + if (bpf_pseudo_kfunc_call(insn) && + !is_callback_calling_kfunc(insn->imm)) { + verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n", + func_id_name(insn->imm), insn->imm); + return -EFAULT; + } else if (!bpf_pseudo_kfunc_call(insn) && + !is_callback_calling_function(insn->imm)) { /* helper */ + verbose(env, "verifier bug: helper %s#%d not marked as callback-calling\n", + func_id_name(insn->imm), insn->imm); + return -EFAULT; + } } if (insn->code == (BPF_JMP | BPF_CALL) && @@ -7706,6 +7783,63 @@ static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env, return 0; } +static int set_rbtree_add_callback_state(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee, + int insn_idx) +{ + /* void bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node, + * bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b)); + * + * 'struct bpf_rb_node *node' arg to bpf_rbtree_add is the same PTR_TO_BTF_ID w/ offset + * that 'less' callback args will be receiving. However, 'node' arg was release_reference'd + * by this point, so look at 'root' + */ + struct btf_field *field; + + field = reg_find_field_offset(&caller->regs[BPF_REG_1], caller->regs[BPF_REG_1].off, + BPF_RB_ROOT); + if (!field || !field->graph_root.value_btf_id) + return -EFAULT; + + mark_reg_graph_node(callee->regs, BPF_REG_1, &field->graph_root); + ref_set_non_owning(env, &callee->regs[BPF_REG_1]); + mark_reg_graph_node(callee->regs, BPF_REG_2, &field->graph_root); + ref_set_non_owning(env, &callee->regs[BPF_REG_2]); + + __mark_reg_not_init(env, &callee->regs[BPF_REG_3]); + __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); + __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); + callee->in_callback_fn = true; + callee->callback_ret_range = tnum_range(0, 1); + return 0; +} + +static bool is_rbtree_lock_required_kfunc(u32 btf_id); + +/* Are we currently verifying the callback for a rbtree helper that must + * be called with lock held? If so, no need to complain about unreleased + * lock + */ +static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env) +{ + struct bpf_verifier_state *state = env->cur_state; + struct bpf_insn *insn = env->prog->insnsi; + struct bpf_func_state *callee; + int kfunc_btf_id; + + if (!state->curframe) + return false; + + callee = state->frame[state->curframe]; + + if (!callee->in_callback_fn) + return false; + + kfunc_btf_id = insn[callee->callsite].imm; + return is_rbtree_lock_required_kfunc(kfunc_btf_id); +} + static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) { struct bpf_verifier_state *state = env->cur_state; @@ -8474,6 +8608,7 @@ struct bpf_kfunc_call_arg_meta { bool r0_rdonly; u32 ret_btf_id; u64 r0_size; + u32 subprogno; struct { u64 value; bool found; @@ -8485,6 +8620,9 @@ struct bpf_kfunc_call_arg_meta { struct { struct btf_field *field; } arg_list_head; + struct { + struct btf_field *field; + } arg_rbtree_root; }; static bool is_kfunc_acquire(struct bpf_kfunc_call_arg_meta *meta) @@ -8596,12 +8734,16 @@ enum { KF_ARG_DYNPTR_ID, KF_ARG_LIST_HEAD_ID, KF_ARG_LIST_NODE_ID, + KF_ARG_RB_ROOT_ID, + KF_ARG_RB_NODE_ID, }; BTF_ID_LIST(kf_arg_btf_ids) BTF_ID(struct, bpf_dynptr_kern) BTF_ID(struct, bpf_list_head) BTF_ID(struct, bpf_list_node) +BTF_ID(struct, bpf_rb_root) +BTF_ID(struct, bpf_rb_node) static bool __is_kfunc_ptr_arg_type(const struct btf *btf, const struct btf_param *arg, int type) @@ -8635,6 +8777,28 @@ static bool is_kfunc_arg_list_node(const struct btf *btf, const struct btf_param return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_LIST_NODE_ID); } +static bool is_kfunc_arg_rbtree_root(const struct btf *btf, const struct btf_param *arg) +{ + return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RB_ROOT_ID); +} + +static bool is_kfunc_arg_rbtree_node(const struct btf *btf, const struct btf_param *arg) +{ + return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_RB_NODE_ID); +} + +static bool is_kfunc_arg_callback(struct bpf_verifier_env *env, const struct btf *btf, + const struct btf_param *arg) +{ + const struct btf_type *t; + + t = btf_type_resolve_func_ptr(btf, arg->type, NULL); + if (!t) + return false; + + return true; +} + /* Returns true if struct is composed of scalars, 4 levels of nesting allowed */ static bool __btf_type_is_scalar_struct(struct bpf_verifier_env *env, const struct btf *btf, @@ -8694,6 +8858,9 @@ enum kfunc_ptr_arg_type { KF_ARG_PTR_TO_BTF_ID, /* Also covers reg2btf_ids conversions */ KF_ARG_PTR_TO_MEM, KF_ARG_PTR_TO_MEM_SIZE, /* Size derived from next argument, skip it */ + KF_ARG_PTR_TO_CALLBACK, + KF_ARG_PTR_TO_RB_ROOT, + KF_ARG_PTR_TO_RB_NODE, }; enum special_kfunc_type { @@ -8707,6 +8874,9 @@ enum special_kfunc_type { KF_bpf_rdonly_cast, KF_bpf_rcu_read_lock, KF_bpf_rcu_read_unlock, + KF_bpf_rbtree_remove, + KF_bpf_rbtree_add, + KF_bpf_rbtree_first, }; BTF_SET_START(special_kfunc_set) @@ -8718,6 +8888,9 @@ BTF_ID(func, bpf_list_pop_front) BTF_ID(func, bpf_list_pop_back) BTF_ID(func, bpf_cast_to_kern_ctx) BTF_ID(func, bpf_rdonly_cast) +BTF_ID(func, bpf_rbtree_remove) +BTF_ID(func, bpf_rbtree_add) +BTF_ID(func, bpf_rbtree_first) BTF_SET_END(special_kfunc_set) BTF_ID_LIST(special_kfunc_list) @@ -8731,6 +8904,9 @@ BTF_ID(func, bpf_cast_to_kern_ctx) BTF_ID(func, bpf_rdonly_cast) BTF_ID(func, bpf_rcu_read_lock) BTF_ID(func, bpf_rcu_read_unlock) +BTF_ID(func, bpf_rbtree_remove) +BTF_ID(func, bpf_rbtree_add) +BTF_ID(func, bpf_rbtree_first) static bool is_kfunc_bpf_rcu_read_lock(struct bpf_kfunc_call_arg_meta *meta) { @@ -8792,6 +8968,12 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, if (is_kfunc_arg_list_node(meta->btf, &args[argno])) return KF_ARG_PTR_TO_LIST_NODE; + if (is_kfunc_arg_rbtree_root(meta->btf, &args[argno])) + return KF_ARG_PTR_TO_RB_ROOT; + + if (is_kfunc_arg_rbtree_node(meta->btf, &args[argno])) + return KF_ARG_PTR_TO_RB_NODE; + if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) { if (!btf_type_is_struct(ref_t)) { verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n", @@ -8801,6 +8983,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, return KF_ARG_PTR_TO_BTF_ID; } + if (is_kfunc_arg_callback(env, meta->btf, &args[argno])) + return KF_ARG_PTR_TO_CALLBACK; + if (argno + 1 < nargs && is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], ®s[regno + 1])) arg_mem_size = true; @@ -8915,38 +9100,54 @@ static int process_kf_arg_ptr_to_kptr(struct bpf_verifier_env *env, return 0; } -static int ref_set_release_on_unlock(struct bpf_verifier_env *env, u32 ref_obj_id) +static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { - struct bpf_func_state *state = cur_func(env); + struct bpf_verifier_state *state = env->cur_state; + + if (!state->active_lock.ptr) { + verbose(env, "verifier internal error: ref_set_non_owning w/o active lock\n"); + return -EFAULT; + } + + if (type_flag(reg->type) & NON_OWN_REF) { + verbose(env, "verifier internal error: NON_OWN_REF already set\n"); + return -EFAULT; + } + + reg->type |= NON_OWN_REF; + return 0; +} + +static int ref_convert_owning_non_owning(struct bpf_verifier_env *env, u32 ref_obj_id) +{ + struct bpf_func_state *state, *unused; struct bpf_reg_state *reg; int i; - /* bpf_spin_lock only allows calling list_push and list_pop, no BPF - * subprogs, no global functions. This means that the references would - * not be released inside the critical section but they may be added to - * the reference state, and the acquired_refs are never copied out for a - * different frame as BPF to BPF calls don't work in bpf_spin_lock - * critical sections. - */ + state = cur_func(env); + if (!ref_obj_id) { - verbose(env, "verifier internal error: ref_obj_id is zero for release_on_unlock\n"); + verbose(env, "verifier internal error: ref_obj_id is zero for " + "owning -> non-owning conversion\n"); return -EFAULT; } + for (i = 0; i < state->acquired_refs; i++) { - if (state->refs[i].id == ref_obj_id) { - if (state->refs[i].release_on_unlock) { - verbose(env, "verifier internal error: expected false release_on_unlock"); - return -EFAULT; + if (state->refs[i].id != ref_obj_id) + continue; + + /* Clear ref_obj_id here so release_reference doesn't clobber + * the whole reg + */ + bpf_for_each_reg_in_vstate(env->cur_state, unused, reg, ({ + if (reg->ref_obj_id == ref_obj_id) { + reg->ref_obj_id = 0; + ref_set_non_owning(env, reg); } - state->refs[i].release_on_unlock = true; - /* Now mark everyone sharing same ref_obj_id as untrusted */ - bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ - if (reg->ref_obj_id == ref_obj_id) - reg->type |= PTR_UNTRUSTED; - })); - return 0; - } + })); + return 0; } + verbose(env, "verifier internal error: ref state missing for ref_obj_id\n"); return -EFAULT; } @@ -9032,102 +9233,226 @@ static bool is_bpf_list_api_kfunc(u32 btf_id) btf_id == special_kfunc_list[KF_bpf_list_pop_back]; } -static int process_kf_arg_ptr_to_list_head(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, u32 regno, - struct bpf_kfunc_call_arg_meta *meta) +static bool is_bpf_rbtree_api_kfunc(u32 btf_id) +{ + return btf_id == special_kfunc_list[KF_bpf_rbtree_add] || + btf_id == special_kfunc_list[KF_bpf_rbtree_remove] || + btf_id == special_kfunc_list[KF_bpf_rbtree_first]; +} + +static bool is_bpf_graph_api_kfunc(u32 btf_id) +{ + return is_bpf_list_api_kfunc(btf_id) || is_bpf_rbtree_api_kfunc(btf_id); +} + +static bool is_callback_calling_kfunc(u32 btf_id) +{ + return btf_id == special_kfunc_list[KF_bpf_rbtree_add]; +} + +static bool is_rbtree_lock_required_kfunc(u32 btf_id) +{ + return is_bpf_rbtree_api_kfunc(btf_id); +} + +static bool check_kfunc_is_graph_root_api(struct bpf_verifier_env *env, + enum btf_field_type head_field_type, + u32 kfunc_btf_id) +{ + bool ret; + + switch (head_field_type) { + case BPF_LIST_HEAD: + ret = is_bpf_list_api_kfunc(kfunc_btf_id); + break; + case BPF_RB_ROOT: + ret = is_bpf_rbtree_api_kfunc(kfunc_btf_id); + break; + default: + verbose(env, "verifier internal error: unexpected graph root argument type %s\n", + btf_field_type_name(head_field_type)); + return false; + } + + if (!ret) + verbose(env, "verifier internal error: %s head arg for unknown kfunc\n", + btf_field_type_name(head_field_type)); + return ret; +} + +static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env, + enum btf_field_type node_field_type, + u32 kfunc_btf_id) +{ + bool ret; + + switch (node_field_type) { + case BPF_LIST_NODE: + ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_front] || + kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_back]); + break; + case BPF_RB_NODE: + ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_remove] || + kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_add]); + break; + default: + verbose(env, "verifier internal error: unexpected graph node argument type %s\n", + btf_field_type_name(node_field_type)); + return false; + } + + if (!ret) + verbose(env, "verifier internal error: %s node arg for unknown kfunc\n", + btf_field_type_name(node_field_type)); + return ret; +} + +static int +__process_kf_arg_ptr_to_graph_root(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, u32 regno, + struct bpf_kfunc_call_arg_meta *meta, + enum btf_field_type head_field_type, + struct btf_field **head_field) { + const char *head_type_name; struct btf_field *field; struct btf_record *rec; - u32 list_head_off; + u32 head_off; - if (meta->btf != btf_vmlinux || !is_bpf_list_api_kfunc(meta->func_id)) { - verbose(env, "verifier internal error: bpf_list_head argument for unknown kfunc\n"); + if (meta->btf != btf_vmlinux) { + verbose(env, "verifier internal error: unexpected btf mismatch in kfunc call\n"); return -EFAULT; } + if (!check_kfunc_is_graph_root_api(env, head_field_type, meta->func_id)) + return -EFAULT; + + head_type_name = btf_field_type_name(head_field_type); if (!tnum_is_const(reg->var_off)) { verbose(env, - "R%d doesn't have constant offset. bpf_list_head has to be at the constant offset\n", - regno); + "R%d doesn't have constant offset. %s has to be at the constant offset\n", + regno, head_type_name); return -EINVAL; } rec = reg_btf_record(reg); - list_head_off = reg->off + reg->var_off.value; - field = btf_record_find(rec, list_head_off, BPF_LIST_HEAD); + head_off = reg->off + reg->var_off.value; + field = btf_record_find(rec, head_off, head_field_type); if (!field) { - verbose(env, "bpf_list_head not found at offset=%u\n", list_head_off); + verbose(env, "%s not found at offset=%u\n", head_type_name, head_off); return -EINVAL; } /* All functions require bpf_list_head to be protected using a bpf_spin_lock */ if (check_reg_allocation_locked(env, reg)) { - verbose(env, "bpf_spin_lock at off=%d must be held for bpf_list_head\n", - rec->spin_lock_off); + verbose(env, "bpf_spin_lock at off=%d must be held for %s\n", + rec->spin_lock_off, head_type_name); return -EINVAL; } - if (meta->arg_list_head.field) { - verbose(env, "verifier internal error: repeating bpf_list_head arg\n"); + if (*head_field) { + verbose(env, "verifier internal error: repeating %s arg\n", head_type_name); return -EFAULT; } - meta->arg_list_head.field = field; + *head_field = field; return 0; } -static int process_kf_arg_ptr_to_list_node(struct bpf_verifier_env *env, +static int process_kf_arg_ptr_to_list_head(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, struct bpf_kfunc_call_arg_meta *meta) { + return __process_kf_arg_ptr_to_graph_root(env, reg, regno, meta, BPF_LIST_HEAD, + &meta->arg_list_head.field); +} + +static int process_kf_arg_ptr_to_rbtree_root(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, u32 regno, + struct bpf_kfunc_call_arg_meta *meta) +{ + return __process_kf_arg_ptr_to_graph_root(env, reg, regno, meta, BPF_RB_ROOT, + &meta->arg_rbtree_root.field); +} + +static int +__process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, u32 regno, + struct bpf_kfunc_call_arg_meta *meta, + enum btf_field_type head_field_type, + enum btf_field_type node_field_type, + struct btf_field **node_field) +{ + const char *node_type_name; const struct btf_type *et, *t; struct btf_field *field; - struct btf_record *rec; - u32 list_node_off; + u32 node_off; - if (meta->btf != btf_vmlinux || - (meta->func_id != special_kfunc_list[KF_bpf_list_push_front] && - meta->func_id != special_kfunc_list[KF_bpf_list_push_back])) { - verbose(env, "verifier internal error: bpf_list_node argument for unknown kfunc\n"); + if (meta->btf != btf_vmlinux) { + verbose(env, "verifier internal error: unexpected btf mismatch in kfunc call\n"); return -EFAULT; } + if (!check_kfunc_is_graph_node_api(env, node_field_type, meta->func_id)) + return -EFAULT; + + node_type_name = btf_field_type_name(node_field_type); if (!tnum_is_const(reg->var_off)) { verbose(env, - "R%d doesn't have constant offset. bpf_list_node has to be at the constant offset\n", - regno); + "R%d doesn't have constant offset. %s has to be at the constant offset\n", + regno, node_type_name); return -EINVAL; } - rec = reg_btf_record(reg); - list_node_off = reg->off + reg->var_off.value; - field = btf_record_find(rec, list_node_off, BPF_LIST_NODE); - if (!field || field->offset != list_node_off) { - verbose(env, "bpf_list_node not found at offset=%u\n", list_node_off); + node_off = reg->off + reg->var_off.value; + field = reg_find_field_offset(reg, node_off, node_field_type); + if (!field || field->offset != node_off) { + verbose(env, "%s not found at offset=%u\n", node_type_name, node_off); return -EINVAL; } - field = meta->arg_list_head.field; + field = *node_field; et = btf_type_by_id(field->graph_root.btf, field->graph_root.value_btf_id); t = btf_type_by_id(reg->btf, reg->btf_id); if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, 0, field->graph_root.btf, field->graph_root.value_btf_id, true)) { - verbose(env, "operation on bpf_list_head expects arg#1 bpf_list_node at offset=%d " + verbose(env, "operation on %s expects arg#1 %s at offset=%d " "in struct %s, but arg is at offset=%d in struct %s\n", + btf_field_type_name(head_field_type), + btf_field_type_name(node_field_type), field->graph_root.node_offset, btf_name_by_offset(field->graph_root.btf, et->name_off), - list_node_off, btf_name_by_offset(reg->btf, t->name_off)); + node_off, btf_name_by_offset(reg->btf, t->name_off)); return -EINVAL; } - if (list_node_off != field->graph_root.node_offset) { - verbose(env, "arg#1 offset=%d, but expected bpf_list_node at offset=%d in struct %s\n", - list_node_off, field->graph_root.node_offset, + if (node_off != field->graph_root.node_offset) { + verbose(env, "arg#1 offset=%d, but expected %s at offset=%d in struct %s\n", + node_off, btf_field_type_name(node_field_type), + field->graph_root.node_offset, btf_name_by_offset(field->graph_root.btf, et->name_off)); return -EINVAL; } - /* Set arg#1 for expiration after unlock */ - return ref_set_release_on_unlock(env, reg->ref_obj_id); + + return 0; +} + +static int process_kf_arg_ptr_to_list_node(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, u32 regno, + struct bpf_kfunc_call_arg_meta *meta) +{ + return __process_kf_arg_ptr_to_graph_node(env, reg, regno, meta, + BPF_LIST_HEAD, BPF_LIST_NODE, + &meta->arg_list_head.field); +} + +static int process_kf_arg_ptr_to_rbtree_node(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, u32 regno, + struct bpf_kfunc_call_arg_meta *meta) +{ + return __process_kf_arg_ptr_to_graph_node(env, reg, regno, meta, + BPF_RB_ROOT, BPF_RB_NODE, + &meta->arg_rbtree_root.field); } static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta) @@ -9264,8 +9589,11 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_DYNPTR: case KF_ARG_PTR_TO_LIST_HEAD: case KF_ARG_PTR_TO_LIST_NODE: + case KF_ARG_PTR_TO_RB_ROOT: + case KF_ARG_PTR_TO_RB_NODE: case KF_ARG_PTR_TO_MEM: case KF_ARG_PTR_TO_MEM_SIZE: + case KF_ARG_PTR_TO_CALLBACK: /* Trusted by default */ break; default: @@ -9342,6 +9670,20 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (ret < 0) return ret; break; + case KF_ARG_PTR_TO_RB_ROOT: + if (reg->type != PTR_TO_MAP_VALUE && + reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { + verbose(env, "arg#%d expected pointer to map value or allocated object\n", i); + return -EINVAL; + } + if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && !reg->ref_obj_id) { + verbose(env, "allocated object must be referenced\n"); + return -EINVAL; + } + ret = process_kf_arg_ptr_to_rbtree_root(env, reg, regno, meta); + if (ret < 0) + return ret; + break; case KF_ARG_PTR_TO_LIST_NODE: if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { verbose(env, "arg#%d expected pointer to allocated object\n", i); @@ -9355,6 +9697,31 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (ret < 0) return ret; break; + case KF_ARG_PTR_TO_RB_NODE: + if (meta->func_id == special_kfunc_list[KF_bpf_rbtree_remove]) { + if (!type_is_non_owning_ref(reg->type) || reg->ref_obj_id) { + verbose(env, "rbtree_remove node input must be non-owning ref\n"); + return -EINVAL; + } + if (in_rbtree_lock_required_cb(env)) { + verbose(env, "rbtree_remove not allowed in rbtree cb\n"); + return -EINVAL; + } + } else { + if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { + verbose(env, "arg#%d expected pointer to allocated object\n", i); + return -EINVAL; + } + if (!reg->ref_obj_id) { + verbose(env, "allocated object must be referenced\n"); + return -EINVAL; + } + } + + ret = process_kf_arg_ptr_to_rbtree_node(env, reg, regno, meta); + if (ret < 0) + return ret; + break; case KF_ARG_PTR_TO_BTF_ID: /* Only base_type is checked, further checks are done here */ if ((base_type(reg->type) != PTR_TO_BTF_ID || @@ -9390,6 +9757,9 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ /* Skip next '__sz' argument */ i++; break; + case KF_ARG_PTR_TO_CALLBACK: + meta->subprogno = reg->subprogno; + break; } } @@ -9406,11 +9776,11 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { const struct btf_type *t, *func, *func_proto, *ptr_type; + u32 i, nargs, func_id, ptr_type_id, release_ref_obj_id; struct bpf_reg_state *regs = cur_regs(env); const char *func_name, *ptr_type_name; bool sleepable, rcu_lock, rcu_unlock; struct bpf_kfunc_call_arg_meta meta; - u32 i, nargs, func_id, ptr_type_id; int err, insn_idx = *insn_idx_p; const struct btf_param *args; const struct btf_type *ret_t; @@ -9505,6 +9875,35 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, } } + if (meta.func_id == special_kfunc_list[KF_bpf_list_push_front] || + meta.func_id == special_kfunc_list[KF_bpf_list_push_back] || + meta.func_id == special_kfunc_list[KF_bpf_rbtree_add]) { + release_ref_obj_id = regs[BPF_REG_2].ref_obj_id; + err = ref_convert_owning_non_owning(env, release_ref_obj_id); + if (err) { + verbose(env, "kfunc %s#%d conversion of owning ref to non-owning failed\n", + func_name, func_id); + return err; + } + + err = release_reference(env, release_ref_obj_id); + if (err) { + verbose(env, "kfunc %s#%d reference has not been acquired before\n", + func_name, func_id); + return err; + } + } + + if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add]) { + err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, + set_rbtree_add_callback_state); + if (err) { + verbose(env, "kfunc %s#%d failed callback verification\n", + func_name, func_id); + return err; + } + } + for (i = 0; i < CALLER_SAVED_REGS; i++) mark_reg_not_init(env, regs, caller_saved[i]); @@ -9569,11 +9968,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, meta.func_id == special_kfunc_list[KF_bpf_list_pop_back]) { struct btf_field *field = meta.arg_list_head.field; - mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC; - regs[BPF_REG_0].btf = field->graph_root.btf; - regs[BPF_REG_0].btf_id = field->graph_root.value_btf_id; - regs[BPF_REG_0].off = field->graph_root.node_offset; + mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root); + } else if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_remove] || + meta.func_id == special_kfunc_list[KF_bpf_rbtree_first]) { + struct btf_field *field = meta.arg_rbtree_root.field; + + mark_reg_graph_node(regs, BPF_REG_0, &field->graph_root); } else if (meta.func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) { mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_TRUSTED; @@ -9639,7 +10039,13 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (is_kfunc_ret_null(&meta)) regs[BPF_REG_0].id = id; regs[BPF_REG_0].ref_obj_id = id; + } else if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_first]) { + ref_set_non_owning(env, ®s[BPF_REG_0]); } + + if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_remove]) + invalidate_non_owning_refs(env); + if (reg_may_point_to_spin_lock(®s[BPF_REG_0]) && !regs[BPF_REG_0].id) regs[BPF_REG_0].id = ++env->id_gen; } /* else { add_kfunc_call() ensures it is btf_type_is_void(t) } */ @@ -11825,8 +12231,10 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, */ if (WARN_ON_ONCE(reg->smin_value || reg->smax_value || !tnum_equals_const(reg->var_off, 0))) return; - if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC | PTR_MAYBE_NULL) && WARN_ON_ONCE(reg->off)) + if (!(type_is_ptr_alloc_obj(reg->type) || type_is_non_owning_ref(reg->type)) && + WARN_ON_ONCE(reg->off)) return; + if (is_null) { reg->type = SCALAR_VALUE; /* We don't need id and ref_obj_id from this point @@ -14335,7 +14743,7 @@ static int do_check(struct bpf_verifier_env *env) if ((insn->src_reg == BPF_REG_0 && insn->imm != BPF_FUNC_spin_unlock) || (insn->src_reg == BPF_PSEUDO_CALL) || (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && - (insn->off != 0 || !is_bpf_list_api_kfunc(insn->imm)))) { + (insn->off != 0 || !is_bpf_graph_api_kfunc(insn->imm)))) { verbose(env, "function calls are not allowed while holding a lock\n"); return -EINVAL; } @@ -14371,7 +14779,8 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } - if (env->cur_state->active_lock.ptr) { + if (env->cur_state->active_lock.ptr && + !in_rbtree_lock_required_cb(env)) { verbose(env, "bpf_spin_unlock is missing\n"); return -EINVAL; } @@ -14633,9 +15042,10 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, { enum bpf_prog_type prog_type = resolve_prog_type(prog); - if (btf_record_has_field(map->record, BPF_LIST_HEAD)) { + if (btf_record_has_field(map->record, BPF_LIST_HEAD) || + btf_record_has_field(map->record, BPF_RB_ROOT)) { if (is_tracing_prog_type(prog_type)) { - verbose(env, "tracing progs cannot use bpf_list_head yet\n"); + verbose(env, "tracing progs cannot use bpf_{list_head,rb_root} yet\n"); return -EINVAL; } } |
