From b6c1a8af5b1eec42aabc13376f94aa90c3d765f1 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Fri, 10 Feb 2023 15:47:31 +0000 Subject: mm: memcontrol: add new kernel parameter cgroup.memory=nobpf Add new kernel parameter cgroup.memory=nobpf to allow user disable bpf memory accounting. This is a preparation for the followup patch. Signed-off-by: Yafang Shao Acked-by: Johannes Weiner Acked-by: Roman Gushchin Link: https://lore.kernel.org/r/20230210154734.4416-2-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/memcontrol.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 85dc9b88ea37..1e38e99998c7 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1754,6 +1754,12 @@ struct obj_cgroup *get_obj_cgroup_from_page(struct page *page); int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size); void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size); +extern struct static_key_false memcg_bpf_enabled_key; +static inline bool memcg_bpf_enabled(void) +{ + return static_branch_likely(&memcg_bpf_enabled_key); +} + extern struct static_key_false memcg_kmem_enabled_key; static inline bool memcg_kmem_enabled(void) @@ -1832,6 +1838,11 @@ static inline struct obj_cgroup *get_obj_cgroup_from_page(struct page *page) return NULL; } +static inline bool memcg_bpf_enabled(void) +{ + return false; +} + static inline bool memcg_kmem_enabled(void) { return false; -- cgit v1.2.3 From ddef81b5fd1da4d7c3cc8785d2043b73b72f38ef Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Fri, 10 Feb 2023 15:47:32 +0000 Subject: bpf: use bpf_map_kvcalloc in bpf_local_storage Introduce new helper bpf_map_kvcalloc() for the memory allocation in bpf_local_storage(). Then the allocation will charge the memory from the map instead of from current, though currently they are the same thing as it is only used in map creation path now. By charging map's memory into the memcg from the map, it will be more clear. Signed-off-by: Yafang Shao Acked-by: Johannes Weiner Acked-by: Roman Gushchin Link: https://lore.kernel.org/r/20230210154734.4416-3-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 35c18a98c21a..fe0bf482fdf8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1886,6 +1886,8 @@ struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id); void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, int node); void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags); +void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size, + gfp_t flags); void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, size_t align, gfp_t flags); #else @@ -1902,6 +1904,12 @@ bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags) return kzalloc(size, flags); } +static inline void * +bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size, gfp_t flags) +{ + return kvcalloc(n, size, flags); +} + static inline void __percpu * bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, size_t align, gfp_t flags) -- cgit v1.2.3 From ee53cbfb1ebf990de0d084a7cd6b67b05fe1f7ac Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Fri, 10 Feb 2023 15:47:33 +0000 Subject: bpf: allow to disable bpf map memory accounting We can simply set root memcg as the map's memcg to disable bpf memory accounting. bpf_map_area_alloc is a little special as it gets the memcg from current rather than from the map, so we need to disable GFP_ACCOUNT specifically for it. Signed-off-by: Yafang Shao Acked-by: Johannes Weiner Acked-by: Roman Gushchin Link: https://lore.kernel.org/r/20230210154734.4416-4-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index fe0bf482fdf8..4385418118f6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -28,6 +28,7 @@ #include #include #include +#include struct bpf_verifier_env; struct bpf_verifier_log; @@ -2933,4 +2934,11 @@ static inline bool type_is_alloc(u32 type) return type & MEM_ALLOC; } +static inline gfp_t bpf_memcg_flags(gfp_t flags) +{ + if (memcg_bpf_enabled()) + return flags | __GFP_ACCOUNT; + return flags; +} + #endif /* _LINUX_BPF_H */ -- cgit v1.2.3 From 6a3cd3318ff65622415e34e8ee39d76331e7c869 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Sun, 12 Feb 2023 01:27:07 -0800 Subject: bpf: Migrate release_on_unlock logic to non-owning ref semantics This patch introduces non-owning reference semantics to the verifier, specifically linked_list API kfunc handling. release_on_unlock logic for refs is refactored - with small functional changes - to implement these semantics, and bpf_list_push_{front,back} are migrated to use them. When a list node is pushed to a list, the program still has a pointer to the node: n = bpf_obj_new(typeof(*n)); bpf_spin_lock(&l); bpf_list_push_back(&l, n); /* n still points to the just-added node */ bpf_spin_unlock(&l); What the verifier considers n to be after the push, and thus what can be done with n, are changed by this patch. Common properties both before/after this patch: * After push, n is only a valid reference to the node until end of critical section * After push, n cannot be pushed to any list * After push, the program can read the node's fields using n Before: * After push, n retains the ref_obj_id which it received on bpf_obj_new, but the associated bpf_reference_state's release_on_unlock field is set to true * release_on_unlock field and associated logic is used to implement "n is only a valid ref until end of critical section" * After push, n cannot be written to, the node must be removed from the list before writing to its fields * After push, n is marked PTR_UNTRUSTED After: * After push, n's ref is released and ref_obj_id set to 0. NON_OWN_REF type flag is added to reg's type, indicating that it's a non-owning reference. * NON_OWN_REF flag and logic is used to implement "n is only a valid ref until end of critical section" * n can be written to (except for special fields e.g. bpf_list_node, timer, ...) Summary of specific implementation changes to achieve the above: * release_on_unlock field, ref_set_release_on_unlock helper, and logic to "release on unlock" based on that field are removed * The anonymous active_lock struct used by bpf_verifier_state is pulled out into a named struct bpf_active_lock. * NON_OWN_REF type flag is introduced along with verifier logic changes to handle non-owning refs * Helpers are added to use NON_OWN_REF flag to implement non-owning ref semantics as described above * invalidate_non_owning_refs - helper to clobber all non-owning refs matching a particular bpf_active_lock identity. Replaces release_on_unlock logic in process_spin_lock. * ref_set_non_owning - set NON_OWN_REF type flag after doing some sanity checking * ref_convert_owning_non_owning - convert owning reference w/ specified ref_obj_id to non-owning references. Set NON_OWN_REF flag for each reg with that ref_obj_id and 0-out its ref_obj_id * Update linked_list selftests to account for minor semantic differences introduced by this patch * Writes to a release_on_unlock node ref are not allowed, while writes to non-owning reference pointees are. As a result the linked_list "write after push" failure tests are no longer scenarios that should fail. * The test##missing_lock##op and test##incorrect_lock##op macro-generated failure tests need to have a valid node argument in order to have the same error output as before. Otherwise verification will fail early and the expected error output won't be seen. Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20230212092715.1422619-2-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 ++++++ include/linux/bpf_verifier.h | 38 ++++++++++++++++++-------------------- 2 files changed, 24 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4385418118f6..8b5d0b4c4ada 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -181,6 +181,7 @@ enum btf_field_type { BPF_KPTR = BPF_KPTR_UNREF | BPF_KPTR_REF, BPF_LIST_HEAD = (1 << 4), BPF_LIST_NODE = (1 << 5), + BPF_GRAPH_NODE_OR_ROOT = BPF_LIST_NODE | BPF_LIST_HEAD, }; struct btf_field_kptr { @@ -576,6 +577,11 @@ enum bpf_type_flag { /* MEM is tagged with rcu and memory access needs rcu_read_lock protection. */ MEM_RCU = BIT(13 + BPF_BASE_TYPE_BITS), + /* Used to tag PTR_TO_BTF_ID | MEM_ALLOC references which are non-owning. + * Currently only valid for linked-list and rbtree nodes. + */ + NON_OWN_REF = BIT(14 + BPF_BASE_TYPE_BITS), + __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index aa83de1fe755..cf1bb1cf4a7b 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -43,6 +43,22 @@ enum bpf_reg_liveness { REG_LIVE_DONE = 0x8, /* liveness won't be updating this register anymore */ }; +/* For every reg representing a map value or allocated object pointer, + * we consider the tuple of (ptr, id) for them to be unique in verifier + * context and conside them to not alias each other for the purposes of + * tracking lock state. + */ +struct bpf_active_lock { + /* This can either be reg->map_ptr or reg->btf. If ptr is NULL, + * there's no active lock held, and other fields have no + * meaning. If non-NULL, it indicates that a lock is held and + * id member has the reg->id of the register which can be >= 0. + */ + void *ptr; + /* This will be reg->id */ + u32 id; +}; + struct bpf_reg_state { /* Ordering of fields matters. See states_equal() */ enum bpf_reg_type type; @@ -226,11 +242,6 @@ struct bpf_reference_state { * exiting a callback function. */ int callback_ref; - /* Mark the reference state to release the registers sharing the same id - * on bpf_spin_unlock (for nodes that we will lose ownership to but are - * safe to access inside the critical section). - */ - bool release_on_unlock; }; /* state of the program: @@ -331,21 +342,8 @@ struct bpf_verifier_state { u32 branches; u32 insn_idx; u32 curframe; - /* For every reg representing a map value or allocated object pointer, - * we consider the tuple of (ptr, id) for them to be unique in verifier - * context and conside them to not alias each other for the purposes of - * tracking lock state. - */ - struct { - /* This can either be reg->map_ptr or reg->btf. If ptr is NULL, - * there's no active lock held, and other fields have no - * meaning. If non-NULL, it indicates that a lock is held and - * id member has the reg->id of the register which can be >= 0. - */ - void *ptr; - /* This will be reg->id */ - u32 id; - } active_lock; + + struct bpf_active_lock active_lock; bool speculative; bool active_rcu_lock; -- cgit v1.2.3 From 9c395c1b99bd23f74bc628fa000480c49593d17f Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Mon, 13 Feb 2023 16:40:10 -0800 Subject: bpf: Add basic bpf_rb_{root,node} support This patch adds special BPF_RB_{ROOT,NODE} btf_field_types similar to BPF_LIST_{HEAD,NODE}, adds the necessary plumbing to detect the new types, and adds bpf_rb_root_free function for freeing bpf_rb_root in map_values. structs bpf_rb_root and bpf_rb_node are opaque types meant to obscure structs rb_root_cached rb_node, respectively. btf_struct_access will prevent BPF programs from touching these special fields automatically now that they're recognized. btf_check_and_fixup_fields now groups list_head and rb_root together as "graph root" fields and {list,rb}_node as "graph node", and does same ownership cycle checking as before. Note that this function does _not_ prevent ownership type mixups (e.g. rb_root owning list_node) - that's handled by btf_parse_graph_root. After this patch, a bpf program can have a struct bpf_rb_root in a map_value, but not add anything to nor do anything useful with it. Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20230214004017.2534011-2-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8b5d0b4c4ada..be34f7deb6c3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -181,7 +181,10 @@ enum btf_field_type { BPF_KPTR = BPF_KPTR_UNREF | BPF_KPTR_REF, BPF_LIST_HEAD = (1 << 4), BPF_LIST_NODE = (1 << 5), - BPF_GRAPH_NODE_OR_ROOT = BPF_LIST_NODE | BPF_LIST_HEAD, + BPF_RB_ROOT = (1 << 6), + BPF_RB_NODE = (1 << 7), + BPF_GRAPH_NODE_OR_ROOT = BPF_LIST_NODE | BPF_LIST_HEAD | + BPF_RB_NODE | BPF_RB_ROOT, }; struct btf_field_kptr { @@ -285,6 +288,10 @@ static inline const char *btf_field_type_name(enum btf_field_type type) return "bpf_list_head"; case BPF_LIST_NODE: return "bpf_list_node"; + case BPF_RB_ROOT: + return "bpf_rb_root"; + case BPF_RB_NODE: + return "bpf_rb_node"; default: WARN_ON_ONCE(1); return "unknown"; @@ -305,6 +312,10 @@ static inline u32 btf_field_type_size(enum btf_field_type type) return sizeof(struct bpf_list_head); case BPF_LIST_NODE: return sizeof(struct bpf_list_node); + case BPF_RB_ROOT: + return sizeof(struct bpf_rb_root); + case BPF_RB_NODE: + return sizeof(struct bpf_rb_node); default: WARN_ON_ONCE(1); return 0; @@ -325,6 +336,10 @@ static inline u32 btf_field_type_align(enum btf_field_type type) return __alignof__(struct bpf_list_head); case BPF_LIST_NODE: return __alignof__(struct bpf_list_node); + case BPF_RB_ROOT: + return __alignof__(struct bpf_rb_root); + case BPF_RB_NODE: + return __alignof__(struct bpf_rb_node); default: WARN_ON_ONCE(1); return 0; @@ -435,6 +450,9 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, void bpf_timer_cancel_and_free(void *timer); void bpf_list_head_free(const struct btf_field *field, void *list_head, struct bpf_spin_lock *spin_lock); +void bpf_rb_root_free(const struct btf_field *field, void *rb_root, + struct bpf_spin_lock *spin_lock); + int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size); -- cgit v1.2.3 From 997849c4b969034e225153f41026657def66d286 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Wed, 15 Feb 2023 16:21:31 +0800 Subject: bpf: Zeroing allocated object from slab in bpf memory allocator Currently the freed element in bpf memory allocator may be immediately reused, for htab map the reuse will reinitialize special fields in map value (e.g., bpf_spin_lock), but lookup procedure may still access these special fields, and it may lead to hard-lockup as shown below: NMI backtrace for cpu 16 CPU: 16 PID: 2574 Comm: htab.bin Tainted: G L 6.1.0+ #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), RIP: 0010:queued_spin_lock_slowpath+0x283/0x2c0 ...... Call Trace: copy_map_value_locked+0xb7/0x170 bpf_map_copy_value+0x113/0x3c0 __sys_bpf+0x1c67/0x2780 __x64_sys_bpf+0x1c/0x20 do_syscall_64+0x30/0x60 entry_SYSCALL_64_after_hwframe+0x46/0xb0 ...... For htab map, just like the preallocated case, these is no need to initialize these special fields in map value again once these fields have been initialized. For preallocated htab map, these fields are initialized through __GFP_ZERO in bpf_map_area_alloc(), so do the similar thing for non-preallocated htab in bpf memory allocator. And there is no need to use __GFP_ZERO for per-cpu bpf memory allocator, because __alloc_percpu_gfp() does it implicitly. Fixes: 0fd7c5d43339 ("bpf: Optimize call_rcu in non-preallocated hash map.") Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20230215082132.3856544-2-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index be34f7deb6c3..520b238abd5a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -363,6 +363,13 @@ static inline void bpf_obj_init(const struct btf_field_offs *foffs, void *obj) memset(obj + foffs->field_off[i], 0, foffs->field_sz[i]); } +/* 'dst' must be a temporary buffer and should not point to memory that is being + * used in parallel by a bpf program or bpf syscall, otherwise the access from + * the bpf program or bpf syscall may be corrupted by the reinitialization, + * leading to weird problems. Even 'dst' is newly-allocated from bpf memory + * allocator, it is still possible for 'dst' to be used in parallel by a bpf + * program or bpf syscall. + */ static inline void check_and_init_map_value(struct bpf_map *map, void *dst) { bpf_obj_init(map->field_offs, dst); -- cgit v1.2.3