From 7c8199e24fa09d2344ae0204527d55d7803e8409 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 2 Sep 2022 14:10:43 -0700 Subject: bpf: Introduce any context BPF specific memory allocator. Tracing BPF programs can attach to kprobe and fentry. Hence they run in an unknown context where calling plain kmalloc() might not be safe. Front-end kmalloc() with a minimal per-cpu cache of free elements. Refill this cache asynchronously from irq_work. BPF programs always run with migration disabled. It's safe to allocate from the cache of the current cpu with irqs disabled. Freeing is always done into the bucket of the current cpu as well. irq_work trims extra free elements from buckets with kfree and refills them with kmalloc, so global kmalloc logic takes care of freeing objects allocated by one cpu and freed on another. struct bpf_mem_alloc supports two modes: - When size != 0 create kmem_cache and bpf_mem_cache for each cpu. This is the typical bpf hash map use case when all elements have equal size. - When size == 0 allocate 11 bpf_mem_cache-s for each cpu, then rely on kmalloc/kfree. Max allocation size is 4096 in this case. This is the bpf_dynptr and bpf_kptr use case. bpf_mem_alloc/bpf_mem_free are bpf specific 'wrappers' of kmalloc/kfree. bpf_mem_cache_alloc/bpf_mem_cache_free are 'wrappers' of kmem_cache_alloc/kmem_cache_free. The allocators are NMI-safe from bpf programs only. They are not NMI-safe in general. 
Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Kumar Kartikeya Dwivedi Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220902211058.60789-2-alexei.starovoitov@gmail.com --- include/linux/bpf_mem_alloc.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 include/linux/bpf_mem_alloc.h (limited to 'include') diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h new file mode 100644 index 000000000000..804733070f8d --- /dev/null +++ b/include/linux/bpf_mem_alloc.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ +#ifndef _BPF_MEM_ALLOC_H +#define _BPF_MEM_ALLOC_H +#include <linux/compiler_types.h> + +struct bpf_mem_cache; +struct bpf_mem_caches; + +struct bpf_mem_alloc { + struct bpf_mem_caches __percpu *caches; + struct bpf_mem_cache __percpu *cache; +}; + +int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size); +void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma); + +/* kmalloc/kfree equivalent: */ +void *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size); +void bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr); + +/* kmem_cache_alloc/free equivalent: */ +void *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma); +void bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr); + +#endif /* _BPF_MEM_ALLOC_H */ -- cgit v1.2.3 From 4ab67149f3c6e97c5c506a726f0ebdec38241679 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 2 Sep 2022 14:10:52 -0700 Subject: bpf: Add percpu allocation support to bpf_mem_alloc. Extend bpf_mem_alloc to cache a free list of fixed-size per-cpu allocations. Once such a cache is created, bpf_mem_cache_alloc() will return per-cpu objects. bpf_mem_cache_free() will free them back into the global per-cpu pool after observing an RCU grace period. The per-cpu flavor of bpf_mem_alloc is going to be used by per-cpu hash maps. 
The free list cache consists of tuples { llist_node, per-cpu pointer }. Unlike alloc_percpu(), which returns a per-cpu pointer, bpf_mem_cache_alloc() returns a pointer to a per-cpu pointer, and bpf_mem_cache_free() expects to receive it back. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Kumar Kartikeya Dwivedi Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220902211058.60789-11-alexei.starovoitov@gmail.com --- include/linux/bpf_mem_alloc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h index 804733070f8d..653ed1584a03 100644 --- a/include/linux/bpf_mem_alloc.h +++ b/include/linux/bpf_mem_alloc.h @@ -12,7 +12,7 @@ struct bpf_mem_alloc { struct bpf_mem_cache __percpu *cache; }; -int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size); +int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu); void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma); /* kmalloc/kfree equivalent: */ -- cgit v1.2.3 From 9f2c6e96c65e6fa1aebef546be0c30a5895fcb37 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 2 Sep 2022 14:10:58 -0700 Subject: bpf: Optimize rcu_barrier usage between hash map and bpf_mem_alloc. User space might be creating and destroying a lot of hash maps. Synchronous rcu_barrier-s in the destruction path of a hash map delay freeing of hash buckets and other map memory and may cause an artificial OOM situation under stress. Optimize rcu_barrier usage between bpf hash map and bpf_mem_alloc: - remove rcu_barrier from hash map, since htab doesn't use call_rcu directly and there are no callbacks to wait for. - bpf_mem_alloc has a call_rcu_in_progress flag that indicates pending callbacks. Use it to avoid barriers in the fast path. - When barriers are needed copy bpf_mem_alloc into a temp structure and wait for rcu barrier-s in the worker to let the rest of hash map freeing proceed. 
Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220902211058.60789-17-alexei.starovoitov@gmail.com --- include/linux/bpf_mem_alloc.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h index 653ed1584a03..3e164b8efaa9 100644 --- a/include/linux/bpf_mem_alloc.h +++ b/include/linux/bpf_mem_alloc.h @@ -3,6 +3,7 @@ #ifndef _BPF_MEM_ALLOC_H #define _BPF_MEM_ALLOC_H #include <linux/compiler_types.h> +#include <linux/workqueue.h> struct bpf_mem_cache; struct bpf_mem_caches; @@ -10,6 +11,7 @@ struct bpf_mem_caches; struct bpf_mem_alloc { struct bpf_mem_caches __percpu *caches; struct bpf_mem_cache __percpu *cache; + struct work_struct work; }; int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu); -- cgit v1.2.3