author    Linus Torvalds <torvalds@linux-foundation.org>  2025-03-30 13:45:28 -0700
committer Linus Torvalds <torvalds@linux-foundation.org>  2025-03-30 13:45:28 -0700
commit    aa918db707fba507e85217961643281ee8dfb2ed (patch)
tree      86d529825cc85a1d309f33efba97016dd64c8529 /kernel/bpf
parent    494e7fe591bf834d57c6607cdc26ab8873708aa7 (diff)
parent    f90b474a35744b5d43009e4fab232e74a3024cae (diff)
Merge tag 'bpf_try_alloc_pages' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Pull bpf try_alloc_pages() support from Alexei Starovoitov:
 "The pull includes work from Sebastian, Vlastimil and myself with a lot
  of help from Michal and Shakeel.

  This is a first step towards making kmalloc reentrant to get rid of
  slab wrappers: bpf_mem_alloc, kretprobe's objpool, etc. These patches
  make the page allocator safe from any context.

  Vlastimil kicked off this effort at LSFMM 2024:

    https://lwn.net/Articles/974138/

  and we continued at LSFMM 2025:

    https://lore.kernel.org/all/CAADnVQKfkGxudNUkcPJgwe3nTZ=xohnRshx9kLZBTmR_E1DFEg@mail.gmail.com/

  Why:

  SLAB wrappers bind memory to a particular subsystem, making it
  unavailable to the rest of the kernel. Some BPF maps in production
  consume Gbytes of preallocated memory. Top 5 in Meta: 1.5G, 1.2G,
  1.1G, 300M, 200M. Once we have a kmalloc that works in any context,
  BPF map preallocation won't be necessary.

  How:

  The synchronous kmalloc/page-alloc stack has multiple stages going
  from fast to slow: cmpxchg16 -> slab_alloc -> new_slab -> alloc_pages
  -> rmqueue_pcplist -> __rmqueue, where rmqueue_pcplist was already
  relying on trylock.

  This set changes rmqueue_bulk/rmqueue_buddy to attempt a trylock and
  return -ENOMEM if alloc_flags & ALLOC_TRYLOCK. It then wraps this
  functionality into the try_alloc_pages() helper. We make sure that
  the logic is sane on PREEMPT_RT.

  End result: try_alloc_pages()/free_pages_nolock() are safe to call
  from any context.

  A try_kmalloc() for any context, using a similar trylock approach,
  will follow. It will use try_alloc_pages() when slab needs a new
  page. Though such a try_kmalloc/page_alloc() is an opportunistic
  allocator, this design ensures that the probability of successfully
  allocating small objects (up to one page in size) is high.

  Even before we have try_kmalloc(), we already use try_alloc_pages()
  in the BPF arena implementation, and it's going to be used more
  extensively in BPF"

* tag 'bpf_try_alloc_pages' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next:
  mm: Fix the flipped condition in gfpflags_allow_spinning()
  bpf: Use try_alloc_pages() to allocate pages for bpf needs.
  mm, bpf: Use memcg in try_alloc_pages().
  memcg: Use trylock to access memcg stock_lock.
  mm, bpf: Introduce free_pages_nolock()
  mm, bpf: Introduce try_alloc_pages() for opportunistic page allocation
  locking/local_lock: Introduce localtry_lock_t
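To make the new pair concrete, here is a minimal usage sketch. It is not part of this pull; it assumes only the two signatures visible in the diffs below (try_alloc_pages(nid, order) returning a struct page * or NULL, and free_pages_nolock(page, order)), and the grab/drop helper names are hypothetical:

#include <linux/gfp.h>
#include <linux/mm.h>

/* Hypothetical helper: may run where spinning on zone locks is unsafe. */
static void *grab_scratch_page(int nid)
{
	struct page *page;

	/* Never spins: returns NULL instead of blocking on contended locks. */
	page = try_alloc_pages(nid, 0);
	if (!page)
		return NULL;	/* opportunistic: the caller must tolerate failure */

	return page_address(page);
}

static void drop_scratch_page(void *addr)
{
	if (addr)
		free_pages_nolock(virt_to_page(addr), 0);	/* any-context free */
}

The design point is the asymmetry: allocation may fail under lock contention, but the free side is built to succeed from any context, so unwind paths can always use free_pages_nolock().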
Diffstat (limited to 'kernel/bpf')
-rw-r--r--	kernel/bpf/arena.c	5
-rw-r--r--	kernel/bpf/syscall.c	23
2 files changed, 22 insertions(+), 6 deletions(-)
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 647b709d7d77..0d56cea71602 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -287,7 +287,7 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
 		return VM_FAULT_SIGSEGV;
 
 	/* Account into memcg of the process that created bpf_arena */
-	ret = bpf_map_alloc_pages(map, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, 1, &page);
+	ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page);
 	if (ret) {
 		range_tree_set(&arena->rt, vmf->pgoff, 1);
 		return VM_FAULT_SIGSEGV;
@@ -465,8 +465,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
 	if (ret)
 		goto out_free_pages;
 
-	ret = bpf_map_alloc_pages(&arena->map, GFP_KERNEL | __GFP_ZERO,
-				  node_id, page_cnt, pages);
+	ret = bpf_map_alloc_pages(&arena->map, node_id, page_cnt, pages);
 	if (ret)
 		goto out;
 
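Both arena call sites show the visible API change: bpf_map_alloc_pages() no longer takes a gfp_t, because flag selection moves inside the helper (see the syscall.c hunk below), which chooses between the ordinary GFP_KERNEL path and the trylock path based on the calling context. A hypothetical call site under the new signature, for illustration only (demo_fill is a made-up name):

/* Hypothetical map code: pass only the node; the helper picks GFP flags. */
static int demo_fill(struct bpf_map *map, struct page **pages,
		     unsigned long nr_pages)
{
	/* Pages come back zeroed and memcg-accounted, as before. */
	return bpf_map_alloc_pages(map, NUMA_NO_NODE, nr_pages, pages);
}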
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 77062799143e..9794446bc8c6 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -569,7 +569,24 @@ static void bpf_map_release_memcg(struct bpf_map *map)
 }
 #endif
 
-int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,
+static bool can_alloc_pages(void)
+{
+	return preempt_count() == 0 && !irqs_disabled() &&
+		!IS_ENABLED(CONFIG_PREEMPT_RT);
+}
+
+static struct page *__bpf_alloc_page(int nid)
+{
+	if (!can_alloc_pages())
+		return try_alloc_pages(nid, 0);
+
+	return alloc_pages_node(nid,
+				GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT
+				| __GFP_NOWARN,
+				0);
+}
+
+int bpf_map_alloc_pages(const struct bpf_map *map, int nid,
 			unsigned long nr_pages, struct page **pages)
 {
 	unsigned long i, j;
@@ -582,14 +599,14 @@ int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,
 	old_memcg = set_active_memcg(memcg);
 #endif
 	for (i = 0; i < nr_pages; i++) {
-		pg = alloc_pages_node(nid, gfp | __GFP_ACCOUNT, 0);
+		pg = __bpf_alloc_page(nid);
 
 		if (pg) {
 			pages[i] = pg;
 			continue;
 		}
 		for (j = 0; j < i; j++)
-			__free_page(pages[j]);
+			free_pages_nolock(pages[j], 0);
 		ret = -ENOMEM;
 		break;
 	}
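One detail worth noting in the hunk above: the error-unwind loop switches from __free_page() to free_pages_nolock() because __bpf_alloc_page() may have run in a context where taking zone locks is unsafe, and the pages being unwound can come from either allocation path. A hypothetical caller showing the resulting pairing (demo_batch is a made-up name):

/* Hypothetical batch user: free with free_pages_nolock(), which is
 * safe in any context and handles pages from either allocator.
 */
static int demo_batch(struct bpf_map *map, struct page **pages,
		      unsigned long nr_pages)
{
	unsigned long i;
	int ret;

	ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, nr_pages, pages);
	if (ret)
		return ret;	/* partial allocations were already unwound above */

	/* ... use the pages ... */

	for (i = 0; i < nr_pages; i++)
		free_pages_nolock(pages[i], 0);
	return 0;
}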