From 0893d6007db5cf397f3fc92b2a6935c3ed0c6f00 Mon Sep 17 00:00:00 2001
From: Hou Tao
Date: Fri, 9 Dec 2022 09:09:46 +0800
Subject: bpf: Reuse freed element in free_by_rcu during allocation

When there are batched freeing operations on a specific CPU, part of
the freed elements ((high_watermark - lower_watermark) / 2 + 1) will be
indirectly moved into the waiting_for_gp list through the free_by_rcu
list. After call_rcu_in_progress becomes false again, the remaining
elements in the free_by_rcu list will be moved to the waiting_for_gp
list by the next invocation of free_bulk(). However, if the expiration
of the RCU tasks trace grace period is relatively slow, no elements in
the free_by_rcu list will be moved.

So instead of invoking __alloc_percpu_gfp() or kmalloc_node() to
allocate a new object, in alloc_bulk() just check whether there is a
freed element in the free_by_rcu list and reuse it if available.

Acked-by: Yonghong Song
Signed-off-by: Hou Tao
Link: https://lore.kernel.org/r/20221209010947.3130477-2-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/memalloc.c | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 8f0d65f2474a..04d96d1b98a3 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -171,9 +171,24 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)
 	memcg = get_memcg(c);
 	old_memcg = set_active_memcg(memcg);
 	for (i = 0; i < cnt; i++) {
-		obj = __alloc(c, node);
-		if (!obj)
-			break;
+		/*
+		 * free_by_rcu is only manipulated by irq work refill_work().
+		 * IRQ works on the same CPU are called sequentially, so it is
+		 * safe to use __llist_del_first() here. If alloc_bulk() is
+		 * invoked by the initial prefill, there will be no running
+		 * refill_work(), so __llist_del_first() is fine as well.
+		 *
+		 * In most cases, objects on free_by_rcu are from the same CPU.
+		 * If some objects come from other CPUs, it doesn't incur any
+		 * harm because NUMA_NO_NODE means the preference for current
+		 * numa node and it is not a guarantee.
+		 */
+		obj = __llist_del_first(&c->free_by_rcu);
+		if (!obj) {
+			obj = __alloc(c, node);
+			if (!obj)
+				break;
+		}
 		if (IS_ENABLED(CONFIG_PREEMPT_RT))
 			/* In RT irq_work runs in per-cpu kthread, so disable
 			 * interrupts to avoid preemption and interrupts and
--
cgit v1.2.3
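A minimal userspace C sketch of the reuse-before-allocate pattern above may help. The llist_head/llist_node types and helpers below are simplified single-threaded stand-ins written for this illustration (not the kernel's lockless llist), and alloc_elem() is a hypothetical wrapper mirroring how alloc_bulk() now pops a node from free_by_rcu before falling back to a fresh allocation:

#include <stdio.h>
#include <stdlib.h>

struct llist_node {
	struct llist_node *next;
};

struct llist_head {
	struct llist_node *first;
};

/* Pop the first node, or return NULL when the list is empty. */
static struct llist_node *__llist_del_first(struct llist_head *head)
{
	struct llist_node *node = head->first;

	if (node)
		head->first = node->next;
	return node;
}

/* Push a node onto the front of the list. */
static void __llist_add(struct llist_node *node, struct llist_head *head)
{
	node->next = head->first;
	head->first = node;
}

struct elem {
	struct llist_node node;	/* must stay first for the cast below */
	int payload;
};

/* Reuse a previously freed element if available, else allocate anew. */
static struct elem *alloc_elem(struct llist_head *free_by_rcu)
{
	struct llist_node *n = __llist_del_first(free_by_rcu);

	if (n)
		return (struct elem *)n;
	return malloc(sizeof(struct elem));
}

int main(void)
{
	struct llist_head free_by_rcu = { NULL };
	struct elem *e1 = alloc_elem(&free_by_rcu);	/* fresh allocation */
	struct elem *e2;

	__llist_add(&e1->node, &free_by_rcu);	/* park it on the free list */
	e2 = alloc_elem(&free_by_rcu);		/* pops e1 instead of allocating */
	printf("reused: %s\n", e1 == e2 ? "yes" : "no");
	free(e2);
	return 0;
}

The design point is the same as in the patch: an element parked on the free list is already a usable object, so popping it first avoids a needless allocation while the grace period is still pending.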
From 822ed78fab13d5a54f8b8c030e8c5dc0fcd2cdae Mon Sep 17 00:00:00 2001
From: Hou Tao
Date: Fri, 9 Dec 2022 09:09:47 +0800
Subject: bpf: Skip rcu_barrier() if rcu_trace_implies_rcu_gp() is true

If there are pending rcu callbacks, free_mem_alloc() will use
rcu_barrier_tasks_trace() and rcu_barrier() to wait for the pending
__free_rcu_tasks_trace() and __free_rcu() callbacks.

If rcu_trace_implies_rcu_gp() is true, there will be no pending
__free_rcu(), so it will be OK to skip rcu_barrier() as well.

Acked-by: Yonghong Song
Acked-by: Paul E. McKenney
Signed-off-by: Hou Tao
Link: https://lore.kernel.org/r/20221209010947.3130477-3-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov
---
 kernel/bpf/memalloc.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 04d96d1b98a3..ebcc3dd0fa19 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -464,9 +464,17 @@ static void free_mem_alloc(struct bpf_mem_alloc *ma)
 {
 	/* waiting_for_gp lists was drained, but __free_rcu might
 	 * still execute. Wait for it now before we freeing percpu caches.
+	 *
+	 * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(),
+	 * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used
+	 * to wait for the pending __free_rcu_tasks_trace() and __free_rcu(),
+	 * so if call_rcu(head, __free_rcu) is skipped due to
+	 * rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by
+	 * using rcu_trace_implies_rcu_gp() as well.
 	 */
 	rcu_barrier_tasks_trace();
-	rcu_barrier();
+	if (!rcu_trace_implies_rcu_gp())
+		rcu_barrier();
 	free_mem_alloc_no_barrier(ma);
 }
--
cgit v1.2.3
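The barrier-skipping control flow can be sketched in plain C as well. All functions below are stubs standing in for the kernel primitives of the same names (rcu_trace_implies_rcu_gp() is hard-coded and this free_mem_alloc() takes no argument, both assumptions made for the illustration); the sketch only demonstrates the conditional second barrier:

#include <stdbool.h>
#include <stdio.h>

/* Stub: in the kernel this reports whether an RCU tasks trace grace
 * period also implies a regular RCU grace period. Hard-coded so the
 * sketch runs standalone.
 */
static bool rcu_trace_implies_rcu_gp(void)
{
	return true;
}

/* Stub: wait for pending __free_rcu_tasks_trace() callbacks. */
static void rcu_barrier_tasks_trace(void)
{
	puts("waited for __free_rcu_tasks_trace() callbacks");
}

/* Stub: wait for pending __free_rcu() callbacks. */
static void rcu_barrier(void)
{
	puts("waited for __free_rcu() callbacks");
}

/* Stub: free the percpu caches once no callback can touch them. */
static void free_mem_alloc_no_barrier(void)
{
	puts("freed percpu caches");
}

/* Mirrors the patched free_mem_alloc(): the second barrier is needed
 * only when __free_rcu() may have been queued, i.e. when the tasks
 * trace grace period does not already imply a regular RCU one.
 */
static void free_mem_alloc(void)
{
	rcu_barrier_tasks_trace();
	if (!rcu_trace_implies_rcu_gp())
		rcu_barrier();
	free_mem_alloc_no_barrier();
}

int main(void)
{
	free_mem_alloc();
	return 0;
}

Run as written, it prints the tasks-trace wait and the cache free but skips the plain rcu_barrier() leg, which is exactly the path the patch enables when rcu_trace_implies_rcu_gp() returns true.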