From ee3bad033d01066348913f3220ea81987721ed53 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Wed, 27 Mar 2024 11:20:22 +0800 Subject: bpf: Mitigate latency spikes associated with freeing non-preallocated htab Following the recent upgrade of one of our BPF programs, we encountered significant latency spikes affecting other applications running on the same host. After thorough investigation, we identified that these spikes were primarily caused by the prolonged duration required to free a non-preallocated htab with approximately 2 million keys. Notably, our kernel configuration lacks the presence of CONFIG_PREEMPT. In scenarios where kernel execution extends excessively, other threads might be starved of CPU time, resulting in latency issues across the system. To mitigate this, we've adopted a proactive approach by incorporating cond_resched() calls within the kernel code. This ensures that during lengthy kernel operations, the scheduler is invoked periodically to provide opportunities for other threads to execute. Signed-off-by: Yafang Shao Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20240327032022.78391-1-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/hashtab.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/bpf/hashtab.c') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 3a088a5349bc..e81059faae63 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1490,6 +1490,7 @@ static void delete_all_elements(struct bpf_htab *htab) hlist_nulls_del_rcu(&l->hash_node); htab_elem_free(htab, l); } + cond_resched(); } migrate_enable(); } -- cgit v1.2.3 From 0b56e637f705836b9aa51e2b1058c3c814c121a8 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 1 Apr 2024 19:13:05 -0700 Subject: bpf: inline bpf_map_lookup_elem() helper for PERCPU_HASH map Using new per-CPU BPF instruction, partially inline bpf_map_lookup_elem() helper for per-CPU hashmap BPF map. Just like for normal HASH map, we still generate a call into __htab_map_lookup_elem(), but after that we resolve per-CPU element address using a new instruction, saving on extra functions calls. Signed-off-by: Andrii Nakryiko Acked-by: John Fastabend Link: https://lore.kernel.org/r/20240402021307.1012571-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/hashtab.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'kernel/bpf/hashtab.c') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index e81059faae63..83a9a74260e9 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -2308,6 +2308,26 @@ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) return NULL; } +/* inline bpf_map_lookup_elem() call for per-CPU hashmap */ +static int htab_percpu_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +{ + struct bpf_insn *insn = insn_buf; + + if (!bpf_jit_supports_percpu_insn()) + return -EOPNOTSUPP; + + BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem, + (void *(*)(struct bpf_map *map, void *key))NULL)); + *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem); + *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3); + *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, + offsetof(struct htab_elem, key) + map->key_size); + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0); + *insn++ = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0); + + return insn - insn_buf; +} + static void *htab_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu) { struct htab_elem *l; @@ -2436,6 +2456,7 @@ const struct bpf_map_ops htab_percpu_map_ops = { .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, .map_lookup_elem = htab_percpu_map_lookup_elem, + .map_gen_lookup = htab_percpu_map_gen_lookup, .map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem, .map_update_elem = htab_percpu_map_update_elem, .map_delete_elem = htab_map_delete_elem, -- cgit v1.2.3 From a7de265cb2d849f8986a197499ad58dca0a4f209 Mon Sep 17 00:00:00 2001 From: Rafael Passos Date: Wed, 17 Apr 2024 15:49:14 -0300 Subject: bpf: Fix typos in comments Found the following typos in comments, and fixed them: s/unpriviledged/unprivileged/ s/reponsible/responsible/ s/possiblities/possibilities/ s/Divison/Division/ s/precsion/precision/ s/havea/have a/ s/reponsible/responsible/ s/responsibile/responsible/ s/tigher/tighter/ s/respecitve/respective/ Signed-off-by: Rafael Passos Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/6af7deb4-bb24-49e8-b3f1-8dd410597337@smtp-relay.sendinblue.com --- kernel/bpf/hashtab.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/bpf/hashtab.c') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 83a9a74260e9..c3e79a0b9361 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1539,7 +1539,7 @@ static void htab_map_free(struct bpf_map *map) */ /* htab no longer uses call_rcu() directly. bpf_mem_alloc does it - * underneath and is reponsible for waiting for callbacks to finish + * underneath and is responsible for waiting for callbacks to finish * during bpf_mem_alloc_destroy(). */ if (!htab_is_prealloc(htab)) { -- cgit v1.2.3 From 246331e3f1eac905170a923f0ec76725c2558232 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Sat, 20 Apr 2024 11:09:09 +0200 Subject: bpf: allow struct bpf_wq to be embedded in arraymaps and hashmaps Currently bpf_wq_cancel_and_free() is just a placeholder as there is no memory allocation for bpf_wq just yet. Again, duplication of the bpf_timer approach Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20240420-bpf_wq-v2-9-6c986a5a741f@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/hashtab.c | 55 ++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 43 insertions(+), 12 deletions(-) (limited to 'kernel/bpf/hashtab.c') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index c3e79a0b9361..0179183c543a 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -240,6 +240,26 @@ static void htab_free_prealloced_timers(struct bpf_htab *htab) } } +static void htab_free_prealloced_wq(struct bpf_htab *htab) +{ + u32 num_entries = htab->map.max_entries; + int i; + + if (!btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) + return; + if (htab_has_extra_elems(htab)) + num_entries += num_possible_cpus(); + + for (i = 0; i < num_entries; i++) { + struct htab_elem *elem; + + elem = get_htab_elem(htab, i); + bpf_obj_free_workqueue(htab->map.record, + elem->key + round_up(htab->map.key_size, 8)); + cond_resched(); + } +} + static void htab_free_prealloced_fields(struct bpf_htab *htab) { u32 num_entries = htab->map.max_entries; @@ -1495,7 +1515,7 @@ static void delete_all_elements(struct bpf_htab *htab) migrate_enable(); } -static void htab_free_malloced_timers(struct bpf_htab *htab) +static void htab_free_malloced_timers_or_wq(struct bpf_htab *htab, bool is_timer) { int i; @@ -1507,24 +1527,35 @@ static void htab_free_malloced_timers(struct bpf_htab *htab) hlist_nulls_for_each_entry(l, n, head, hash_node) { /* We only free timer on uref dropping to zero */ - bpf_obj_free_timer(htab->map.record, l->key + round_up(htab->map.key_size, 8)); + if (is_timer) + bpf_obj_free_timer(htab->map.record, + l->key + round_up(htab->map.key_size, 8)); + else + bpf_obj_free_workqueue(htab->map.record, + l->key + round_up(htab->map.key_size, 8)); } cond_resched_rcu(); } rcu_read_unlock(); } -static void htab_map_free_timers(struct bpf_map *map) +static void htab_map_free_timers_and_wq(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - /* We only free timer on uref dropping to zero */ - if (!btf_record_has_field(htab->map.record, BPF_TIMER)) - return; - if (!htab_is_prealloc(htab)) - htab_free_malloced_timers(htab); - else - htab_free_prealloced_timers(htab); + /* We only free timer and workqueue on uref dropping to zero */ + if (btf_record_has_field(htab->map.record, BPF_TIMER)) { + if (!htab_is_prealloc(htab)) + htab_free_malloced_timers_or_wq(htab, true); + else + htab_free_prealloced_timers(htab); + } + if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) { + if (!htab_is_prealloc(htab)) + htab_free_malloced_timers_or_wq(htab, false); + else + htab_free_prealloced_wq(htab); + } } /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ @@ -2260,7 +2291,7 @@ const struct bpf_map_ops htab_map_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, - .map_release_uref = htab_map_free_timers, + .map_release_uref = htab_map_free_timers_and_wq, .map_lookup_elem = htab_map_lookup_elem, .map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem, .map_update_elem = htab_map_update_elem, @@ -2281,7 +2312,7 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, - .map_release_uref = htab_map_free_timers, + .map_release_uref = htab_map_free_timers_and_wq, .map_lookup_elem = htab_lru_map_lookup_elem, .map_lookup_and_delete_elem = htab_lru_map_lookup_and_delete_elem, .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys, -- cgit v1.2.3 From a891711d0166133ec5120615fcf365d9745d82b2 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 30 Apr 2024 12:43:25 +0200 Subject: bpf: Do not walk twice the hash map on free If someone stores both a timer and a workqueue in a hash map, on free, we would walk it twice. Add a check in htab_free_malloced_timers_or_wq and free the timers and workqueues if they are present. Fixes: 246331e3f1ea ("bpf: allow struct bpf_wq to be embedded in arraymaps and hashmaps") Signed-off-by: Benjamin Tissoires Signed-off-by: Daniel Borkmann Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/bpf/20240430-bpf-next-v3-2-27afe7f3b17c@kernel.org --- kernel/bpf/hashtab.c | 49 +++++++++++++------------------------------------ 1 file changed, 13 insertions(+), 36 deletions(-) (limited to 'kernel/bpf/hashtab.c') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 0179183c543a..06115f8728e8 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -221,13 +221,11 @@ static bool htab_has_extra_elems(struct bpf_htab *htab) return !htab_is_percpu(htab) && !htab_is_lru(htab); } -static void htab_free_prealloced_timers(struct bpf_htab *htab) +static void htab_free_prealloced_timers_and_wq(struct bpf_htab *htab) { u32 num_entries = htab->map.max_entries; int i; - if (!btf_record_has_field(htab->map.record, BPF_TIMER)) - return; if (htab_has_extra_elems(htab)) num_entries += num_possible_cpus(); @@ -235,27 +233,12 @@ static void htab_free_prealloced_timers(struct bpf_htab *htab) struct htab_elem *elem; elem = get_htab_elem(htab, i); - bpf_obj_free_timer(htab->map.record, elem->key + round_up(htab->map.key_size, 8)); - cond_resched(); - } -} - -static void htab_free_prealloced_wq(struct bpf_htab *htab) -{ - u32 num_entries = htab->map.max_entries; - int i; - - if (!btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) - return; - if (htab_has_extra_elems(htab)) - num_entries += num_possible_cpus(); - - for (i = 0; i < num_entries; i++) { - struct htab_elem *elem; - - elem = get_htab_elem(htab, i); - bpf_obj_free_workqueue(htab->map.record, - elem->key + round_up(htab->map.key_size, 8)); + if (btf_record_has_field(htab->map.record, BPF_TIMER)) + bpf_obj_free_timer(htab->map.record, + elem->key + round_up(htab->map.key_size, 8)); + if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) + bpf_obj_free_workqueue(htab->map.record, + elem->key + round_up(htab->map.key_size, 8)); cond_resched(); } } @@ -1515,7 +1498,7 @@ static void delete_all_elements(struct bpf_htab *htab) migrate_enable(); } -static void htab_free_malloced_timers_or_wq(struct bpf_htab *htab, bool is_timer) +static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab) { int i; @@ -1527,10 +1510,10 @@ static void htab_free_malloced_timers_or_wq(struct bpf_htab *htab, bool is_timer hlist_nulls_for_each_entry(l, n, head, hash_node) { /* We only free timer on uref dropping to zero */ - if (is_timer) + if (btf_record_has_field(htab->map.record, BPF_TIMER)) bpf_obj_free_timer(htab->map.record, l->key + round_up(htab->map.key_size, 8)); - else + if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) bpf_obj_free_workqueue(htab->map.record, l->key + round_up(htab->map.key_size, 8)); } @@ -1544,17 +1527,11 @@ static void htab_map_free_timers_and_wq(struct bpf_map *map) struct bpf_htab *htab = container_of(map, struct bpf_htab, map); /* We only free timer and workqueue on uref dropping to zero */ - if (btf_record_has_field(htab->map.record, BPF_TIMER)) { - if (!htab_is_prealloc(htab)) - htab_free_malloced_timers_or_wq(htab, true); - else - htab_free_prealloced_timers(htab); - } - if (btf_record_has_field(htab->map.record, BPF_WORKQUEUE)) { + if (btf_record_has_field(htab->map.record, BPF_TIMER | BPF_WORKQUEUE)) { if (!htab_is_prealloc(htab)) - htab_free_malloced_timers_or_wq(htab, false); + htab_free_malloced_timers_and_wq(htab); else - htab_free_prealloced_wq(htab); + htab_free_prealloced_timers_and_wq(htab); } } -- cgit v1.2.3