From cf83b2d2e2b64920bd6999b199dfa271d7e94cf8 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 27 Oct 2020 23:10:54 -0700 Subject: bpf: Permit cond_resched for some iterators Commit e679654a704e ("bpf: Fix a rcu_sched stall issue with bpf task/task_file iterator") tries to fix rcu stalls warning which is caused by bpf task_file iterator when running "bpftool prog". rcu: INFO: rcu_sched self-detected stall on CPU rcu: \x097-....: (20999 ticks this GP) idle=302/1/0x4000000000000000 softirq=1508852/1508852 fqs=4913 \x09(t=21031 jiffies g=2534773 q=179750) NMI backtrace for cpu 7 CPU: 7 PID: 184195 Comm: bpftool Kdump: loaded Tainted: G W 5.8.0-00004-g68bfc7f8c1b4 #6 Hardware name: Quanta Twin Lakes MP/Twin Lakes Passive MP, BIOS F09_3A17 05/03/2019 Call Trace: dump_stack+0x57/0x70 nmi_cpu_backtrace.cold+0x14/0x53 ? lapic_can_unplug_cpu.cold+0x39/0x39 nmi_trigger_cpumask_backtrace+0xb7/0xc7 rcu_dump_cpu_stacks+0xa2/0xd0 rcu_sched_clock_irq.cold+0x1ff/0x3d9 ? tick_nohz_handler+0x100/0x100 update_process_times+0x5b/0x90 tick_sched_timer+0x5e/0xf0 __hrtimer_run_queues+0x12a/0x2a0 hrtimer_interrupt+0x10e/0x280 __sysvec_apic_timer_interrupt+0x51/0xe0 asm_call_on_stack+0xf/0x20 sysvec_apic_timer_interrupt+0x6f/0x80 ... task_file_seq_next+0x52/0xa0 bpf_seq_read+0xb9/0x320 vfs_read+0x9d/0x180 ksys_read+0x5f/0xe0 do_syscall_64+0x38/0x60 entry_SYSCALL_64_after_hwframe+0x44/0xa9 The fix is to limit the number of bpf program runs to be one million. This fixed the program in most cases. But we also found under heavy load, which can increase the wallclock time for bpf_seq_read(), the warning may still be possible. For example, calling bpf_delay() in the "while" loop of bpf_seq_read(), which will introduce artificial delay, the warning will show up in my qemu run. static unsigned q; volatile unsigned *p = &q; volatile unsigned long long ll; static void bpf_delay(void) { int i, j; for (i = 0; i < 10000; i++) for (j = 0; j < 10000; j++) ll += *p; } There are two ways to fix this issue. One is to reduce the above one million threshold to say 100,000 and hopefully rcu warning will not show up any more. Another is to introduce a target feature which enables bpf_seq_read() calling cond_resched(). This patch took second approach as the first approach may cause more -EAGAIN failures for read() syscalls. Note that not all bpf_iter targets can permit cond_resched() in bpf_seq_read() as some, e.g., netlink seq iterator, rcu read lock critical section spans through seq_ops->next() -> seq_ops->show() -> seq_ops->next(). For the kernel code with the above hack, "bpftool p" roughly takes 38 seconds to finish on my VM with 184 bpf program runs. Using the following command, I am able to collect the number of context switches: perf stat -e context-switches -- ./bpftool p >& log Without this patch, 69 context-switches With this patch, 75 context-switches This patch added additional 6 context switches, roughly every 6 seconds to reschedule, to avoid lengthy no-rescheduling which may cause the above RCU warnings. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201028061054.1411116-1-yhs@fb.com --- include/linux/bpf.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2b16bf48aab6..2fffd30e13ac 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1294,6 +1294,10 @@ typedef void (*bpf_iter_show_fdinfo_t) (const struct bpf_iter_aux_info *aux, typedef int (*bpf_iter_fill_link_info_t)(const struct bpf_iter_aux_info *aux, struct bpf_link_info *info); +enum bpf_iter_feature { + BPF_ITER_RESCHED = BIT(0), +}; + #define BPF_ITER_CTX_ARG_MAX 2 struct bpf_iter_reg { const char *target; @@ -1302,6 +1306,7 @@ struct bpf_iter_reg { bpf_iter_show_fdinfo_t show_fdinfo; bpf_iter_fill_link_info_t fill_link_info; u32 ctx_arg_info_size; + u32 feature; struct bpf_ctx_arg_aux ctx_arg_info[BPF_ITER_CTX_ARG_MAX]; const struct bpf_iter_seq_info *seq_info; }; -- cgit v1.2.3 From 4cf1bc1f10452065a29d576fc5693fc4fab5b919 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Fri, 6 Nov 2020 10:37:40 +0000 Subject: bpf: Implement task local storage Similar to bpf_local_storage for sockets and inodes add local storage for task_struct. The life-cycle of storage is managed with the life-cycle of the task_struct. i.e. the storage is destroyed along with the owning task with a callback to the bpf_task_storage_free from the task_free LSM hook. The BPF LSM allocates an __rcu pointer to the bpf_local_storage in the security blob which are now stackable and can co-exist with other LSMs. The userspace map operations can be done by using a pid fd as a key passed to the lookup, update and delete operations. Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20201106103747.2780972-3-kpsingh@chromium.org --- include/linux/bpf_lsm.h | 23 +++++++++++++++++++++++ include/linux/bpf_types.h | 1 + 2 files changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h index aaacb6aafc87..73226181b744 100644 --- a/include/linux/bpf_lsm.h +++ b/include/linux/bpf_lsm.h @@ -7,6 +7,7 @@ #ifndef _LINUX_BPF_LSM_H #define _LINUX_BPF_LSM_H +#include #include #include @@ -35,9 +36,21 @@ static inline struct bpf_storage_blob *bpf_inode( return inode->i_security + bpf_lsm_blob_sizes.lbs_inode; } +static inline struct bpf_storage_blob *bpf_task( + const struct task_struct *task) +{ + if (unlikely(!task->security)) + return NULL; + + return task->security + bpf_lsm_blob_sizes.lbs_task; +} + extern const struct bpf_func_proto bpf_inode_storage_get_proto; extern const struct bpf_func_proto bpf_inode_storage_delete_proto; +extern const struct bpf_func_proto bpf_task_storage_get_proto; +extern const struct bpf_func_proto bpf_task_storage_delete_proto; void bpf_inode_storage_free(struct inode *inode); +void bpf_task_storage_free(struct task_struct *task); #else /* !CONFIG_BPF_LSM */ @@ -53,10 +66,20 @@ static inline struct bpf_storage_blob *bpf_inode( return NULL; } +static inline struct bpf_storage_blob *bpf_task( + const struct task_struct *task) +{ + return NULL; +} + static inline void bpf_inode_storage_free(struct inode *inode) { } +static inline void bpf_task_storage_free(struct task_struct *task) +{ +} + #endif /* CONFIG_BPF_LSM */ #endif /* _LINUX_BPF_LSM_H */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 2e6f568377f1..99f7fd657d87 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -109,6 +109,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops) #endif #ifdef CONFIG_BPF_LSM BPF_MAP_TYPE(BPF_MAP_TYPE_INODE_STORAGE, inode_storage_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_TASK_STORAGE, task_storage_map_ops) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) #if defined(CONFIG_XDP_SOCKETS) -- cgit v1.2.3 From 3ca1032ab7ab010eccb107aa515598788f7d93bb Mon Sep 17 00:00:00 2001 From: KP Singh Date: Fri, 6 Nov 2020 10:37:43 +0000 Subject: bpf: Implement get_current_task_btf and RET_PTR_TO_BTF_ID The currently available bpf_get_current_task returns an unsigned integer which can be used along with BPF_CORE_READ to read data from the task_struct but still cannot be used as an input argument to a helper that accepts an ARG_PTR_TO_BTF_ID of type task_struct. In order to implement this helper a new return type, RET_PTR_TO_BTF_ID, is added. This is similar to RET_PTR_TO_BTF_ID_OR_NULL but does not require checking the nullness of returned pointer. Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20201106103747.2780972-6-kpsingh@chromium.org --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2fffd30e13ac..73d5381a5d5c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -310,6 +310,7 @@ enum bpf_return_type { RET_PTR_TO_BTF_ID_OR_NULL, /* returns a pointer to a btf_id or NULL */ RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, /* returns a pointer to a valid memory or a btf_id or NULL */ RET_PTR_TO_MEM_OR_BTF_ID, /* returns a pointer to a valid memory or a btf_id */ + RET_PTR_TO_BTF_ID, /* returns a pointer to a btf_id */ }; /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs -- cgit v1.2.3 From 36e68442d1afd4f720704ee1ea8486331507e834 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 9 Nov 2020 17:19:31 -0800 Subject: bpf: Load and verify kernel module BTFs Add kernel module listener that will load/validate and unload module BTF. Module BTFs gets ID generated for them, which makes it possible to iterate them with existing BTF iteration API. They are given their respective module's names, which will get reported through GET_OBJ_INFO API. They are also marked as in-kernel BTFs for tooling to distinguish them from user-provided BTFs. Also, similarly to vmlinux BTF, kernel module BTFs are exposed through sysfs as /sys/kernel/btf/. This is convenient for user-space tools to inspect module BTF contents and dump their types with existing tools: [vmuser@archvm bpf]$ ls -la /sys/kernel/btf total 0 drwxr-xr-x 2 root root 0 Nov 4 19:46 . drwxr-xr-x 13 root root 0 Nov 4 19:46 .. ... -r--r--r-- 1 root root 888 Nov 4 19:46 irqbypass -r--r--r-- 1 root root 100225 Nov 4 19:46 kvm -r--r--r-- 1 root root 35401 Nov 4 19:46 kvm_intel -r--r--r-- 1 root root 120 Nov 4 19:46 pcspkr -r--r--r-- 1 root root 399 Nov 4 19:46 serio_raw -r--r--r-- 1 root root 4094095 Nov 4 19:46 vmlinux Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/bpf/20201110011932.3201430-5-andrii@kernel.org --- include/linux/bpf.h | 2 ++ include/linux/module.h | 4 ++++ 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 73d5381a5d5c..581b2a2e78eb 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -36,9 +36,11 @@ struct seq_operations; struct bpf_iter_aux_info; struct bpf_local_storage; struct bpf_local_storage_map; +struct kobject; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; +extern struct kobject *btf_kobj; typedef int (*bpf_iter_init_seq_priv_t)(void *private_data, struct bpf_iter_aux_info *aux); diff --git a/include/linux/module.h b/include/linux/module.h index a29187f7c360..20fce258ffba 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -475,6 +475,10 @@ struct module { unsigned int num_bpf_raw_events; struct bpf_raw_event_map *bpf_raw_events; #endif +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES + unsigned int btf_data_size; + void *btf_data; +#endif #ifdef CONFIG_JUMP_LABEL struct jump_entry *jump_entries; unsigned int num_jump_entries; -- cgit v1.2.3 From 6d94e741a8ff818e5518da8257f5ca0aaed1f269 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 10 Nov 2020 19:12:11 -0800 Subject: bpf: Support for pointers beyond pkt_end. This patch adds the verifier support to recognize inlined branch conditions. The LLVM knows that the branch evaluates to the same value, but the verifier couldn't track it. Hence causing valid programs to be rejected. The potential LLVM workaround: https://reviews.llvm.org/D87428 can have undesired side effects, since LLVM doesn't know that skb->data/data_end are being compared. LLVM has to introduce extra boolean variable and use inline_asm trick to force easier for the verifier assembly. Instead teach the verifier to recognize that r1 = skb->data; r1 += 10; r2 = skb->data_end; if (r1 > r2) { here r1 points beyond packet_end and subsequent if (r1 > r2) // always evaluates to "true". } Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Tested-by: Jiri Olsa Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20201111031213.25109-2-alexei.starovoitov@gmail.com --- include/linux/bpf_verifier.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index e83ef6f6bf43..306869d4743b 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -45,7 +45,7 @@ struct bpf_reg_state { enum bpf_reg_type type; union { /* valid when type == PTR_TO_PACKET */ - u16 range; + int range; /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE | * PTR_TO_MAP_VALUE_OR_NULL -- cgit v1.2.3 From 423f16108c9d832bd96059d5c882c8ef6d76eb96 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Fri, 13 Nov 2020 00:59:29 +0000 Subject: bpf: Augment the set of sleepable LSM hooks Update the set of sleepable hooks with the ones that do not trigger a warning with might_fault() when exercised with the correct kernel config options enabled, i.e. DEBUG_ATOMIC_SLEEP=y LOCKDEP=y PROVE_LOCKING=y This means that a sleepable LSM eBPF program can be attached to these LSM hooks. A new helper method bpf_lsm_is_sleepable_hook is added and the set is maintained locally in bpf_lsm.c Signed-off-by: KP Singh Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20201113005930.541956-2-kpsingh@chromium.org --- include/linux/bpf_lsm.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h index 73226181b744..0d1c33ace398 100644 --- a/include/linux/bpf_lsm.h +++ b/include/linux/bpf_lsm.h @@ -27,6 +27,8 @@ extern struct lsm_blob_sizes bpf_lsm_blob_sizes; int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog); +bool bpf_lsm_is_sleepable_hook(u32 btf_id); + static inline struct bpf_storage_blob *bpf_inode( const struct inode *inode) { @@ -54,6 +56,11 @@ void bpf_task_storage_free(struct task_struct *task); #else /* !CONFIG_BPF_LSM */ +static inline bool bpf_lsm_is_sleepable_hook(u32 btf_id) +{ + return false; +} + static inline int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog) { -- cgit v1.2.3