From 5447e8e01e101ba19fe5b7551f02d37367156f6b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 5 May 2020 16:07:12 +0200 Subject: sysctl: Fix unused function warning The newly added bpf_stats_handler function has the wrong #ifdef check around it, leading to an unused-function warning when CONFIG_SYSCTL is disabled: kernel/sysctl.c:205:12: error: unused function 'bpf_stats_handler' [-Werror,-Wunused-function] static int bpf_stats_handler(struct ctl_table *table, int write, Fix the check to match the reference. Fixes: d46edd671a14 ("bpf: Sharing bpf runtime stats with BPF_ENABLE_STATS") Signed-off-by: Arnd Bergmann Signed-off-by: Alexei Starovoitov Reviewed-by: Luis Chamberlain Acked-by: Martin KaFai Lau Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200505140734.503701-1-arnd@arndb.de --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7adfe5dbce9d..17c7633d90fc 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -201,7 +201,7 @@ static int max_extfrag_threshold = 1000; #endif /* CONFIG_SYSCTL */ -#ifdef CONFIG_BPF_SYSCALL +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_SYSCTL) static int bpf_stats_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) -- cgit v1.2.3 From ae24345da54e452880808b011fa2d8a0bbd191ba Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:58:59 -0700 Subject: bpf: Implement an interface to register bpf_iter targets The target can call bpf_iter_reg_target() to register itself. The needed information: target: target name seq_ops: the seq_file operations for the target init_seq_private target callback to initialize seq_priv during file open fini_seq_private target callback to clean up seq_priv during file release seq_priv_size: the private_data size needed by the seq_file operations The target name represents a target which provides a seq_ops for iterating objects. 
The target can provide two callback functions, init_seq_private and fini_seq_private, called during file open/release time. For example, /proc/net/{tcp6, ipv6_route, netlink, ...}, net name space needs to be setup properly during file open and released properly during file release. Function bpf_iter_unreg_target() is also implemented to unregister a particular target. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175859.2474669-1-yhs@fb.com --- kernel/bpf/Makefile | 2 +- kernel/bpf/bpf_iter.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/bpf_iter.c (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index f2d7be596966..6a8b0febd3f6 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -2,7 +2,7 @@ obj-y := core.o CFLAGS_core.o += $(call cc-disable-warning, override-init) -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c new file mode 100644 index 000000000000..5a8119d17d14 --- /dev/null +++ b/kernel/bpf/bpf_iter.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2020 Facebook */ + +#include +#include +#include + +struct bpf_iter_target_info { + struct list_head list; + const char *target; + const struct seq_operations *seq_ops; + bpf_iter_init_seq_priv_t init_seq_private; + bpf_iter_fini_seq_priv_t fini_seq_private; + u32 seq_priv_size; +}; + +static struct list_head targets = LIST_HEAD_INIT(targets); +static DEFINE_MUTEX(targets_mutex); + +int 
bpf_iter_reg_target(struct bpf_iter_reg *reg_info) +{ + struct bpf_iter_target_info *tinfo; + + tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); + if (!tinfo) + return -ENOMEM; + + tinfo->target = reg_info->target; + tinfo->seq_ops = reg_info->seq_ops; + tinfo->init_seq_private = reg_info->init_seq_private; + tinfo->fini_seq_private = reg_info->fini_seq_private; + tinfo->seq_priv_size = reg_info->seq_priv_size; + INIT_LIST_HEAD(&tinfo->list); + + mutex_lock(&targets_mutex); + list_add(&tinfo->list, &targets); + mutex_unlock(&targets_mutex); + + return 0; +} + +void bpf_iter_unreg_target(const char *target) +{ + struct bpf_iter_target_info *tinfo; + bool found = false; + + mutex_lock(&targets_mutex); + list_for_each_entry(tinfo, &targets, list) { + if (!strcmp(target, tinfo->target)) { + list_del(&tinfo->list); + kfree(tinfo); + found = true; + break; + } + } + mutex_unlock(&targets_mutex); + + WARN_ON(found == false); +} -- cgit v1.2.3 From 15d83c4d7cef5c067a8b075ce59e97df4f60706e Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:00 -0700 Subject: bpf: Allow loading of a bpf_iter program A bpf_iter program is a tracing program with attach type BPF_TRACE_ITER. The load attribute attach_btf_id is used by the verifier against a particular kernel function, which represents a target, e.g., __bpf_iter__bpf_map for target bpf_map which is implemented later. The program return value must be 0 or 1 for now. 0 : successful, except potential seq_file buffer overflow which is handled by seq_file reader. 1 : request to restart the same object In the future, other return values may be used for filtering or terminating the iterator. 
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175900.2474947-1-yhs@fb.com --- kernel/bpf/bpf_iter.c | 36 ++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 21 +++++++++++++++++++++ 2 files changed, 57 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 5a8119d17d14..dec182d8395a 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -12,6 +12,7 @@ struct bpf_iter_target_info { bpf_iter_init_seq_priv_t init_seq_private; bpf_iter_fini_seq_priv_t fini_seq_private; u32 seq_priv_size; + u32 btf_id; /* cached value */ }; static struct list_head targets = LIST_HEAD_INIT(targets); @@ -57,3 +58,38 @@ void bpf_iter_unreg_target(const char *target) WARN_ON(found == false); } + +static void cache_btf_id(struct bpf_iter_target_info *tinfo, + struct bpf_prog *prog) +{ + tinfo->btf_id = prog->aux->attach_btf_id; +} + +bool bpf_iter_prog_supported(struct bpf_prog *prog) +{ + const char *attach_fname = prog->aux->attach_func_name; + u32 prog_btf_id = prog->aux->attach_btf_id; + const char *prefix = BPF_ITER_FUNC_PREFIX; + struct bpf_iter_target_info *tinfo; + int prefix_len = strlen(prefix); + bool supported = false; + + if (strncmp(attach_fname, prefix, prefix_len)) + return false; + + mutex_lock(&targets_mutex); + list_for_each_entry(tinfo, &targets, list) { + if (tinfo->btf_id && tinfo->btf_id == prog_btf_id) { + supported = true; + break; + } + if (!strcmp(attach_fname + prefix_len, tinfo->target)) { + cache_btf_id(tinfo, prog); + supported = true; + break; + } + } + mutex_unlock(&targets_mutex); + + return supported; +} diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 70ad009577f8..d725ff7d11db 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7101,6 +7101,10 @@ static int check_return_code(struct bpf_verifier_env *env) return 0; range = tnum_const(0); break; + case BPF_PROG_TYPE_TRACING: + 
if (env->prog->expected_attach_type != BPF_TRACE_ITER) + return 0; + break; default: return 0; } @@ -10481,6 +10485,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) struct bpf_prog *tgt_prog = prog->aux->linked_prog; u32 btf_id = prog->aux->attach_btf_id; const char prefix[] = "btf_trace_"; + struct btf_func_model fmodel; int ret = 0, subprog = -1, i; struct bpf_trampoline *tr; const struct btf_type *t; @@ -10622,6 +10627,22 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) prog->aux->attach_func_proto = t; prog->aux->attach_btf_trace = true; return 0; + case BPF_TRACE_ITER: + if (!btf_type_is_func(t)) { + verbose(env, "attach_btf_id %u is not a function\n", + btf_id); + return -EINVAL; + } + t = btf_type_by_id(btf, t->type); + if (!btf_type_is_func_proto(t)) + return -EINVAL; + prog->aux->attach_func_name = tname; + prog->aux->attach_func_proto = t; + if (!bpf_iter_prog_supported(prog)) + return -EINVAL; + ret = btf_distill_func_proto(&env->log, btf, t, + tname, &fmodel); + return ret; default: if (!prog_extension) return -EINVAL; -- cgit v1.2.3 From de4e05cac46d206f9090051ef09930514bff73e4 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:01 -0700 Subject: bpf: Support bpf tracing/iter programs for BPF_LINK_CREATE Given a bpf program, the step to create an anonymous bpf iterator is: - create a bpf_iter_link, which combines bpf program and the target. In the future, there could be more information recorded in the link. A link_fd will be returned to the user space. - create an anonymous bpf iterator with the given link_fd. The bpf_iter_link can be pinned to bpffs mount file system to create a file based bpf iterator as well. The benefit to use of bpf_iter_link: - using bpf link simplifies design and implementation as bpf link is used for other tracing bpf programs. - for file based bpf iterator, bpf_iter_link provides a standard way to replace underlying bpf programs. 
- for both anonymous and file based iterators, bpf link query capability can be leveraged. The patch added support of tracing/iter programs for BPF_LINK_CREATE. A new link type BPF_LINK_TYPE_ITER is added to facilitate link querying. Currently, only prog_id is needed, so no additional in-kernel show_fdinfo() and fill_link_info() hook is needed for BPF_LINK_TYPE_ITER link. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175901.2475084-1-yhs@fb.com --- kernel/bpf/bpf_iter.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 14 ++++++++++++ 2 files changed, 76 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index dec182d8395a..03f5832909db 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -15,6 +15,11 @@ struct bpf_iter_target_info { u32 btf_id; /* cached value */ }; +struct bpf_iter_link { + struct bpf_link link; + struct bpf_iter_target_info *tinfo; +}; + static struct list_head targets = LIST_HEAD_INIT(targets); static DEFINE_MUTEX(targets_mutex); @@ -93,3 +98,60 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog) return supported; } + +static void bpf_iter_link_release(struct bpf_link *link) +{ +} + +static void bpf_iter_link_dealloc(struct bpf_link *link) +{ + struct bpf_iter_link *iter_link = + container_of(link, struct bpf_iter_link, link); + + kfree(iter_link); +} + +static const struct bpf_link_ops bpf_iter_link_lops = { + .release = bpf_iter_link_release, + .dealloc = bpf_iter_link_dealloc, +}; + +int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct bpf_link_primer link_primer; + struct bpf_iter_target_info *tinfo; + struct bpf_iter_link *link; + bool existed = false; + u32 prog_btf_id; + int err; + + if (attr->link_create.target_fd || attr->link_create.flags) + return -EINVAL; + + prog_btf_id = prog->aux->attach_btf_id; + 
mutex_lock(&targets_mutex); + list_for_each_entry(tinfo, &targets, list) { + if (tinfo->btf_id == prog_btf_id) { + existed = true; + break; + } + } + mutex_unlock(&targets_mutex); + if (!existed) + return -ENOENT; + + link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN); + if (!link) + return -ENOMEM; + + bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog); + link->tinfo = tinfo; + + err = bpf_link_prime(&link->link, &link_primer); + if (err) { + kfree(link); + return err; + } + + return bpf_link_settle(&link_primer); +} diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index bb1ab7da6103..6ffe2d8fb6c7 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2729,6 +2729,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_CGROUP_GETSOCKOPT: case BPF_CGROUP_SETSOCKOPT: return BPF_PROG_TYPE_CGROUP_SOCKOPT; + case BPF_TRACE_ITER: + return BPF_PROG_TYPE_TRACING; default: return BPF_PROG_TYPE_UNSPEC; } @@ -3729,6 +3731,15 @@ err_put: return err; } +static int tracing_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + if (attr->link_create.attach_type == BPF_TRACE_ITER && + prog->expected_attach_type == BPF_TRACE_ITER) + return bpf_iter_link_attach(attr, prog); + + return -EINVAL; +} + #define BPF_LINK_CREATE_LAST_FIELD link_create.flags static int link_create(union bpf_attr *attr) { @@ -3765,6 +3776,9 @@ static int link_create(union bpf_attr *attr) case BPF_PROG_TYPE_CGROUP_SOCKOPT: ret = cgroup_bpf_link_attach(attr, prog); break; + case BPF_PROG_TYPE_TRACING: + ret = tracing_bpf_link_attach(attr, prog); + break; default: ret = -EINVAL; } -- cgit v1.2.3 From 2057c92bc927f09b22f5609425eb37d7e782f484 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:02 -0700 Subject: bpf: Support bpf tracing/iter programs for BPF_LINK_UPDATE Added BPF_LINK_UPDATE support for tracing/iter programs. 
This way, a file based bpf iterator, which holds a reference to the link, can have its bpf program updated without creating new files. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175902.2475262-1-yhs@fb.com --- kernel/bpf/bpf_iter.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 03f5832909db..0542a243b78c 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -23,6 +23,9 @@ struct bpf_iter_link { static struct list_head targets = LIST_HEAD_INIT(targets); static DEFINE_MUTEX(targets_mutex); +/* protect bpf_iter_link changes */ +static DEFINE_MUTEX(link_mutex); + int bpf_iter_reg_target(struct bpf_iter_reg *reg_info) { struct bpf_iter_target_info *tinfo; @@ -111,9 +114,37 @@ static void bpf_iter_link_dealloc(struct bpf_link *link) kfree(iter_link); } +static int bpf_iter_link_replace(struct bpf_link *link, + struct bpf_prog *new_prog, + struct bpf_prog *old_prog) +{ + int ret = 0; + + mutex_lock(&link_mutex); + if (old_prog && link->prog != old_prog) { + ret = -EPERM; + goto out_unlock; + } + + if (link->prog->type != new_prog->type || + link->prog->expected_attach_type != new_prog->expected_attach_type || + link->prog->aux->attach_btf_id != new_prog->aux->attach_btf_id) { + ret = -EINVAL; + goto out_unlock; + } + + old_prog = xchg(&link->prog, new_prog); + bpf_prog_put(old_prog); + +out_unlock: + mutex_unlock(&link_mutex); + return ret; +} + static const struct bpf_link_ops bpf_iter_link_lops = { .release = bpf_iter_link_release, .dealloc = bpf_iter_link_dealloc, + .update_prog = bpf_iter_link_replace, }; int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) -- cgit v1.2.3 From fd4f12bc38c3ad9107169e7c9e6e7f81d93dda97 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:04 -0700 Subject: bpf: Implement bpf_seq_read() for 
bpf iterator bpf iterator uses seq_file to provide a lossless way to transfer data to user space. But we want to call bpf program after all objects have been traversed, and bpf program may write additional data to the seq_file buffer. The current seq_read() does not work for this use case. Besides allowing stop() function to write to the buffer, the bpf_seq_read() also fixed the buffer size to one page. If any single call of show() or stop() will emit data more than one page to cause overflow, -E2BIG error code will be returned to user space. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175904.2475468-1-yhs@fb.com --- kernel/bpf/bpf_iter.c | 123 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 0542a243b78c..832973ee80fa 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -26,6 +26,129 @@ static DEFINE_MUTEX(targets_mutex); /* protect bpf_iter_link changes */ static DEFINE_MUTEX(link_mutex); +/* bpf_seq_read, a customized and simpler version for bpf iterator. + * no_llseek is assumed for this file. + * The following are differences from seq_read(): + * . fixed buffer size (PAGE_SIZE) + * . assuming no_llseek + * . 
stop() may call bpf program, handling potential overflow there + */ +static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size, + loff_t *ppos) +{ + struct seq_file *seq = file->private_data; + size_t n, offs, copied = 0; + int err = 0; + void *p; + + mutex_lock(&seq->lock); + + if (!seq->buf) { + seq->size = PAGE_SIZE; + seq->buf = kmalloc(seq->size, GFP_KERNEL); + if (!seq->buf) { + err = -ENOMEM; + goto done; + } + } + + if (seq->count) { + n = min(seq->count, size); + err = copy_to_user(buf, seq->buf + seq->from, n); + if (err) { + err = -EFAULT; + goto done; + } + seq->count -= n; + seq->from += n; + copied = n; + goto done; + } + + seq->from = 0; + p = seq->op->start(seq, &seq->index); + if (!p) + goto stop; + if (IS_ERR(p)) { + err = PTR_ERR(p); + seq->op->stop(seq, p); + seq->count = 0; + goto done; + } + + err = seq->op->show(seq, p); + if (err > 0) { + seq->count = 0; + } else if (err < 0 || seq_has_overflowed(seq)) { + if (!err) + err = -E2BIG; + seq->op->stop(seq, p); + seq->count = 0; + goto done; + } + + while (1) { + loff_t pos = seq->index; + + offs = seq->count; + p = seq->op->next(seq, p, &seq->index); + if (pos == seq->index) { + pr_info_ratelimited("buggy seq_file .next function %ps " + "did not updated position index\n", + seq->op->next); + seq->index++; + } + + if (IS_ERR_OR_NULL(p)) + break; + + if (seq->count >= size) + break; + + err = seq->op->show(seq, p); + if (err > 0) { + seq->count = offs; + } else if (err < 0 || seq_has_overflowed(seq)) { + seq->count = offs; + if (offs == 0) { + if (!err) + err = -E2BIG; + seq->op->stop(seq, p); + goto done; + } + break; + } + } +stop: + offs = seq->count; + /* bpf program called if !p */ + seq->op->stop(seq, p); + if (!p && seq_has_overflowed(seq)) { + seq->count = offs; + if (offs == 0) { + err = -E2BIG; + goto done; + } + } + + n = min(seq->count, size); + err = copy_to_user(buf, seq->buf, n); + if (err) { + err = -EFAULT; + goto done; + } + copied = n; + seq->count -= n; + 
seq->from = n; +done: + if (!copied) + copied = err; + else + *ppos += copied; + mutex_unlock(&seq->lock); + return copied; +} + int bpf_iter_reg_target(struct bpf_iter_reg *reg_info) { struct bpf_iter_target_info *tinfo; -- cgit v1.2.3 From ac51d99bf81caac8d8881fe52098948110d0de68 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:05 -0700 Subject: bpf: Create anonymous bpf iterator A new bpf command BPF_ITER_CREATE is added. The anonymous bpf iterator is seq_file based. The seq_file private data are referenced by targets. The bpf_iter infrastructure allocated additional space at seq_file->private before the space used by targets to store some meta data, e.g., prog: prog to run session_id: a unique id for each opened seq_file seq_num: how many times bpf programs are queried in this session done_stop: an internal state to decide whether bpf program should be called in seq_ops->stop() or not The seq_num will start from 0 for valid objects. The bpf program may see the same seq_num more than once if - seq_file buffer overflow happens and the same object is retried by bpf_seq_read(), or - the bpf program explicitly requests a retry of the same object Since module is not supported for bpf_iter, all target registration happens at __init time, so there is no need to change bpf_iter_unreg_target() as it is used mostly in error path of the init function at which time no bpf iterators have been created yet. 
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175905.2475770-1-yhs@fb.com --- kernel/bpf/bpf_iter.c | 129 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 26 ++++++++++ 2 files changed, 155 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 832973ee80fa..e7129b57865f 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -2,6 +2,7 @@ /* Copyright (c) 2020 Facebook */ #include +#include #include #include @@ -20,12 +21,24 @@ struct bpf_iter_link { struct bpf_iter_target_info *tinfo; }; +struct bpf_iter_priv_data { + struct bpf_iter_target_info *tinfo; + struct bpf_prog *prog; + u64 session_id; + u64 seq_num; + bool done_stop; + u8 target_private[] __aligned(8); +}; + static struct list_head targets = LIST_HEAD_INIT(targets); static DEFINE_MUTEX(targets_mutex); /* protect bpf_iter_link changes */ static DEFINE_MUTEX(link_mutex); +/* incremented on every opened seq_file */ +static atomic64_t session_id; + /* bpf_seq_read, a customized and simpler version for bpf iterator. * no_llseek is assumed for this file. 
* The following are differences from seq_read(): @@ -149,6 +162,33 @@ done: return copied; } +static int iter_release(struct inode *inode, struct file *file) +{ + struct bpf_iter_priv_data *iter_priv; + struct seq_file *seq; + + seq = file->private_data; + if (!seq) + return 0; + + iter_priv = container_of(seq->private, struct bpf_iter_priv_data, + target_private); + + if (iter_priv->tinfo->fini_seq_private) + iter_priv->tinfo->fini_seq_private(seq->private); + + bpf_prog_put(iter_priv->prog); + seq->private = iter_priv; + + return seq_release_private(inode, file); +} + +static const struct file_operations bpf_iter_fops = { + .llseek = no_llseek, + .read = bpf_seq_read, + .release = iter_release, +}; + int bpf_iter_reg_target(struct bpf_iter_reg *reg_info) { struct bpf_iter_target_info *tinfo; @@ -309,3 +349,92 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) return bpf_link_settle(&link_primer); } + +static void init_seq_meta(struct bpf_iter_priv_data *priv_data, + struct bpf_iter_target_info *tinfo, + struct bpf_prog *prog) +{ + priv_data->tinfo = tinfo; + priv_data->prog = prog; + priv_data->session_id = atomic64_inc_return(&session_id); + priv_data->seq_num = 0; + priv_data->done_stop = false; +} + +static int prepare_seq_file(struct file *file, struct bpf_iter_link *link) +{ + struct bpf_iter_priv_data *priv_data; + struct bpf_iter_target_info *tinfo; + struct bpf_prog *prog; + u32 total_priv_dsize; + struct seq_file *seq; + int err = 0; + + mutex_lock(&link_mutex); + prog = link->link.prog; + bpf_prog_inc(prog); + mutex_unlock(&link_mutex); + + tinfo = link->tinfo; + total_priv_dsize = offsetof(struct bpf_iter_priv_data, target_private) + + tinfo->seq_priv_size; + priv_data = __seq_open_private(file, tinfo->seq_ops, total_priv_dsize); + if (!priv_data) { + err = -ENOMEM; + goto release_prog; + } + + if (tinfo->init_seq_private) { + err = tinfo->init_seq_private(priv_data->target_private); + if (err) + goto release_seq_file; + } + 
+ init_seq_meta(priv_data, tinfo, prog); + seq = file->private_data; + seq->private = priv_data->target_private; + + return 0; + +release_seq_file: + seq_release_private(file->f_inode, file); + file->private_data = NULL; +release_prog: + bpf_prog_put(prog); + return err; +} + +int bpf_iter_new_fd(struct bpf_link *link) +{ + struct file *file; + unsigned int flags; + int err, fd; + + if (link->ops != &bpf_iter_link_lops) + return -EINVAL; + + flags = O_RDONLY | O_CLOEXEC; + fd = get_unused_fd_flags(flags); + if (fd < 0) + return fd; + + file = anon_inode_getfile("bpf_iter", &bpf_iter_fops, NULL, flags); + if (IS_ERR(file)) { + err = PTR_ERR(file); + goto free_fd; + } + + err = prepare_seq_file(file, + container_of(link, struct bpf_iter_link, link)); + if (err) + goto free_file; + + fd_install(fd, file); + return fd; + +free_file: + fput(file); +free_fd: + put_unused_fd(fd); + return err; +} diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6ffe2d8fb6c7..a293e88ee01a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3941,6 +3941,29 @@ static int bpf_enable_stats(union bpf_attr *attr) return -EINVAL; } +#define BPF_ITER_CREATE_LAST_FIELD iter_create.flags + +static int bpf_iter_create(union bpf_attr *attr) +{ + struct bpf_link *link; + int err; + + if (CHECK_ATTR(BPF_ITER_CREATE)) + return -EINVAL; + + if (attr->iter_create.flags) + return -EINVAL; + + link = bpf_link_get_from_fd(attr->iter_create.link_fd); + if (IS_ERR(link)) + return PTR_ERR(link); + + err = bpf_iter_new_fd(link); + bpf_link_put(link); + + return err; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr; @@ -4068,6 +4091,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_ENABLE_STATS: err = bpf_enable_stats(&attr); break; + case BPF_ITER_CREATE: + err = bpf_iter_create(&attr); + break; default: err = -EINVAL; break; -- cgit v1.2.3 From 
367ec3e4834cbd611401c2c40a23c22c825474f1 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:06 -0700 Subject: bpf: Create file bpf iterator To produce a file bpf iterator, the fd must correspond to a link_fd associated with a trace/iter program. When the pinned file is opened, a seq_file will be generated. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175906.2475893-1-yhs@fb.com --- kernel/bpf/bpf_iter.c | 17 ++++++++++++++++- kernel/bpf/inode.c | 5 ++++- 2 files changed, 20 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index e7129b57865f..090f09b0eacb 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -39,6 +39,8 @@ static DEFINE_MUTEX(link_mutex); /* incremented on every opened seq_file */ static atomic64_t session_id; +static int prepare_seq_file(struct file *file, struct bpf_iter_link *link); + /* bpf_seq_read, a customized and simpler version for bpf iterator. * no_llseek is assumed for this file. 
* The following are differences from seq_read(): @@ -162,6 +164,13 @@ done: return copied; } +static int iter_open(struct inode *inode, struct file *file) +{ + struct bpf_iter_link *link = inode->i_private; + + return prepare_seq_file(file, link); +} + static int iter_release(struct inode *inode, struct file *file) { struct bpf_iter_priv_data *iter_priv; @@ -183,7 +192,8 @@ static int iter_release(struct inode *inode, struct file *file) return seq_release_private(inode, file); } -static const struct file_operations bpf_iter_fops = { +const struct file_operations bpf_iter_fops = { + .open = iter_open, .llseek = no_llseek, .read = bpf_seq_read, .release = iter_release, @@ -310,6 +320,11 @@ static const struct bpf_link_ops bpf_iter_link_lops = { .update_prog = bpf_iter_link_replace, }; +bool bpf_link_is_iter(struct bpf_link *link) +{ + return link->ops == &bpf_iter_link_lops; +} + int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_link_primer link_primer; diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 95087d9f4ed3..fb878ba3f22f 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -358,8 +358,11 @@ static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg) static int bpf_mklink(struct dentry *dentry, umode_t mode, void *arg) { + struct bpf_link *link = arg; + return bpf_mkobj_ops(dentry, mode, arg, &bpf_link_iops, - &bpffs_obj_fops); + bpf_link_is_iter(link) ? + &bpf_iter_fops : &bpffs_obj_fops); } static struct dentry * -- cgit v1.2.3 From e5158d987b72c3f318b4b52a01ac6f3997bd0c00 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:07 -0700 Subject: bpf: Implement common macros/helpers for target iterators Macro DEFINE_BPF_ITER_FUNC is implemented so target can define an init function to capture the BTF type which represents the target. The bpf_iter_meta is a structure holding meta data, common to all targets in the bpf program. 
Additional marker functions are called before or after bpf_seq_read() show()/next()/stop() callback functions to help calculate precise seq_num and whether call bpf_prog inside stop(). Two functions, bpf_iter_get_info() and bpf_iter_run_prog(), are implemented so target can get needed information from bpf_iter infrastructure and can run the program. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175907.2475956-1-yhs@fb.com --- kernel/bpf/bpf_iter.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 80 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 090f09b0eacb..30efd15cd4a0 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -41,6 +41,33 @@ static atomic64_t session_id; static int prepare_seq_file(struct file *file, struct bpf_iter_link *link); +static void bpf_iter_inc_seq_num(struct seq_file *seq) +{ + struct bpf_iter_priv_data *iter_priv; + + iter_priv = container_of(seq->private, struct bpf_iter_priv_data, + target_private); + iter_priv->seq_num++; +} + +static void bpf_iter_dec_seq_num(struct seq_file *seq) +{ + struct bpf_iter_priv_data *iter_priv; + + iter_priv = container_of(seq->private, struct bpf_iter_priv_data, + target_private); + iter_priv->seq_num--; +} + +static void bpf_iter_done_stop(struct seq_file *seq) +{ + struct bpf_iter_priv_data *iter_priv; + + iter_priv = container_of(seq->private, struct bpf_iter_priv_data, + target_private); + iter_priv->done_stop = true; +} + /* bpf_seq_read, a customized and simpler version for bpf iterator. * no_llseek is assumed for this file. * The following are differences from seq_read(): @@ -93,6 +120,10 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size, err = seq->op->show(seq, p); if (err > 0) { + /* object is skipped, decrease seq_num, so next + * valid object can reuse the same seq_num. 
+ */ + bpf_iter_dec_seq_num(seq); seq->count = 0; } else if (err < 0 || seq_has_overflowed(seq)) { if (!err) @@ -117,11 +148,15 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size, if (IS_ERR_OR_NULL(p)) break; + /* got a valid next object, increase seq_num */ + bpf_iter_inc_seq_num(seq); + if (seq->count >= size) break; err = seq->op->show(seq, p); if (err > 0) { + bpf_iter_dec_seq_num(seq); seq->count = offs; } else if (err < 0 || seq_has_overflowed(seq)) { seq->count = offs; @@ -138,11 +173,15 @@ stop: offs = seq->count; /* bpf program called if !p */ seq->op->stop(seq, p); - if (!p && seq_has_overflowed(seq)) { - seq->count = offs; - if (offs == 0) { - err = -E2BIG; - goto done; + if (!p) { + if (!seq_has_overflowed(seq)) { + bpf_iter_done_stop(seq); + } else { + seq->count = offs; + if (offs == 0) { + err = -E2BIG; + goto done; + } } } @@ -453,3 +492,39 @@ free_fd: put_unused_fd(fd); return err; } + +struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop) +{ + struct bpf_iter_priv_data *iter_priv; + struct seq_file *seq; + void *seq_priv; + + seq = meta->seq; + if (seq->file->f_op != &bpf_iter_fops) + return NULL; + + seq_priv = seq->private; + iter_priv = container_of(seq_priv, struct bpf_iter_priv_data, + target_private); + + if (in_stop && iter_priv->done_stop) + return NULL; + + meta->session_id = iter_priv->session_id; + meta->seq_num = iter_priv->seq_num; + + return iter_priv->prog; +} + +int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx) +{ + int ret; + + rcu_read_lock(); + migrate_disable(); + ret = BPF_PROG_RUN(prog, ctx); + migrate_enable(); + rcu_read_unlock(); + + return ret == 0 ? 0 : -EAGAIN; +} -- cgit v1.2.3 From 6086d29def80edd78f9832ea6eafa74e3818f6a7 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:09 -0700 Subject: bpf: Add bpf_map iterator Implement seq_file operations to traverse all bpf_maps. 
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175909.2476096-1-yhs@fb.com --- kernel/bpf/Makefile | 2 +- kernel/bpf/map_iter.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 19 ++++++++++ 3 files changed, 117 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/map_iter.c (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 6a8b0febd3f6..b2b5eefc5254 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -2,7 +2,7 @@ obj-y := core.o CFLAGS_core.o += $(call cc-disable-warning, override-init) -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c new file mode 100644 index 000000000000..8162e0c00b9f --- /dev/null +++ b/kernel/bpf/map_iter.c @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2020 Facebook */ +#include +#include +#include +#include + +struct bpf_iter_seq_map_info { + u32 mid; +}; + +static void *bpf_map_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct bpf_iter_seq_map_info *info = seq->private; + struct bpf_map *map; + + map = bpf_map_get_curr_or_next(&info->mid); + if (!map) + return NULL; + + ++*pos; + return map; +} + +static void *bpf_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_iter_seq_map_info *info = seq->private; + struct bpf_map *map; + + ++*pos; + ++info->mid; + bpf_map_put((struct bpf_map *)v); + map = bpf_map_get_curr_or_next(&info->mid); + if (!map) + return NULL; + + return map; +} + +struct bpf_iter__bpf_map { + 
__bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct bpf_map *, map); +}; + +DEFINE_BPF_ITER_FUNC(bpf_map, struct bpf_iter_meta *meta, struct bpf_map *map) + +static int __bpf_map_seq_show(struct seq_file *seq, void *v, bool in_stop) +{ + struct bpf_iter__bpf_map ctx; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + int ret = 0; + + ctx.meta = &meta; + ctx.map = v; + meta.seq = seq; + prog = bpf_iter_get_info(&meta, in_stop); + if (prog) + ret = bpf_iter_run_prog(prog, &ctx); + + return ret; +} + +static int bpf_map_seq_show(struct seq_file *seq, void *v) +{ + return __bpf_map_seq_show(seq, v, false); +} + +static void bpf_map_seq_stop(struct seq_file *seq, void *v) +{ + if (!v) + (void)__bpf_map_seq_show(seq, v, true); + else + bpf_map_put((struct bpf_map *)v); +} + +static const struct seq_operations bpf_map_seq_ops = { + .start = bpf_map_seq_start, + .next = bpf_map_seq_next, + .stop = bpf_map_seq_stop, + .show = bpf_map_seq_show, +}; + +static int __init bpf_map_iter_init(void) +{ + struct bpf_iter_reg reg_info = { + .target = "bpf_map", + .seq_ops = &bpf_map_seq_ops, + .init_seq_private = NULL, + .fini_seq_private = NULL, + .seq_priv_size = sizeof(struct bpf_iter_seq_map_info), + }; + + return bpf_iter_reg_target(®_info); +} + +late_initcall(bpf_map_iter_init); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a293e88ee01a..de2a75500233 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2934,6 +2934,25 @@ static int bpf_obj_get_next_id(const union bpf_attr *attr, return err; } +struct bpf_map *bpf_map_get_curr_or_next(u32 *id) +{ + struct bpf_map *map; + + spin_lock_bh(&map_idr_lock); +again: + map = idr_get_next(&map_idr, id); + if (map) { + map = __bpf_map_inc_not_zero(map, false); + if (IS_ERR(map)) { + (*id)++; + goto again; + } + } + spin_unlock_bh(&map_idr_lock); + + return map; +} + #define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id struct bpf_prog *bpf_prog_by_id(u32 id) -- cgit v1.2.3 From 
eaaacd23910f2d7c4b22d43f591002cc217d294b Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:11 -0700 Subject: bpf: Add task and task/file iterator targets Only the tasks belonging to "current" pid namespace are enumerated. For task/file target, the bpf program will have access to struct task_struct *task u32 fd struct file *file where fd/file is an open file for the task. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175911.2476407-1-yhs@fb.com --- kernel/bpf/Makefile | 2 +- kernel/bpf/task_iter.c | 333 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 334 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/task_iter.c (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index b2b5eefc5254..37b2d8620153 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -2,7 +2,7 @@ obj-y := core.o CFLAGS_core.o += $(call cc-disable-warning, override-init) -obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c new file mode 100644 index 000000000000..aeed662d8451 --- /dev/null +++ b/kernel/bpf/task_iter.c @@ -0,0 +1,333 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2020 Facebook */ + +#include +#include +#include +#include +#include +#include + +struct bpf_iter_seq_task_common { + struct pid_namespace *ns; +}; + +struct bpf_iter_seq_task_info { + /* The first field must be struct bpf_iter_seq_task_common. + * this is assumed by {init, fini}_seq_pidns() callback functions. 
+ */ + struct bpf_iter_seq_task_common common; + u32 tid; +}; + +static struct task_struct *task_seq_get_next(struct pid_namespace *ns, + u32 *tid) +{ + struct task_struct *task = NULL; + struct pid *pid; + + rcu_read_lock(); + pid = idr_get_next(&ns->idr, tid); + if (pid) + task = get_pid_task(pid, PIDTYPE_PID); + rcu_read_unlock(); + + return task; +} + +static void *task_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct bpf_iter_seq_task_info *info = seq->private; + struct task_struct *task; + + task = task_seq_get_next(info->common.ns, &info->tid); + if (!task) + return NULL; + + ++*pos; + return task; +} + +static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_iter_seq_task_info *info = seq->private; + struct task_struct *task; + + ++*pos; + ++info->tid; + put_task_struct((struct task_struct *)v); + task = task_seq_get_next(info->common.ns, &info->tid); + if (!task) + return NULL; + + return task; +} + +struct bpf_iter__task { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct task_struct *, task); +}; + +DEFINE_BPF_ITER_FUNC(task, struct bpf_iter_meta *meta, struct task_struct *task) + +static int __task_seq_show(struct seq_file *seq, struct task_struct *task, + bool in_stop) +{ + struct bpf_iter_meta meta; + struct bpf_iter__task ctx; + struct bpf_prog *prog; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, in_stop); + if (!prog) + return 0; + + meta.seq = seq; + ctx.meta = &meta; + ctx.task = task; + return bpf_iter_run_prog(prog, &ctx); +} + +static int task_seq_show(struct seq_file *seq, void *v) +{ + return __task_seq_show(seq, v, false); +} + +static void task_seq_stop(struct seq_file *seq, void *v) +{ + if (!v) + (void)__task_seq_show(seq, v, true); + else + put_task_struct((struct task_struct *)v); +} + +static const struct seq_operations task_seq_ops = { + .start = task_seq_start, + .next = task_seq_next, + .stop = task_seq_stop, + .show = task_seq_show, +}; + +struct 
bpf_iter_seq_task_file_info { + /* The first field must be struct bpf_iter_seq_task_common. + * this is assumed by {init, fini}_seq_pidns() callback functions. + */ + struct bpf_iter_seq_task_common common; + struct task_struct *task; + struct files_struct *files; + u32 tid; + u32 fd; +}; + +static struct file * +task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info, + struct task_struct **task, struct files_struct **fstruct) +{ + struct pid_namespace *ns = info->common.ns; + u32 curr_tid = info->tid, max_fds; + struct files_struct *curr_files; + struct task_struct *curr_task; + int curr_fd = info->fd; + + /* If this function returns a non-NULL file object, + * it held a reference to the task/files_struct/file. + * Otherwise, it does not hold any reference. + */ +again: + if (*task) { + curr_task = *task; + curr_files = *fstruct; + curr_fd = info->fd; + } else { + curr_task = task_seq_get_next(ns, &curr_tid); + if (!curr_task) + return NULL; + + curr_files = get_files_struct(curr_task); + if (!curr_files) { + put_task_struct(curr_task); + curr_tid = ++(info->tid); + info->fd = 0; + goto again; + } + + /* set *fstruct, *task and info->tid */ + *fstruct = curr_files; + *task = curr_task; + if (curr_tid == info->tid) { + curr_fd = info->fd; + } else { + info->tid = curr_tid; + curr_fd = 0; + } + } + + rcu_read_lock(); + max_fds = files_fdtable(curr_files)->max_fds; + for (; curr_fd < max_fds; curr_fd++) { + struct file *f; + + f = fcheck_files(curr_files, curr_fd); + if (!f) + continue; + + /* set info->fd */ + info->fd = curr_fd; + get_file(f); + rcu_read_unlock(); + return f; + } + + /* the current task is done, go to the next task */ + rcu_read_unlock(); + put_files_struct(curr_files); + put_task_struct(curr_task); + *task = NULL; + *fstruct = NULL; + info->fd = 0; + curr_tid = ++(info->tid); + goto again; +} + +static void *task_file_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct bpf_iter_seq_task_file_info *info = seq->private; + struct 
files_struct *files = NULL; + struct task_struct *task = NULL; + struct file *file; + + file = task_file_seq_get_next(info, &task, &files); + if (!file) { + info->files = NULL; + info->task = NULL; + return NULL; + } + + ++*pos; + info->task = task; + info->files = files; + + return file; +} + +static void *task_file_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_iter_seq_task_file_info *info = seq->private; + struct files_struct *files = info->files; + struct task_struct *task = info->task; + struct file *file; + + ++*pos; + ++info->fd; + fput((struct file *)v); + file = task_file_seq_get_next(info, &task, &files); + if (!file) { + info->files = NULL; + info->task = NULL; + return NULL; + } + + info->task = task; + info->files = files; + + return file; +} + +struct bpf_iter__task_file { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct task_struct *, task); + u32 fd __aligned(8); + __bpf_md_ptr(struct file *, file); +}; + +DEFINE_BPF_ITER_FUNC(task_file, struct bpf_iter_meta *meta, + struct task_struct *task, u32 fd, + struct file *file) + +static int __task_file_seq_show(struct seq_file *seq, struct file *file, + bool in_stop) +{ + struct bpf_iter_seq_task_file_info *info = seq->private; + struct bpf_iter__task_file ctx; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, in_stop); + if (!prog) + return 0; + + ctx.meta = &meta; + ctx.task = info->task; + ctx.fd = info->fd; + ctx.file = file; + return bpf_iter_run_prog(prog, &ctx); +} + +static int task_file_seq_show(struct seq_file *seq, void *v) +{ + return __task_file_seq_show(seq, v, false); +} + +static void task_file_seq_stop(struct seq_file *seq, void *v) +{ + struct bpf_iter_seq_task_file_info *info = seq->private; + + if (!v) { + (void)__task_file_seq_show(seq, v, true); + } else { + fput((struct file *)v); + put_files_struct(info->files); + put_task_struct(info->task); + info->files = NULL; + info->task = 
NULL; + } +} + +static int init_seq_pidns(void *priv_data) +{ + struct bpf_iter_seq_task_common *common = priv_data; + + common->ns = get_pid_ns(task_active_pid_ns(current)); + return 0; +} + +static void fini_seq_pidns(void *priv_data) +{ + struct bpf_iter_seq_task_common *common = priv_data; + + put_pid_ns(common->ns); +} + +static const struct seq_operations task_file_seq_ops = { + .start = task_file_seq_start, + .next = task_file_seq_next, + .stop = task_file_seq_stop, + .show = task_file_seq_show, +}; + +static int __init task_iter_init(void) +{ + struct bpf_iter_reg task_file_reg_info = { + .target = "task_file", + .seq_ops = &task_file_seq_ops, + .init_seq_private = init_seq_pidns, + .fini_seq_private = fini_seq_pidns, + .seq_priv_size = sizeof(struct bpf_iter_seq_task_file_info), + }; + struct bpf_iter_reg task_reg_info = { + .target = "task", + .seq_ops = &task_seq_ops, + .init_seq_private = init_seq_pidns, + .fini_seq_private = fini_seq_pidns, + .seq_priv_size = sizeof(struct bpf_iter_seq_task_info), + }; + int ret; + + ret = bpf_iter_reg_target(&task_reg_info); + if (ret) + return ret; + + return bpf_iter_reg_target(&task_file_reg_info); +} +late_initcall(task_iter_init); -- cgit v1.2.3 From b121b341e5983bdccf7a5d6cf9236a45c965a31f Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:12 -0700 Subject: bpf: Add PTR_TO_BTF_ID_OR_NULL support Add bpf_reg_type PTR_TO_BTF_ID_OR_NULL support. For tracing/iter program, the bpf program context definition, e.g., for previous bpf_map target, looks like struct bpf_iter__bpf_map { struct bpf_iter_meta *meta; struct bpf_map *map; }; The kernel guarantees that meta is not NULL, but map pointer maybe NULL. The NULL map indicates that all objects have been traversed, so bpf program can take proper action, e.g., do final aggregation and/or send final report to user space. 
Add btf_id_or_null_non0_off to prog->aux structure, to indicate that if the context access offset is not 0, set to PTR_TO_BTF_ID_OR_NULL instead of PTR_TO_BTF_ID. This bit is set for tracing/iter program. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175912.2476576-1-yhs@fb.com --- kernel/bpf/btf.c | 5 ++++- kernel/bpf/verifier.c | 16 ++++++++++++---- 2 files changed, 16 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index a2cfba89a8e1..c490fbde22d4 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3790,7 +3790,10 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, return true; /* this is a pointer to another type */ - info->reg_type = PTR_TO_BTF_ID; + if (off != 0 && prog->aux->btf_id_or_null_non0_off) + info->reg_type = PTR_TO_BTF_ID_OR_NULL; + else + info->reg_type = PTR_TO_BTF_ID; if (tgt_prog) { ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type, arg); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d725ff7d11db..36b2a38a06fe 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -398,7 +398,8 @@ static bool reg_type_may_be_null(enum bpf_reg_type type) return type == PTR_TO_MAP_VALUE_OR_NULL || type == PTR_TO_SOCKET_OR_NULL || type == PTR_TO_SOCK_COMMON_OR_NULL || - type == PTR_TO_TCP_SOCK_OR_NULL; + type == PTR_TO_TCP_SOCK_OR_NULL || + type == PTR_TO_BTF_ID_OR_NULL; } static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) @@ -483,6 +484,7 @@ static const char * const reg_type_str[] = { [PTR_TO_TP_BUFFER] = "tp_buffer", [PTR_TO_XDP_SOCK] = "xdp_sock", [PTR_TO_BTF_ID] = "ptr_", + [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_", }; static char slot_type_char[] = { @@ -543,7 +545,7 @@ static void print_verifier_state(struct bpf_verifier_env *env, /* reg->off should be 0 for SCALAR_VALUE */ verbose(env, "%lld", reg->var_off.value + reg->off); } else { - if (t 
== PTR_TO_BTF_ID) + if (t == PTR_TO_BTF_ID || t == PTR_TO_BTF_ID_OR_NULL) verbose(env, "%s", kernel_type_name(reg->btf_id)); verbose(env, "(id=%d", reg->id); if (reg_type_may_be_refcounted_or_null(t)) @@ -2139,6 +2141,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_TCP_SOCK_OR_NULL: case PTR_TO_XDP_SOCK: case PTR_TO_BTF_ID: + case PTR_TO_BTF_ID_OR_NULL: return true; default: return false; @@ -2659,7 +2662,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, */ *reg_type = info.reg_type; - if (*reg_type == PTR_TO_BTF_ID) + if (*reg_type == PTR_TO_BTF_ID || *reg_type == PTR_TO_BTF_ID_OR_NULL) *btf_id = info.btf_id; else env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; @@ -3243,7 +3246,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * a sub-register. */ regs[value_regno].subreg_def = DEF_NOT_SUBREG; - if (reg_type == PTR_TO_BTF_ID) + if (reg_type == PTR_TO_BTF_ID || + reg_type == PTR_TO_BTF_ID_OR_NULL) regs[value_regno].btf_id = btf_id; } regs[value_regno].type = reg_type; @@ -6572,6 +6576,8 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, reg->type = PTR_TO_SOCK_COMMON; } else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) { reg->type = PTR_TO_TCP_SOCK; + } else if (reg->type == PTR_TO_BTF_ID_OR_NULL) { + reg->type = PTR_TO_BTF_ID; } if (is_null) { /* We don't need id and ref_obj_id from this point @@ -8429,6 +8435,7 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type) case PTR_TO_TCP_SOCK_OR_NULL: case PTR_TO_XDP_SOCK: case PTR_TO_BTF_ID: + case PTR_TO_BTF_ID_OR_NULL: return false; default: return true; @@ -10640,6 +10647,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) prog->aux->attach_func_proto = t; if (!bpf_iter_prog_supported(prog)) return -EINVAL; + prog->aux->btf_id_or_null_non0_off = true; ret = btf_distill_func_proto(&env->log, btf, t, tname, &fmodel); return ret; -- cgit v1.2.3 From 
492e639f0c222784e2e0f121966375f641c61b15 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:14 -0700 Subject: bpf: Add bpf_seq_printf and bpf_seq_write helpers Two helpers, bpf_seq_printf and bpf_seq_write, are added for writing data to the seq_file buffer. bpf_seq_printf supports common format string flag/width/type fields so at least I can get identical results for netlink and ipv6_route targets. For bpf_seq_printf and bpf_seq_write, return value -EOVERFLOW specifically indicates a write failure due to overflow, which means the object will be repeated in the next bpf invocation if the object collection stays the same. Note that if the object collection is changed, depending on how collection traversal is done, even if the object is still in the collection, it may not be visited. For bpf_seq_printf, format %s, %p{i,I}{4,6} needs to read kernel memory. Reading kernel memory may fail in the following two cases: - invalid kernel address, or - valid kernel address but requiring a major fault If reading kernel memory failed, the %s string will be an empty string and %p{i,I}{4,6} will be all 0. Not returning an error to the bpf program is consistent with what bpf_trace_printk() does for now. bpf_seq_printf may return -EBUSY meaning that the internal percpu buffer for memory copy of strings or other pointees is not available. The bpf program can return 1 to indicate it wants the same object to be repeated. Right now, this should not happen on non-RT kernels since migrate_disable(), which guards the bpf prog call, calls preempt_disable().
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175914.2476661-1-yhs@fb.com --- kernel/trace/bpf_trace.c | 214 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 214 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e875c95d3ced..d961428fb5b6 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -457,6 +457,212 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) return &bpf_trace_printk_proto; } +#define MAX_SEQ_PRINTF_VARARGS 12 +#define MAX_SEQ_PRINTF_MAX_MEMCPY 6 +#define MAX_SEQ_PRINTF_STR_LEN 128 + +struct bpf_seq_printf_buf { + char buf[MAX_SEQ_PRINTF_MAX_MEMCPY][MAX_SEQ_PRINTF_STR_LEN]; +}; +static DEFINE_PER_CPU(struct bpf_seq_printf_buf, bpf_seq_printf_buf); +static DEFINE_PER_CPU(int, bpf_seq_printf_buf_used); + +BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, + const void *, data, u32, data_len) +{ + int err = -EINVAL, fmt_cnt = 0, memcpy_cnt = 0; + int i, buf_used, copy_size, num_args; + u64 params[MAX_SEQ_PRINTF_VARARGS]; + struct bpf_seq_printf_buf *bufs; + const u64 *args = data; + + buf_used = this_cpu_inc_return(bpf_seq_printf_buf_used); + if (WARN_ON_ONCE(buf_used > 1)) { + err = -EBUSY; + goto out; + } + + bufs = this_cpu_ptr(&bpf_seq_printf_buf); + + /* + * bpf_check()->check_func_arg()->check_stack_boundary() + * guarantees that fmt points to bpf program stack, + * fmt_size bytes of it were initialized and fmt_size > 0 + */ + if (fmt[--fmt_size] != 0) + goto out; + + if (data_len & 7) + goto out; + + for (i = 0; i < fmt_size; i++) { + if (fmt[i] == '%') { + if (fmt[i + 1] == '%') + i++; + else if (!data || !data_len) + goto out; + } + } + + num_args = data_len / 8; + + /* check format string for allowed specifiers */ + for (i = 0; i < fmt_size; i++) { + /* only printable ascii for now. 
*/ + if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) { + err = -EINVAL; + goto out; + } + + if (fmt[i] != '%') + continue; + + if (fmt[i + 1] == '%') { + i++; + continue; + } + + if (fmt_cnt >= MAX_SEQ_PRINTF_VARARGS) { + err = -E2BIG; + goto out; + } + + if (fmt_cnt >= num_args) { + err = -EINVAL; + goto out; + } + + /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */ + i++; + + /* skip optional "[0 +-][num]" width formating field */ + while (fmt[i] == '0' || fmt[i] == '+' || fmt[i] == '-' || + fmt[i] == ' ') + i++; + if (fmt[i] >= '1' && fmt[i] <= '9') { + i++; + while (fmt[i] >= '0' && fmt[i] <= '9') + i++; + } + + if (fmt[i] == 's') { + /* try our best to copy */ + if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) { + err = -E2BIG; + goto out; + } + + err = strncpy_from_unsafe(bufs->buf[memcpy_cnt], + (void *) (long) args[fmt_cnt], + MAX_SEQ_PRINTF_STR_LEN); + if (err < 0) + bufs->buf[memcpy_cnt][0] = '\0'; + params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt]; + + fmt_cnt++; + memcpy_cnt++; + continue; + } + + if (fmt[i] == 'p') { + if (fmt[i + 1] == 0 || + fmt[i + 1] == 'K' || + fmt[i + 1] == 'x') { + /* just kernel pointers */ + params[fmt_cnt] = args[fmt_cnt]; + fmt_cnt++; + continue; + } + + /* only support "%pI4", "%pi4", "%pI6" and "%pi6". */ + if (fmt[i + 1] != 'i' && fmt[i + 1] != 'I') { + err = -EINVAL; + goto out; + } + if (fmt[i + 2] != '4' && fmt[i + 2] != '6') { + err = -EINVAL; + goto out; + } + + if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) { + err = -E2BIG; + goto out; + } + + + copy_size = (fmt[i + 2] == '4') ? 
4 : 16; + + err = probe_kernel_read(bufs->buf[memcpy_cnt], + (void *) (long) args[fmt_cnt], + copy_size); + if (err < 0) + memset(bufs->buf[memcpy_cnt], 0, copy_size); + params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt]; + + i += 2; + fmt_cnt++; + memcpy_cnt++; + continue; + } + + if (fmt[i] == 'l') { + i++; + if (fmt[i] == 'l') + i++; + } + + if (fmt[i] != 'i' && fmt[i] != 'd' && + fmt[i] != 'u' && fmt[i] != 'x') { + err = -EINVAL; + goto out; + } + + params[fmt_cnt] = args[fmt_cnt]; + fmt_cnt++; + } + + /* Maximumly we can have MAX_SEQ_PRINTF_VARARGS parameter, just give + * all of them to seq_printf(). + */ + seq_printf(m, fmt, params[0], params[1], params[2], params[3], + params[4], params[5], params[6], params[7], params[8], + params[9], params[10], params[11]); + + err = seq_has_overflowed(m) ? -EOVERFLOW : 0; +out: + this_cpu_dec(bpf_seq_printf_buf_used); + return err; +} + +static int bpf_seq_printf_btf_ids[5]; +static const struct bpf_func_proto bpf_seq_printf_proto = { + .func = bpf_seq_printf, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_PTR_TO_MEM_OR_NULL, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, + .btf_id = bpf_seq_printf_btf_ids, +}; + +BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len) +{ + return seq_write(m, data, len) ? 
-EOVERFLOW : 0; +} + +static int bpf_seq_write_btf_ids[5]; +static const struct bpf_func_proto bpf_seq_write_proto = { + .func = bpf_seq_write, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .btf_id = bpf_seq_write_btf_ids, +}; + static __always_inline int get_map_perf_counter(struct bpf_map *map, u64 flags, u64 *value, u64 *enabled, u64 *running) @@ -1226,6 +1432,14 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_xdp_output: return &bpf_xdp_output_proto; #endif + case BPF_FUNC_seq_printf: + return prog->expected_attach_type == BPF_TRACE_ITER ? + &bpf_seq_printf_proto : + NULL; + case BPF_FUNC_seq_write: + return prog->expected_attach_type == BPF_TRACE_ITER ? + &bpf_seq_write_proto : + NULL; default: return raw_tp_prog_func_proto(func_id, prog); } -- cgit v1.2.3 From 1d68f22b3d53d368d5cc8d09de890250cae5c945 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:15 -0700 Subject: bpf: Handle spilled PTR_TO_BTF_ID properly when checking stack_boundary This specifically to handle the case like below: // ptr below is a socket ptr identified by PTR_TO_BTF_ID u64 param[2] = { ptr, val }; bpf_seq_printf(seq, fmt, sizeof(fmt), param, sizeof(param)); In this case, the 16 bytes stack for "param" contains: 8 bytes for ptr with spilled PTR_TO_BTF_ID 8 bytes for val as STACK_MISC The current verifier will complain the ptr should not be visible to the helper. ... 
16: (7b) *(u64 *)(r10 -64) = r2 18: (7b) *(u64 *)(r10 -56) = r1 19: (bf) r4 = r10 ; 20: (07) r4 += -64 ; BPF_SEQ_PRINTF(seq, fmt1, (long)s, s->sk_protocol); 21: (bf) r1 = r6 22: (18) r2 = 0xffffa8d00018605a 24: (b4) w3 = 10 25: (b4) w5 = 16 26: (85) call bpf_seq_printf#125 R0=inv(id=0) R1_w=ptr_seq_file(id=0,off=0,imm=0) R2_w=map_value(id=0,off=90,ks=4,vs=144,imm=0) R3_w=inv10 R4_w=fp-64 R5_w=inv16 R6=ptr_seq_file(id=0,off=0,imm=0) R7=ptr_netlink_sock(id=0,off=0,imm=0) R10=fp0 fp-56_w=mmmmmmmm fp-64_w=ptr_ last_idx 26 first_idx 13 regs=8 stack=0 before 25: (b4) w5 = 16 regs=8 stack=0 before 24: (b4) w3 = 10 invalid indirect read from stack off -64+0 size 16 Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175915.2476783-1-yhs@fb.com --- kernel/bpf/verifier.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 36b2a38a06fe..2a1826c76bb6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3494,6 +3494,11 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, *stype = STACK_MISC; goto mark; } + + if (state->stack[spi].slot_type[0] == STACK_SPILL && + state->stack[spi].spilled_ptr.type == PTR_TO_BTF_ID) + goto mark; + if (state->stack[spi].slot_type[0] == STACK_SPILL && state->stack[spi].spilled_ptr.type == SCALAR_VALUE) { __mark_reg_unknown(env, &state->stack[spi].spilled_ptr); -- cgit v1.2.3 From 9c5f8a1008a121e4c6b24af211034e24b0b63081 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 9 May 2020 10:59:16 -0700 Subject: bpf: Support variable length array in tracing programs In /proc/net/ipv6_route, we have struct fib6_info { struct fib6_table *fib6_table; ... struct fib6_nh fib6_nh[0]; } struct fib6_nh { struct fib_nh_common nh_common; struct rt6_info **rt6i_pcpu; struct rt6_exception_bucket *rt6i_exception_bucket; }; struct fib_nh_common { ... u8 nhc_gw_family; ... 
} The access: struct fib6_nh *fib6_nh = &rt->fib6_nh; ... fib6_nh->nh_common.nhc_gw_family ... This patch ensures such an access is handled properly. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200509175916.2476853-1-yhs@fb.com --- kernel/bpf/btf.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index c490fbde22d4..dcd233139294 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3833,6 +3833,7 @@ int btf_struct_access(struct bpf_verifier_log *log, const struct btf_type *mtype, *elem_type = NULL; const struct btf_member *member; const char *tname, *mname; + u32 vlen; again: tname = __btf_name_by_offset(btf_vmlinux, t->name_off); @@ -3841,7 +3842,43 @@ again: return -EINVAL; } + vlen = btf_type_vlen(t); if (off + size > t->size) { + /* If the last element is a variable size array, we may + * need to relax the rule. + */ + struct btf_array *array_elem; + + if (vlen == 0) + goto error; + + member = btf_type_member(t) + vlen - 1; + mtype = btf_type_skip_modifiers(btf_vmlinux, member->type, + NULL); + if (!btf_type_is_array(mtype)) + goto error; + + array_elem = (struct btf_array *)(mtype + 1); + if (array_elem->nelems != 0) + goto error; + + moff = btf_member_bit_offset(t, member) / 8; + if (off < moff) + goto error; + + /* Only allow structure for now, can be relaxed for + * other types later. + */ + elem_type = btf_type_skip_modifiers(btf_vmlinux, + array_elem->type, NULL); + if (!btf_type_is_struct(elem_type)) + goto error; + + off = (off - moff) % elem_type->size; + return btf_struct_access(log, elem_type, off, size, atype, + next_btf_id); + +error: bpf_log(log, "access beyond struct %s at off %u size %u\n", tname, off, size); return -EACCES; -- cgit v1.2.3 From 385bbf7b119a4feb6d6bcf3586f1bb1dd9c5b0a0 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. 
Silva" Date: Thu, 7 May 2020 13:50:57 -0500 Subject: bpf, libbpf: Replace zero-length array with flexible-array The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1]. There are some instances of code in which the sizeof operator is being incorrectly/erroneously applied to zero-length arrays and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Signed-off-by: Gustavo A. R. 
Silva Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200507185057.GA13981@embeddedor --- kernel/bpf/queue_stack_maps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index f697647ceb54..30e1373fd437 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -19,7 +19,7 @@ struct bpf_queue_stack { u32 head, tail; u32 size; /* max_entries + 1 */ - char elements[0] __aligned(8); + char elements[] __aligned(8); }; static struct bpf_queue_stack *bpf_queue_stack(struct bpf_map *map) -- cgit v1.2.3 From 2e3ed68bfcd9c5ca2cf8b88ba23a34992ccd0b1f Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 13 May 2020 11:02:18 -0700 Subject: bpf: Add comments to interpret bpf_prog return values Add a short comment in bpf_iter_run_prog() function to explain how bpf_prog return value is converted to seq_ops->show() return value: bpf_prog return seq_ops()->show() return 0 0 1 -EAGAIN When show() return value is -EAGAIN, the current bpf_seq_read() will end. If the current seq_file buffer is empty, -EAGAIN will return to user space. Otherwise, the buffer will be copied to user space. In both cases, the next bpf_seq_read() call will try to show the same object which returned -EAGAIN previously. 
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200513180218.2949517-1-yhs@fb.com --- kernel/bpf/bpf_iter.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 30efd15cd4a0..0a45a6cdfabd 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -526,5 +526,11 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx) migrate_enable(); rcu_read_unlock(); + /* bpf program can only return 0 or 1: + * 0 : okay + * 1 : retry the same object + * The bpf_iter_run_prog() return value + * will be seq_ops->show() return value. + */ return ret == 0 ? 0 : -EAGAIN; } -- cgit v1.2.3 From 15172a46fa2796c1a1358a36babd31274716ed41 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 13 May 2020 11:02:19 -0700 Subject: bpf: net: Refactor bpf_iter target registration Currently bpf_iter_reg_target takes parameters from target and allocates memory to save them. This is really not necessary, esp. in the future we may grow information passed from targets to bpf_iter manager. The patch refactors the code so target reg_info becomes static and bpf_iter manager can just take a reference to it. 
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200513180219.2949605-1-yhs@fb.com --- kernel/bpf/bpf_iter.c | 36 +++++++++++++++++------------------- kernel/bpf/map_iter.c | 18 +++++++++--------- kernel/bpf/task_iter.c | 30 ++++++++++++++++-------------- 3 files changed, 42 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 0a45a6cdfabd..051fb8cab62a 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -8,11 +8,7 @@ struct bpf_iter_target_info { struct list_head list; - const char *target; - const struct seq_operations *seq_ops; - bpf_iter_init_seq_priv_t init_seq_private; - bpf_iter_fini_seq_priv_t fini_seq_private; - u32 seq_priv_size; + const struct bpf_iter_reg *reg_info; u32 btf_id; /* cached value */ }; @@ -222,8 +218,8 @@ static int iter_release(struct inode *inode, struct file *file) iter_priv = container_of(seq->private, struct bpf_iter_priv_data, target_private); - if (iter_priv->tinfo->fini_seq_private) - iter_priv->tinfo->fini_seq_private(seq->private); + if (iter_priv->tinfo->reg_info->fini_seq_private) + iter_priv->tinfo->reg_info->fini_seq_private(seq->private); bpf_prog_put(iter_priv->prog); seq->private = iter_priv; @@ -238,7 +234,12 @@ const struct file_operations bpf_iter_fops = { .release = iter_release, }; -int bpf_iter_reg_target(struct bpf_iter_reg *reg_info) +/* The argument reg_info will be cached in bpf_iter_target_info. + * The common practice is to declare target reg_info as + * a const static variable and passed as an argument to + * bpf_iter_reg_target(). 
+ */ +int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info) { struct bpf_iter_target_info *tinfo; @@ -246,11 +247,7 @@ int bpf_iter_reg_target(struct bpf_iter_reg *reg_info) if (!tinfo) return -ENOMEM; - tinfo->target = reg_info->target; - tinfo->seq_ops = reg_info->seq_ops; - tinfo->init_seq_private = reg_info->init_seq_private; - tinfo->fini_seq_private = reg_info->fini_seq_private; - tinfo->seq_priv_size = reg_info->seq_priv_size; + tinfo->reg_info = reg_info; INIT_LIST_HEAD(&tinfo->list); mutex_lock(&targets_mutex); @@ -267,7 +264,7 @@ void bpf_iter_unreg_target(const char *target) mutex_lock(&targets_mutex); list_for_each_entry(tinfo, &targets, list) { - if (!strcmp(target, tinfo->target)) { + if (!strcmp(target, tinfo->reg_info->target)) { list_del(&tinfo->list); kfree(tinfo); found = true; @@ -303,7 +300,7 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog) supported = true; break; } - if (!strcmp(attach_fname + prefix_len, tinfo->target)) { + if (!strcmp(attach_fname + prefix_len, tinfo->reg_info->target)) { cache_btf_id(tinfo, prog); supported = true; break; @@ -431,15 +428,16 @@ static int prepare_seq_file(struct file *file, struct bpf_iter_link *link) tinfo = link->tinfo; total_priv_dsize = offsetof(struct bpf_iter_priv_data, target_private) + - tinfo->seq_priv_size; - priv_data = __seq_open_private(file, tinfo->seq_ops, total_priv_dsize); + tinfo->reg_info->seq_priv_size; + priv_data = __seq_open_private(file, tinfo->reg_info->seq_ops, + total_priv_dsize); if (!priv_data) { err = -ENOMEM; goto release_prog; } - if (tinfo->init_seq_private) { - err = tinfo->init_seq_private(priv_data->target_private); + if (tinfo->reg_info->init_seq_private) { + err = tinfo->reg_info->init_seq_private(priv_data->target_private); if (err) goto release_seq_file; } diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 8162e0c00b9f..c6216a5fe56e 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -81,17 +81,17 @@ static const struct 
seq_operations bpf_map_seq_ops = { .show = bpf_map_seq_show, }; +static const struct bpf_iter_reg bpf_map_reg_info = { + .target = "bpf_map", + .seq_ops = &bpf_map_seq_ops, + .init_seq_private = NULL, + .fini_seq_private = NULL, + .seq_priv_size = sizeof(struct bpf_iter_seq_map_info), +}; + static int __init bpf_map_iter_init(void) { - struct bpf_iter_reg reg_info = { - .target = "bpf_map", - .seq_ops = &bpf_map_seq_ops, - .init_seq_private = NULL, - .fini_seq_private = NULL, - .seq_priv_size = sizeof(struct bpf_iter_seq_map_info), - }; - - return bpf_iter_reg_target(&reg_info); + return bpf_iter_reg_target(&bpf_map_reg_info); } late_initcall(bpf_map_iter_init); diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index aeed662d8451..bd7bfd83d9e0 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -306,22 +306,24 @@ static const struct seq_operations task_file_seq_ops = { .show = task_file_seq_show, }; +static const struct bpf_iter_reg task_reg_info = { + .target = "task", + .seq_ops = &task_seq_ops, + .init_seq_private = init_seq_pidns, + .fini_seq_private = fini_seq_pidns, + .seq_priv_size = sizeof(struct bpf_iter_seq_task_info), +}; + +static const struct bpf_iter_reg task_file_reg_info = { + .target = "task_file", + .seq_ops = &task_file_seq_ops, + .init_seq_private = init_seq_pidns, + .fini_seq_private = fini_seq_pidns, + .seq_priv_size = sizeof(struct bpf_iter_seq_task_file_info), +}; + static int __init task_iter_init(void) { - struct bpf_iter_reg task_file_reg_info = { - .target = "task_file", - .seq_ops = &task_file_seq_ops, - .init_seq_private = init_seq_pidns, - .fini_seq_private = fini_seq_pidns, - .seq_priv_size = sizeof(struct bpf_iter_seq_task_file_info), - }; - struct bpf_iter_reg task_reg_info = { - .target = "task", - .seq_ops = &task_seq_ops, - .init_seq_private = init_seq_pidns, - .fini_seq_private = fini_seq_pidns, - .seq_priv_size = sizeof(struct bpf_iter_seq_task_info), - }; int ret; ret = 
bpf_iter_reg_target(&task_reg_info); -- cgit v1.2.3 From ab2ee4fcb9d61fd57db70db694adbcf54662bd80 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 13 May 2020 11:02:20 -0700 Subject: bpf: Change func bpf_iter_unreg_target() signature Change func bpf_iter_unreg_target() parameter from target name to target reg_info, similar to bpf_iter_reg_target(). Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200513180220.2949737-1-yhs@fb.com --- kernel/bpf/bpf_iter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 051fb8cab62a..644f8626b2c0 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -257,14 +257,14 @@ int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info) return 0; } -void bpf_iter_unreg_target(const char *target) +void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info) { struct bpf_iter_target_info *tinfo; bool found = false; mutex_lock(&targets_mutex); list_for_each_entry(tinfo, &targets, list) { - if (!strcmp(target, tinfo->reg_info->target)) { + if (reg_info == tinfo->reg_info) { list_del(&tinfo->list); kfree(tinfo); found = true; -- cgit v1.2.3 From 3c32cc1bceba8a1755dc35cd97516f6c67856844 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 13 May 2020 11:02:21 -0700 Subject: bpf: Enable bpf_iter targets registering ctx argument types Commit b121b341e598 ("bpf: Add PTR_TO_BTF_ID_OR_NULL support") adds a field btf_id_or_null_non0_off to bpf_prog->aux structure to indicate that the first ctx argument is PTR_TO_BTF_ID reg_type and all others are PTR_TO_BTF_ID_OR_NULL. This approach does not really scale if we have other different reg types in the future, e.g., a pointer to a buffer. This patch enables bpf_iter targets registering ctx argument reg types which may be different from the default one. 
For example, for pointers to structures, the default reg_type is PTR_TO_BTF_ID for tracing program. The target can register a particular pointer type as PTR_TO_BTF_ID_OR_NULL which can be used by the verifier to enforce accesses. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200513180221.2949882-1-yhs@fb.com --- kernel/bpf/bpf_iter.c | 5 +++++ kernel/bpf/btf.c | 15 ++++++++++----- kernel/bpf/map_iter.c | 5 +++++ kernel/bpf/task_iter.c | 12 ++++++++++++ kernel/bpf/verifier.c | 1 - 5 files changed, 32 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 644f8626b2c0..dd612b80b9fe 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -308,6 +308,11 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog) } mutex_unlock(&targets_mutex); + if (supported) { + prog->aux->ctx_arg_info_size = tinfo->reg_info->ctx_arg_info_size; + prog->aux->ctx_arg_info = tinfo->reg_info->ctx_arg_info; + } + return supported; } diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index dcd233139294..58c9af1d4808 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3694,7 +3694,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, struct bpf_verifier_log *log = info->log; const struct btf_param *args; u32 nr_args, arg; - int ret; + int i, ret; if (off % 8) { bpf_log(log, "func '%s' offset %d is not multiple of 8\n", @@ -3790,10 +3790,15 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, return true; /* this is a pointer to another type */ - if (off != 0 && prog->aux->btf_id_or_null_non0_off) - info->reg_type = PTR_TO_BTF_ID_OR_NULL; - else - info->reg_type = PTR_TO_BTF_ID; + info->reg_type = PTR_TO_BTF_ID; + for (i = 0; i < prog->aux->ctx_arg_info_size; i++) { + const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i]; + + if (ctx_arg_info->offset == off) { + info->reg_type = 
ctx_arg_info->reg_type; + break; + } + } if (tgt_prog) { ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type, arg); diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index c6216a5fe56e..c69071e334bf 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -87,6 +87,11 @@ static const struct bpf_iter_reg bpf_map_reg_info = { .init_seq_private = NULL, .fini_seq_private = NULL, .seq_priv_size = sizeof(struct bpf_iter_seq_map_info), + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__bpf_map, map), + PTR_TO_BTF_ID_OR_NULL }, + }, }; static int __init bpf_map_iter_init(void) diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index bd7bfd83d9e0..a9b7264dda08 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -312,6 +312,11 @@ static const struct bpf_iter_reg task_reg_info = { .init_seq_private = init_seq_pidns, .fini_seq_private = fini_seq_pidns, .seq_priv_size = sizeof(struct bpf_iter_seq_task_info), + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__task, task), + PTR_TO_BTF_ID_OR_NULL }, + }, }; static const struct bpf_iter_reg task_file_reg_info = { @@ -320,6 +325,13 @@ static const struct bpf_iter_reg task_file_reg_info = { .init_seq_private = init_seq_pidns, .fini_seq_private = fini_seq_pidns, .seq_priv_size = sizeof(struct bpf_iter_seq_task_file_info), + .ctx_arg_info_size = 2, + .ctx_arg_info = { + { offsetof(struct bpf_iter__task_file, task), + PTR_TO_BTF_ID_OR_NULL }, + { offsetof(struct bpf_iter__task_file, file), + PTR_TO_BTF_ID_OR_NULL }, + }, }; static int __init task_iter_init(void) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2a1826c76bb6..a3f2af756fd6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10652,7 +10652,6 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) prog->aux->attach_func_proto = t; if (!bpf_iter_prog_supported(prog)) return -EINVAL; - prog->aux->btf_id_or_null_non0_off = true; ret = 
btf_distill_func_proto(&env->log, btf, t, tname, &fmodel); return ret; -- cgit v1.2.3 From c70f34a8ac66c2cb05593ef5760142e5f862a9b4 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 13 May 2020 22:51:37 -0700 Subject: bpf: Fix bpf_iter's task iterator logic task_seq_get_next might stop prematurely if get_pid_task() fails to get task_struct. Failure to do so doesn't mean that there are no more tasks with higher pids. Procfs's iteration algorithm (see next_tgid in fs/proc/base.c) does a retry in such case. After this fix, instead of stopping prematurely after about 300 tasks on my server, bpf_iter program now returns >4000, which sounds much closer to reality. Fixes: eaaacd23910f ("bpf: Add task and task/file iterator targets") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200514055137.1564581-1-andriin@fb.com --- kernel/bpf/task_iter.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index a9b7264dda08..4dbf2b6035f8 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -27,9 +27,15 @@ static struct task_struct *task_seq_get_next(struct pid_namespace *ns, struct pid *pid; rcu_read_lock(); +retry: pid = idr_get_next(&ns->idr, tid); - if (pid) + if (pid) { task = get_pid_task(pid, PIDTYPE_PID); + if (!task) { + ++*tid; + goto retry; + } + } rcu_read_unlock(); return task; -- cgit v1.2.3