From f99bf205dab026ef434520198af2fcb7dae0efdb Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 19 Nov 2015 11:56:22 +0100 Subject: bpf: add show_fdinfo handler for maps Add a handler for show_fdinfo() to be used by the anon-inodes backend for eBPF maps, and dump the map specification there. Not only useful for admins, but also it provides a minimal way to compare specs from ELF vs pinned object. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0d3313d02a7e..6d1407bc1531 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -93,6 +93,23 @@ void bpf_map_put(struct bpf_map *map) } } +#ifdef CONFIG_PROC_FS +static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) +{ + const struct bpf_map *map = filp->private_data; + + seq_printf(m, + "map_type:\t%u\n" + "key_size:\t%u\n" + "value_size:\t%u\n" + "max_entries:\t%u\n", + map->map_type, + map->key_size, + map->value_size, + map->max_entries); +} +#endif + static int bpf_map_release(struct inode *inode, struct file *filp) { struct bpf_map *map = filp->private_data; @@ -108,7 +125,10 @@ static int bpf_map_release(struct inode *inode, struct file *filp) } static const struct file_operations bpf_map_fops = { - .release = bpf_map_release, +#ifdef CONFIG_PROC_FS + .show_fdinfo = bpf_map_show_fdinfo, +#endif + .release = bpf_map_release, }; int bpf_map_new_fd(struct bpf_map *map) -- cgit v1.2.3 From b11cfb5807e30333b36c02701382b820b7dcf0d5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Nov 2015 15:55:52 -0500 Subject: cgroup: record ancestor IDs and reimplement cgroup_is_descendant() using it cgroup_is_descendant() currently walks up the hierarchy and compares each ancestor to the cgroup in question. While enough for cgroup core usages, this can't be used in hot paths to test cgroup membership. This patch adds cgroup->ancestor_ids[] which records the IDs of all ancestors including self and cgroup->level for the nesting level. This allows testing whether a given cgroup is a descendant of another in three finite steps - testing whether the two belong to the same hierarchy, whether the descendant candidate is at the same or a higher level than the ancestor and comparing the recorded ancestor_id at the matching level. cgroup_is_descendant() is accordingly reimplmented and made inline. Signed-off-by: Tejun Heo --- kernel/cgroup.c | 32 ++++++++++---------------------- 1 file changed, 10 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f1603c153890..3190040792c8 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -459,25 +459,6 @@ struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) } EXPORT_SYMBOL_GPL(of_css); -/** - * cgroup_is_descendant - test ancestry - * @cgrp: the cgroup to be tested - * @ancestor: possible ancestor of @cgrp - * - * Test whether @cgrp is a descendant of @ancestor. It also returns %true - * if @cgrp == @ancestor. This function is safe to call as long as @cgrp - * and @ancestor are accessible. - */ -bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) -{ - while (cgrp) { - if (cgrp == ancestor) - return true; - cgrp = cgroup_parent(cgrp); - } - return false; -} - static int notify_on_release(const struct cgroup *cgrp) { return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -1903,6 +1884,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) if (ret < 0) goto out; root_cgrp->id = ret; + root_cgrp->ancestor_ids[0] = ret; ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0, GFP_KERNEL); @@ -4846,11 +4828,11 @@ err_free_css: static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) { - struct cgroup *parent, *cgrp; + struct cgroup *parent, *cgrp, *tcgrp; struct cgroup_root *root; struct cgroup_subsys *ss; struct kernfs_node *kn; - int ssid, ret; + int level, ssid, ret; /* Do not accept '\n' to prevent making /proc//cgroup unparsable. */ @@ -4861,9 +4843,11 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, if (!parent) return -ENODEV; root = parent->root; + level = parent->level + 1; /* allocate the cgroup and its ID, 0 is reserved for the root */ - cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); + cgrp = kzalloc(sizeof(*cgrp) + + sizeof(cgrp->ancestor_ids[0]) * (level + 1), GFP_KERNEL); if (!cgrp) { ret = -ENOMEM; goto out_unlock; @@ -4887,6 +4871,10 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, cgrp->self.parent = &parent->self; cgrp->root = root; + cgrp->level = level; + + for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) + cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); -- cgit v1.2.3 From 16af439645455fbf36984ca5e72f31073ee19ab7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Nov 2015 15:55:52 -0500 Subject: cgroup: implement cgroup_get_from_path() and expose cgroup_put() Implement cgroup_get_from_path() using kernfs_walk_and_get() which obtains a default hierarchy cgroup from its path. This will be used to allow cgroup path based matching from outside cgroup proper - e.g. networking and perf. v2: Add EXPORT_SYMBOL_GPL(cgroup_get_from_path). Signed-off-by: Tejun Heo --- kernel/cgroup.c | 39 ++++++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3190040792c8..3db5e8f5b702 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -434,11 +434,6 @@ static bool cgroup_tryget(struct cgroup *cgrp) return css_tryget(&cgrp->self); } -static void cgroup_put(struct cgroup *cgrp) -{ - css_put(&cgrp->self); -} - struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) { struct cgroup *cgrp = of->kn->parent->priv; @@ -5753,6 +5748,40 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) return id > 0 ? idr_find(&ss->css_idr, id) : NULL; } +/** + * cgroup_get_from_path - lookup and get a cgroup from its default hierarchy path + * @path: path on the default hierarchy + * + * Find the cgroup at @path on the default hierarchy, increment its + * reference count and return it. Returns pointer to the found cgroup on + * success, ERR_PTR(-ENOENT) if @path doens't exist and ERR_PTR(-ENOTDIR) + * if @path points to a non-directory. + */ +struct cgroup *cgroup_get_from_path(const char *path) +{ + struct kernfs_node *kn; + struct cgroup *cgrp; + + mutex_lock(&cgroup_mutex); + + kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path); + if (kn) { + if (kernfs_type(kn) == KERNFS_DIR) { + cgrp = kn->priv; + cgroup_get(cgrp); + } else { + cgrp = ERR_PTR(-ENOTDIR); + } + kernfs_put(kn); + } else { + cgrp = ERR_PTR(-ENOENT); + } + + mutex_unlock(&cgroup_mutex); + return cgrp; +} +EXPORT_SYMBOL_GPL(cgroup_get_from_path); + #ifdef CONFIG_CGROUP_DEBUG static struct cgroup_subsys_state * debug_css_alloc(struct cgroup_subsys_state *parent_css) -- cgit v1.2.3 From bd1060a1d67128bb8fbe2e1384c518912cbe54e7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Dec 2015 17:38:53 -0500 Subject: sock, cgroup: add sock->sk_cgroup In cgroup v1, dealing with cgroup membership was difficult because the number of membership associations was unbound. As a result, cgroup v1 grew several controllers whose primary purpose is either tagging membership or pull in configuration knobs from other subsystems so that cgroup membership test can be avoided. net_cls and net_prio controllers are examples of the latter. They allow configuring network-specific attributes from cgroup side so that network subsystem can avoid testing cgroup membership; unfortunately, these are not only cumbersome but also problematic. Both net_cls and net_prio aren't properly hierarchical. Both inherit configuration from the parent on creation but there's no interaction afterwards. An ancestor doesn't restrict the behavior in its subtree in anyway and configuration changes aren't propagated downwards. Especially when combined with cgroup delegation, this is problematic because delegatees can mess up whatever network configuration implemented at the system level. net_prio would allow the delegatees to set whatever priority value regardless of CAP_NET_ADMIN and net_cls the same for classid. While it is possible to solve these issues from controller side by implementing hierarchical allowable ranges in both controllers, it would involve quite a bit of complexity in the controllers and further obfuscate network configuration as it becomes even more difficult to tell what's actually being configured looking from the network side. While not much can be done for v1 at this point, as membership handling is sane on cgroup v2, it'd be better to make cgroup matching behave like other network matches and classifiers than introducing further complications. In preparation, this patch updates sock->sk_cgrp_data handling so that it points to the v2 cgroup that sock was created in until either net_prio or net_cls is used. Once either of the two is used, sock->sk_cgrp_data reverts to its previous role of carrying prioidx and classid. This is to avoid adding yet another cgroup related field to struct sock. As the mode switching can happen at most once per boot, the switching mechanism is aimed at lowering hot path overhead. It may leak a finite, likely small, number of cgroup refs and report spurious prioidx or classid on switching; however, dynamic updates of prioidx and classid have always been racy and lossy - socks between creation and fd installation are never updated, config changes don't update existing sockets at all, and prioidx may index with dead and recycled cgroup IDs. Non-critical inaccuracies from small race windows won't make any noticeable difference. This patch doesn't make use of the pointer yet. The following patch will implement netfilter match for cgroup2 membership. v2: Use sock_cgroup_data to avoid inflating struct sock w/ another cgroup specific field. v3: Add comments explaining why sock_data_prioidx() and sock_data_classid() use different fallback values. Signed-off-by: Tejun Heo Cc: Daniel Borkmann Cc: Daniel Wagner CC: Neil Horman Signed-off-by: David S. Miller --- kernel/cgroup.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3db5e8f5b702..4f8f7927b422 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -57,8 +57,8 @@ #include /* TODO: replace with more sophisticated array */ #include #include - #include +#include /* * pidlists linger the following amount before being destroyed. The goal @@ -5782,6 +5782,59 @@ struct cgroup *cgroup_get_from_path(const char *path) } EXPORT_SYMBOL_GPL(cgroup_get_from_path); +/* + * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data + * definition in cgroup-defs.h. + */ +#ifdef CONFIG_SOCK_CGROUP_DATA + +#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID) + +spinlock_t cgroup_sk_update_lock; +static bool cgroup_sk_alloc_disabled __read_mostly; + +void cgroup_sk_alloc_disable(void) +{ + if (cgroup_sk_alloc_disabled) + return; + pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n"); + cgroup_sk_alloc_disabled = true; +} + +#else + +#define cgroup_sk_alloc_disabled false + +#endif + +void cgroup_sk_alloc(struct sock_cgroup_data *skcd) +{ + if (cgroup_sk_alloc_disabled) + return; + + rcu_read_lock(); + + while (true) { + struct css_set *cset; + + cset = task_css_set(current); + if (likely(cgroup_tryget(cset->dfl_cgrp))) { + skcd->val = (unsigned long)cset->dfl_cgrp; + break; + } + cpu_relax(); + } + + rcu_read_unlock(); +} + +void cgroup_sk_free(struct sock_cgroup_data *skcd) +{ + cgroup_put(sock_cgroup_ptr(skcd)); +} + +#endif /* CONFIG_SOCK_CGROUP_DATA */ + #ifdef CONFIG_CGROUP_DEBUG static struct cgroup_subsys_state * debug_css_alloc(struct cgroup_subsys_state *parent_css) -- cgit v1.2.3 From bb35a6ef7da492e7df1fe8772716ff88c172b4cc Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 10 Dec 2015 22:33:49 +0100 Subject: bpf, inode: allow for rename and link ops Add support for renaming and hard links to the fs. Most of this can be implemented by using simple library operations under the same constraints that we don't use a reserved name like elsewhere. Linking can be useful to share/manage things like maps across subsystem users. It works within the file system boundary, but is not allowed for directories. Symbolic links are explicitly not implemented here, as it can be better done already by doing bind mounts inside bpf fs to set up shared directories f.e. useful when using volumes in docker containers that map a private working directory into /sys/fs/bpf/ which contains itself a bind mounted path from the host's /sys/fs/bpf/ mount that is shared among multiple containers. For single maps instead of whole directory, hard links can be easily used to do the same. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/inode.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 5a8a797d50b7..f2ece3c174a5 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -187,11 +187,31 @@ static int bpf_mkobj(struct inode *dir, struct dentry *dentry, umode_t mode, } } +static int bpf_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *new_dentry) +{ + if (bpf_dname_reserved(new_dentry)) + return -EPERM; + + return simple_link(old_dentry, dir, new_dentry); +} + +static int bpf_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + if (bpf_dname_reserved(new_dentry)) + return -EPERM; + + return simple_rename(old_dir, old_dentry, new_dir, new_dentry); +} + static const struct inode_operations bpf_dir_iops = { .lookup = simple_lookup, .mknod = bpf_mkobj, .mkdir = bpf_mkdir, .rmdir = simple_rmdir, + .rename = bpf_rename, + .link = bpf_link, .unlink = simple_unlink, }; -- cgit v1.2.3 From 3fa4cc9c2df37b393b968dcc3bb2ab1e2ff7ea7f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 14 Dec 2015 11:24:06 -0500 Subject: net, cgroup: cgroup_sk_updat_lock was missing initializer bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup") added global spinlock cgroup_sk_update_lock but erroneously skipped initializer leading to uninitialized spinlock warning. Fix it by using DEFINE_SPINLOCK(). Signed-off-by: Tejun Heo Reported-by: Dexuan Cui Fixes: bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup") Signed-off-by: David S. Miller --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4f8f7927b422..4466273f59e1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5790,7 +5790,7 @@ EXPORT_SYMBOL_GPL(cgroup_get_from_path); #if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID) -spinlock_t cgroup_sk_update_lock; +DEFINE_SPINLOCK(cgroup_sk_update_lock); static bool cgroup_sk_alloc_disabled __read_mostly; void cgroup_sk_alloc_disable(void) -- cgit v1.2.3 From 8b614aebecdf2b1f72d51b1527f5a75d218b78e2 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 17 Dec 2015 23:51:54 +0100 Subject: bpf: move clearing of A/X into classic to eBPF migration prologue Back in the days where eBPF (or back then "internal BPF" ;->) was not exposed to user space, and only the classic BPF programs internally translated into eBPF programs, we missed the fact that for classic BPF A and X needed to be cleared. It was fixed back then via 83d5b7ef99c9 ("net: filter: initialize A and X registers"), and thus classic BPF specifics were added to the eBPF interpreter core to work around it. This added some confusion for JIT developers later on that take the eBPF interpreter code as an example for deriving their JIT. F.e. in f75298f5c3fe ("s390/bpf: clear correct BPF accumulator register"), at least X could leak stack memory. Furthermore, since this is only needed for classic BPF translations and not for eBPF (verifier takes care that read access to regs cannot be done uninitialized), more complexity is added to JITs as they need to determine whether they deal with migrations or native eBPF where they can just omit clearing A/X in their prologue and thus reduce image size a bit, see f.e. cde66c2d88da ("s390/bpf: Only clear A and X for converted BPF programs"). In other cases (x86, arm64), A and X is being cleared in the prologue also for eBPF case, which is unnecessary. Lets move this into the BPF migration in bpf_convert_filter() where it actually belongs as long as the number of eBPF JITs are still few. It can thus be done generically; allowing us to remove the quirk from __bpf_prog_run() and to slightly reduce JIT image size in case of eBPF, while reducing code duplication on this matter in current(/future) eBPF JITs. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Reviewed-by: Michael Holzheu Tested-by: Michael Holzheu Cc: Zi Shen Lim Cc: Yang Shi Acked-by: Yang Shi Acked-by: Zi Shen Lim Signed-off-by: David S. Miller --- kernel/bpf/core.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 334b1bdd572c..972d9a8e4ac4 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -306,10 +306,6 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; ARG1 = (u64) (unsigned long) ctx; - /* Registers used in classic BPF programs need to be reset first. */ - regs[BPF_REG_A] = 0; - regs[BPF_REG_X] = 0; - select_insn: goto *jumptable[insn->code]; -- cgit v1.2.3 From 6591f1e6662dd595effb52a54e42a6d2d2b03e51 Mon Sep 17 00:00:00 2001 From: "tom.leiming@gmail.com" Date: Tue, 29 Dec 2015 22:40:25 +0800 Subject: bpf: hash: use atomic count Preparing for removing global per-hashtable lock, so the counter need to be defined as aotmic_t first. Acked-by: Daniel Borkmann Signed-off-by: Ming Lei Signed-off-by: David S. Miller --- kernel/bpf/hashtab.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 34777b3746fa..2615388009a4 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -18,7 +18,7 @@ struct bpf_htab { struct bpf_map map; struct hlist_head *buckets; raw_spinlock_t lock; - u32 count; /* number of elements in this hashtable */ + atomic_t count; /* number of elements in this hashtable */ u32 n_buckets; /* number of hash buckets */ u32 elem_size; /* size of each element in bytes */ }; @@ -106,7 +106,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) INIT_HLIST_HEAD(&htab->buckets[i]); raw_spin_lock_init(&htab->lock); - htab->count = 0; + atomic_set(&htab->count, 0); return &htab->map; @@ -256,7 +256,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, l_old = lookup_elem_raw(head, l_new->hash, key, key_size); - if (!l_old && unlikely(htab->count >= map->max_entries)) { + if (!l_old && unlikely(atomic_read(&htab->count) >= map->max_entries)) { /* if elem with this 'key' doesn't exist and we've reached * max_entries limit, fail insertion of new elem */ @@ -284,7 +284,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, hlist_del_rcu(&l_old->hash_node); kfree_rcu(l_old, rcu); } else { - htab->count++; + atomic_inc(&htab->count); } raw_spin_unlock_irqrestore(&htab->lock, flags); @@ -319,7 +319,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) if (l) { hlist_del_rcu(&l->hash_node); - htab->count--; + atomic_dec(&htab->count); kfree_rcu(l, rcu); ret = 0; } @@ -339,7 +339,7 @@ static void delete_all_elements(struct bpf_htab *htab) hlist_for_each_entry_safe(l, n, head, hash_node) { hlist_del_rcu(&l->hash_node); - htab->count--; + atomic_dec(&htab->count); kfree(l); } } -- cgit v1.2.3 From 45d8390c56bd2851097736c1c20ad958880168df Mon Sep 17 00:00:00 2001 From: "tom.leiming@gmail.com" Date: Tue, 29 Dec 2015 22:40:26 +0800 Subject: bpf: hash: move select_bucket() out of htab's spinlock The spinlock is just used for protecting the per-bucket hlist, so it isn't needed for selecting bucket. Acked-by: Daniel Borkmann Signed-off-by: Ming Lei Signed-off-by: David S. Miller --- kernel/bpf/hashtab.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 2615388009a4..d857fcb3607b 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -248,12 +248,11 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, memcpy(l_new->key + round_up(key_size, 8), value, map->value_size); l_new->hash = htab_map_hash(l_new->key, key_size); + head = select_bucket(htab, l_new->hash); /* bpf_map_update_elem() can be called in_irq() */ raw_spin_lock_irqsave(&htab->lock, flags); - head = select_bucket(htab, l_new->hash); - l_old = lookup_elem_raw(head, l_new->hash, key, key_size); if (!l_old && unlikely(atomic_read(&htab->count) >= map->max_entries)) { @@ -310,11 +309,10 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) key_size = map->key_size; hash = htab_map_hash(key, key_size); + head = select_bucket(htab, hash); raw_spin_lock_irqsave(&htab->lock, flags); - head = select_bucket(htab, hash); - l = lookup_elem_raw(head, hash, key, key_size); if (l) { -- cgit v1.2.3 From 688ecfe60220516e8b6707c832ec02e92522dd85 Mon Sep 17 00:00:00 2001 From: "tom.leiming@gmail.com" Date: Tue, 29 Dec 2015 22:40:27 +0800 Subject: bpf: hash: use per-bucket spinlock Both htab_map_update_elem() and htab_map_delete_elem() can be called from eBPF program, and they may be in kernel hot path, so it isn't efficient to use a per-hashtable lock in this two helpers. The per-hashtable spinlock is used for protecting bucket's hlist, and per-bucket lock is just enough. This patch converts the per-hashtable lock into per-bucket spinlock, so that contention can be decreased a lot. Signed-off-by: Ming Lei Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/hashtab.c | 50 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index d857fcb3607b..c5b30fd8a315 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -14,10 +14,14 @@ #include #include +struct bucket { + struct hlist_head head; + raw_spinlock_t lock; +}; + struct bpf_htab { struct bpf_map map; - struct hlist_head *buckets; - raw_spinlock_t lock; + struct bucket *buckets; atomic_t count; /* number of elements in this hashtable */ u32 n_buckets; /* number of hash buckets */ u32 elem_size; /* size of each element in bytes */ @@ -79,33 +83,34 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) /* prevent zero size kmalloc and check for u32 overflow */ if (htab->n_buckets == 0 || - htab->n_buckets > U32_MAX / sizeof(struct hlist_head)) + htab->n_buckets > U32_MAX / sizeof(struct bucket)) goto free_htab; - if ((u64) htab->n_buckets * sizeof(struct hlist_head) + + if ((u64) htab->n_buckets * sizeof(struct bucket) + (u64) htab->elem_size * htab->map.max_entries >= U32_MAX - PAGE_SIZE) /* make sure page count doesn't overflow */ goto free_htab; - htab->map.pages = round_up(htab->n_buckets * sizeof(struct hlist_head) + + htab->map.pages = round_up(htab->n_buckets * sizeof(struct bucket) + htab->elem_size * htab->map.max_entries, PAGE_SIZE) >> PAGE_SHIFT; err = -ENOMEM; - htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head), + htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket), GFP_USER | __GFP_NOWARN); if (!htab->buckets) { - htab->buckets = vmalloc(htab->n_buckets * sizeof(struct hlist_head)); + htab->buckets = vmalloc(htab->n_buckets * sizeof(struct bucket)); if (!htab->buckets) goto free_htab; } - for (i = 0; i < htab->n_buckets; i++) - INIT_HLIST_HEAD(&htab->buckets[i]); + for (i = 0; i < htab->n_buckets; i++) { + INIT_HLIST_HEAD(&htab->buckets[i].head); + raw_spin_lock_init(&htab->buckets[i].lock); + } - raw_spin_lock_init(&htab->lock); atomic_set(&htab->count, 0); return &htab->map; @@ -120,11 +125,16 @@ static inline u32 htab_map_hash(const void *key, u32 key_len) return jhash(key, key_len, 0); } -static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) +static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) { return &htab->buckets[hash & (htab->n_buckets - 1)]; } +static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) +{ + return &__select_bucket(htab, hash)->head; +} + static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash, void *key, u32 key_size) { @@ -227,6 +237,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l_new, *l_old; struct hlist_head *head; + struct bucket *b; unsigned long flags; u32 key_size; int ret; @@ -248,10 +259,11 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, memcpy(l_new->key + round_up(key_size, 8), value, map->value_size); l_new->hash = htab_map_hash(l_new->key, key_size); - head = select_bucket(htab, l_new->hash); + b = __select_bucket(htab, l_new->hash); + head = &b->head; /* bpf_map_update_elem() can be called in_irq() */ - raw_spin_lock_irqsave(&htab->lock, flags); + raw_spin_lock_irqsave(&b->lock, flags); l_old = lookup_elem_raw(head, l_new->hash, key, key_size); @@ -285,11 +297,11 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, } else { atomic_inc(&htab->count); } - raw_spin_unlock_irqrestore(&htab->lock, flags); + raw_spin_unlock_irqrestore(&b->lock, flags); return 0; err: - raw_spin_unlock_irqrestore(&htab->lock, flags); + raw_spin_unlock_irqrestore(&b->lock, flags); kfree(l_new); return ret; } @@ -299,6 +311,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct hlist_head *head; + struct bucket *b; struct htab_elem *l; unsigned long flags; u32 hash, key_size; @@ -309,9 +322,10 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) key_size = map->key_size; hash = htab_map_hash(key, key_size); - head = select_bucket(htab, hash); + b = __select_bucket(htab, hash); + head = &b->head; - raw_spin_lock_irqsave(&htab->lock, flags); + raw_spin_lock_irqsave(&b->lock, flags); l = lookup_elem_raw(head, hash, key, key_size); @@ -322,7 +336,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) ret = 0; } - raw_spin_unlock_irqrestore(&htab->lock, flags); + raw_spin_unlock_irqrestore(&b->lock, flags); return ret; } -- cgit v1.2.3 From 229394e8e62a4191d592842cf67e80c62a492937 Mon Sep 17 00:00:00 2001 From: Rabin Vincent Date: Tue, 12 Jan 2016 20:17:08 +0100 Subject: net: bpf: reject invalid shifts On ARM64, a BUG() is triggered in the eBPF JIT if a filter with a constant shift that can't be encoded in the immediate field of the UBFM/SBFM instructions is passed to the JIT. Since these shifts amounts, which are negative or >= regsize, are invalid, reject them in the eBPF verifier and the classic BPF filter checker, for all architectures. Signed-off-by: Rabin Vincent Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a7945d10b378..d1d3e8f57de9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1121,6 +1121,16 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) return -EINVAL; } + if ((opcode == BPF_LSH || opcode == BPF_RSH || + opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) { + int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32; + + if (insn->imm < 0 || insn->imm >= size) { + verbose("invalid shift %d\n", insn->imm); + return -EINVAL; + } + } + /* pattern match 'bpf_add Rx, imm' instruction */ if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && regs[insn->dst_reg].type == FRAME_PTR && -- cgit v1.2.3