From f99bf205dab026ef434520198af2fcb7dae0efdb Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 19 Nov 2015 11:56:22 +0100
Subject: bpf: add show_fdinfo handler for maps

Add a handler for show_fdinfo() to be used by the anon-inodes
backend for eBPF maps, and dump the map specification there. Not
only useful for admins, but also it provides a minimal way to
compare specs from ELF vs pinned object.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/syscall.c | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0d3313d02a7e..6d1407bc1531 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -93,6 +93,23 @@ void bpf_map_put(struct bpf_map *map)
 	}
 }
 
+#ifdef CONFIG_PROC_FS
+static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
+{
+	const struct bpf_map *map = filp->private_data;
+
+	seq_printf(m,
+		   "map_type:\t%u\n"
+		   "key_size:\t%u\n"
+		   "value_size:\t%u\n"
+		   "max_entries:\t%u\n",
+		   map->map_type,
+		   map->key_size,
+		   map->value_size,
+		   map->max_entries);
+}
+#endif
+
 static int bpf_map_release(struct inode *inode, struct file *filp)
 {
 	struct bpf_map *map = filp->private_data;
@@ -108,7 +125,10 @@ static int bpf_map_release(struct inode *inode, struct file *filp)
 }
 
 static const struct file_operations bpf_map_fops = {
-	.release = bpf_map_release,
+#ifdef CONFIG_PROC_FS
+	.show_fdinfo	= bpf_map_show_fdinfo,
+#endif
+	.release	= bpf_map_release,
 };
 
 int bpf_map_new_fd(struct bpf_map *map)
-- 
cgit v1.2.3


From b11cfb5807e30333b36c02701382b820b7dcf0d5 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 20 Nov 2015 15:55:52 -0500
Subject: cgroup: record ancestor IDs and reimplement cgroup_is_descendant()
 using it

cgroup_is_descendant() currently walks up the hierarchy and compares
each ancestor to the cgroup in question.  While enough for cgroup core
usages, this can't be used in hot paths to test cgroup membership.
This patch adds cgroup->ancestor_ids[] which records the IDs of all
ancestors including self and cgroup->level for the nesting level.

This allows testing whether a given cgroup is a descendant of another
in three finite steps - testing whether the two belong to the same
hierarchy, whether the descendant candidate is at the same or a higher
level than the ancestor and comparing the recorded ancestor_id at the
matching level.  cgroup_is_descendant() is accordingly reimplmented
and made inline.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup.c | 32 ++++++++++----------------------
 1 file changed, 10 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f1603c153890..3190040792c8 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -459,25 +459,6 @@ struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
 }
 EXPORT_SYMBOL_GPL(of_css);
 
-/**
- * cgroup_is_descendant - test ancestry
- * @cgrp: the cgroup to be tested
- * @ancestor: possible ancestor of @cgrp
- *
- * Test whether @cgrp is a descendant of @ancestor.  It also returns %true
- * if @cgrp == @ancestor.  This function is safe to call as long as @cgrp
- * and @ancestor are accessible.
- */
-bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
-{
-	while (cgrp) {
-		if (cgrp == ancestor)
-			return true;
-		cgrp = cgroup_parent(cgrp);
-	}
-	return false;
-}
-
 static int notify_on_release(const struct cgroup *cgrp)
 {
 	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
@@ -1903,6 +1884,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
 	if (ret < 0)
 		goto out;
 	root_cgrp->id = ret;
+	root_cgrp->ancestor_ids[0] = ret;
 
 	ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0,
 			      GFP_KERNEL);
@@ -4846,11 +4828,11 @@ err_free_css:
 static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
 			umode_t mode)
 {
-	struct cgroup *parent, *cgrp;
+	struct cgroup *parent, *cgrp, *tcgrp;
 	struct cgroup_root *root;
 	struct cgroup_subsys *ss;
 	struct kernfs_node *kn;
-	int ssid, ret;
+	int level, ssid, ret;
 
 	/* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable.
 	 */
@@ -4861,9 +4843,11 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
 	if (!parent)
 		return -ENODEV;
 	root = parent->root;
+	level = parent->level + 1;
 
 	/* allocate the cgroup and its ID, 0 is reserved for the root */
-	cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
+	cgrp = kzalloc(sizeof(*cgrp) +
+		       sizeof(cgrp->ancestor_ids[0]) * (level + 1), GFP_KERNEL);
 	if (!cgrp) {
 		ret = -ENOMEM;
 		goto out_unlock;
@@ -4887,6 +4871,10 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
 
 	cgrp->self.parent = &parent->self;
 	cgrp->root = root;
+	cgrp->level = level;
+
+	for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp))
+		cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
 
 	if (notify_on_release(parent))
 		set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
-- 
cgit v1.2.3


From 16af439645455fbf36984ca5e72f31073ee19ab7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 20 Nov 2015 15:55:52 -0500
Subject: cgroup: implement cgroup_get_from_path() and expose cgroup_put()

Implement cgroup_get_from_path() using kernfs_walk_and_get() which
obtains a default hierarchy cgroup from its path.  This will be used
to allow cgroup path based matching from outside cgroup proper -
e.g. networking and perf.

v2: Add EXPORT_SYMBOL_GPL(cgroup_get_from_path).

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup.c | 39 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 34 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 3190040792c8..3db5e8f5b702 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -434,11 +434,6 @@ static bool cgroup_tryget(struct cgroup *cgrp)
 	return css_tryget(&cgrp->self);
 }
 
-static void cgroup_put(struct cgroup *cgrp)
-{
-	css_put(&cgrp->self);
-}
-
 struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
 {
 	struct cgroup *cgrp = of->kn->parent->priv;
@@ -5753,6 +5748,40 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
 	return id > 0 ? idr_find(&ss->css_idr, id) : NULL;
 }
 
+/**
+ * cgroup_get_from_path - lookup and get a cgroup from its default hierarchy path
+ * @path: path on the default hierarchy
+ *
+ * Find the cgroup at @path on the default hierarchy, increment its
+ * reference count and return it.  Returns pointer to the found cgroup on
+ * success, ERR_PTR(-ENOENT) if @path doens't exist and ERR_PTR(-ENOTDIR)
+ * if @path points to a non-directory.
+ */
+struct cgroup *cgroup_get_from_path(const char *path)
+{
+	struct kernfs_node *kn;
+	struct cgroup *cgrp;
+
+	mutex_lock(&cgroup_mutex);
+
+	kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path);
+	if (kn) {
+		if (kernfs_type(kn) == KERNFS_DIR) {
+			cgrp = kn->priv;
+			cgroup_get(cgrp);
+		} else {
+			cgrp = ERR_PTR(-ENOTDIR);
+		}
+		kernfs_put(kn);
+	} else {
+		cgrp = ERR_PTR(-ENOENT);
+	}
+
+	mutex_unlock(&cgroup_mutex);
+	return cgrp;
+}
+EXPORT_SYMBOL_GPL(cgroup_get_from_path);
+
 #ifdef CONFIG_CGROUP_DEBUG
 static struct cgroup_subsys_state *
 debug_css_alloc(struct cgroup_subsys_state *parent_css)
-- 
cgit v1.2.3


From bd1060a1d67128bb8fbe2e1384c518912cbe54e7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 7 Dec 2015 17:38:53 -0500
Subject: sock, cgroup: add sock->sk_cgroup

In cgroup v1, dealing with cgroup membership was difficult because the
number of membership associations was unbound.  As a result, cgroup v1
grew several controllers whose primary purpose is either tagging
membership or pull in configuration knobs from other subsystems so
that cgroup membership test can be avoided.

net_cls and net_prio controllers are examples of the latter.  They
allow configuring network-specific attributes from cgroup side so that
network subsystem can avoid testing cgroup membership; unfortunately,
these are not only cumbersome but also problematic.

Both net_cls and net_prio aren't properly hierarchical.  Both inherit
configuration from the parent on creation but there's no interaction
afterwards.  An ancestor doesn't restrict the behavior in its subtree
in anyway and configuration changes aren't propagated downwards.
Especially when combined with cgroup delegation, this is problematic
because delegatees can mess up whatever network configuration
implemented at the system level.  net_prio would allow the delegatees
to set whatever priority value regardless of CAP_NET_ADMIN and net_cls
the same for classid.

While it is possible to solve these issues from controller side by
implementing hierarchical allowable ranges in both controllers, it
would involve quite a bit of complexity in the controllers and further
obfuscate network configuration as it becomes even more difficult to
tell what's actually being configured looking from the network side.
While not much can be done for v1 at this point, as membership
handling is sane on cgroup v2, it'd be better to make cgroup matching
behave like other network matches and classifiers than introducing
further complications.

In preparation, this patch updates sock->sk_cgrp_data handling so that
it points to the v2 cgroup that sock was created in until either
net_prio or net_cls is used.  Once either of the two is used,
sock->sk_cgrp_data reverts to its previous role of carrying prioidx
and classid.  This is to avoid adding yet another cgroup related field
to struct sock.

As the mode switching can happen at most once per boot, the switching
mechanism is aimed at lowering hot path overhead.  It may leak a
finite, likely small, number of cgroup refs and report spurious
prioidx or classid on switching; however, dynamic updates of prioidx
and classid have always been racy and lossy - socks between creation
and fd installation are never updated, config changes don't update
existing sockets at all, and prioidx may index with dead and recycled
cgroup IDs.  Non-critical inaccuracies from small race windows won't
make any noticeable difference.

This patch doesn't make use of the pointer yet.  The following patch
will implement netfilter match for cgroup2 membership.

v2: Use sock_cgroup_data to avoid inflating struct sock w/ another
    cgroup specific field.

v3: Add comments explaining why sock_data_prioidx() and
    sock_data_classid() use different fallback values.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Daniel Wagner <daniel.wagner@bmw-carit.de>
CC: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/cgroup.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 54 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 3db5e8f5b702..4f8f7927b422 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -57,8 +57,8 @@
 #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
 #include <linux/kthread.h>
 #include <linux/delay.h>
-
 #include <linux/atomic.h>
+#include <net/sock.h>
 
 /*
  * pidlists linger the following amount before being destroyed.  The goal
@@ -5782,6 +5782,59 @@ struct cgroup *cgroup_get_from_path(const char *path)
 }
 EXPORT_SYMBOL_GPL(cgroup_get_from_path);
 
+/*
+ * sock->sk_cgrp_data handling.  For more info, see sock_cgroup_data
+ * definition in cgroup-defs.h.
+ */
+#ifdef CONFIG_SOCK_CGROUP_DATA
+
+#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
+
+spinlock_t cgroup_sk_update_lock;
+static bool cgroup_sk_alloc_disabled __read_mostly;
+
+void cgroup_sk_alloc_disable(void)
+{
+	if (cgroup_sk_alloc_disabled)
+		return;
+	pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n");
+	cgroup_sk_alloc_disabled = true;
+}
+
+#else
+
+#define cgroup_sk_alloc_disabled	false
+
+#endif
+
+void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
+{
+	if (cgroup_sk_alloc_disabled)
+		return;
+
+	rcu_read_lock();
+
+	while (true) {
+		struct css_set *cset;
+
+		cset = task_css_set(current);
+		if (likely(cgroup_tryget(cset->dfl_cgrp))) {
+			skcd->val = (unsigned long)cset->dfl_cgrp;
+			break;
+		}
+		cpu_relax();
+	}
+
+	rcu_read_unlock();
+}
+
+void cgroup_sk_free(struct sock_cgroup_data *skcd)
+{
+	cgroup_put(sock_cgroup_ptr(skcd));
+}
+
+#endif	/* CONFIG_SOCK_CGROUP_DATA */
+
 #ifdef CONFIG_CGROUP_DEBUG
 static struct cgroup_subsys_state *
 debug_css_alloc(struct cgroup_subsys_state *parent_css)
-- 
cgit v1.2.3


From bb35a6ef7da492e7df1fe8772716ff88c172b4cc Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 10 Dec 2015 22:33:49 +0100
Subject: bpf, inode: allow for rename and link ops

Add support for renaming and hard links to the fs. Most of this can be
implemented by using simple library operations under the same constraints
that we don't use a reserved name like elsewhere. Linking can be useful
to share/manage things like maps across subsystem users. It works within
the file system boundary, but is not allowed for directories.

Symbolic links are explicitly not implemented here, as it can be better
done already by doing bind mounts inside bpf fs to set up shared directories
f.e. useful when using volumes in docker containers that map a private
working directory into /sys/fs/bpf/ which contains itself a bind mounted
path from the host's /sys/fs/bpf/ mount that is shared among multiple
containers. For single maps instead of whole directory, hard links can
be easily used to do the same.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/inode.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 5a8a797d50b7..f2ece3c174a5 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -187,11 +187,31 @@ static int bpf_mkobj(struct inode *dir, struct dentry *dentry, umode_t mode,
 	}
 }
 
+static int bpf_link(struct dentry *old_dentry, struct inode *dir,
+		    struct dentry *new_dentry)
+{
+	if (bpf_dname_reserved(new_dentry))
+		return -EPERM;
+
+	return simple_link(old_dentry, dir, new_dentry);
+}
+
+static int bpf_rename(struct inode *old_dir, struct dentry *old_dentry,
+		      struct inode *new_dir, struct dentry *new_dentry)
+{
+	if (bpf_dname_reserved(new_dentry))
+		return -EPERM;
+
+	return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
+}
+
 static const struct inode_operations bpf_dir_iops = {
 	.lookup		= simple_lookup,
 	.mknod		= bpf_mkobj,
 	.mkdir		= bpf_mkdir,
 	.rmdir		= simple_rmdir,
+	.rename		= bpf_rename,
+	.link		= bpf_link,
 	.unlink		= simple_unlink,
 };
 
-- 
cgit v1.2.3


From 3fa4cc9c2df37b393b968dcc3bb2ab1e2ff7ea7f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 14 Dec 2015 11:24:06 -0500
Subject: net, cgroup: cgroup_sk_updat_lock was missing initializer

bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup") added global
spinlock cgroup_sk_update_lock but erroneously skipped initializer
leading to uninitialized spinlock warning.  Fix it by using
DEFINE_SPINLOCK().

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Dexuan Cui <decui@microsoft.com>
Fixes: bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup")
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/cgroup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4f8f7927b422..4466273f59e1 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -5790,7 +5790,7 @@ EXPORT_SYMBOL_GPL(cgroup_get_from_path);
 
 #if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
 
-spinlock_t cgroup_sk_update_lock;
+DEFINE_SPINLOCK(cgroup_sk_update_lock);
 static bool cgroup_sk_alloc_disabled __read_mostly;
 
 void cgroup_sk_alloc_disable(void)
-- 
cgit v1.2.3


From 8b614aebecdf2b1f72d51b1527f5a75d218b78e2 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 17 Dec 2015 23:51:54 +0100
Subject: bpf: move clearing of A/X into classic to eBPF migration prologue

Back in the days where eBPF (or back then "internal BPF" ;->) was not
exposed to user space, and only the classic BPF programs internally
translated into eBPF programs, we missed the fact that for classic BPF
A and X needed to be cleared. It was fixed back then via 83d5b7ef99c9
("net: filter: initialize A and X registers"), and thus classic BPF
specifics were added to the eBPF interpreter core to work around it.

This added some confusion for JIT developers later on that take the
eBPF interpreter code as an example for deriving their JIT. F.e. in
f75298f5c3fe ("s390/bpf: clear correct BPF accumulator register"), at
least X could leak stack memory. Furthermore, since this is only needed
for classic BPF translations and not for eBPF (verifier takes care
that read access to regs cannot be done uninitialized), more complexity
is added to JITs as they need to determine whether they deal with
migrations or native eBPF where they can just omit clearing A/X in
their prologue and thus reduce image size a bit, see f.e. cde66c2d88da
("s390/bpf: Only clear A and X for converted BPF programs"). In other
cases (x86, arm64), A and X is being cleared in the prologue also for
eBPF case, which is unnecessary.

Lets move this into the BPF migration in bpf_convert_filter() where it
actually belongs as long as the number of eBPF JITs are still few. It
can thus be done generically; allowing us to remove the quirk from
__bpf_prog_run() and to slightly reduce JIT image size in case of eBPF,
while reducing code duplication on this matter in current(/future) eBPF
JITs.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Tested-by: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Cc: Zi Shen Lim <zlim.lnx@gmail.com>
Cc: Yang Shi <yang.shi@linaro.org>
Acked-by: Yang Shi <yang.shi@linaro.org>
Acked-by: Zi Shen Lim <zlim.lnx@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/core.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 334b1bdd572c..972d9a8e4ac4 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -306,10 +306,6 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
 	FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)];
 	ARG1 = (u64) (unsigned long) ctx;
 
-	/* Registers used in classic BPF programs need to be reset first. */
-	regs[BPF_REG_A] = 0;
-	regs[BPF_REG_X] = 0;
-
 select_insn:
 	goto *jumptable[insn->code];
 
-- 
cgit v1.2.3


From 6591f1e6662dd595effb52a54e42a6d2d2b03e51 Mon Sep 17 00:00:00 2001
From: "tom.leiming@gmail.com" <tom.leiming@gmail.com>
Date: Tue, 29 Dec 2015 22:40:25 +0800
Subject: bpf: hash: use atomic count

Preparing for removing global per-hashtable lock, so
the counter need to be defined as aotmic_t first.

Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/hashtab.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 34777b3746fa..2615388009a4 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -18,7 +18,7 @@ struct bpf_htab {
 	struct bpf_map map;
 	struct hlist_head *buckets;
 	raw_spinlock_t lock;
-	u32 count;	/* number of elements in this hashtable */
+	atomic_t count;	/* number of elements in this hashtable */
 	u32 n_buckets;	/* number of hash buckets */
 	u32 elem_size;	/* size of each element in bytes */
 };
@@ -106,7 +106,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 		INIT_HLIST_HEAD(&htab->buckets[i]);
 
 	raw_spin_lock_init(&htab->lock);
-	htab->count = 0;
+	atomic_set(&htab->count, 0);
 
 	return &htab->map;
 
@@ -256,7 +256,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 
 	l_old = lookup_elem_raw(head, l_new->hash, key, key_size);
 
-	if (!l_old && unlikely(htab->count >= map->max_entries)) {
+	if (!l_old && unlikely(atomic_read(&htab->count) >= map->max_entries)) {
 		/* if elem with this 'key' doesn't exist and we've reached
 		 * max_entries limit, fail insertion of new elem
 		 */
@@ -284,7 +284,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 		hlist_del_rcu(&l_old->hash_node);
 		kfree_rcu(l_old, rcu);
 	} else {
-		htab->count++;
+		atomic_inc(&htab->count);
 	}
 	raw_spin_unlock_irqrestore(&htab->lock, flags);
 
@@ -319,7 +319,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
 
 	if (l) {
 		hlist_del_rcu(&l->hash_node);
-		htab->count--;
+		atomic_dec(&htab->count);
 		kfree_rcu(l, rcu);
 		ret = 0;
 	}
@@ -339,7 +339,7 @@ static void delete_all_elements(struct bpf_htab *htab)
 
 		hlist_for_each_entry_safe(l, n, head, hash_node) {
 			hlist_del_rcu(&l->hash_node);
-			htab->count--;
+			atomic_dec(&htab->count);
 			kfree(l);
 		}
 	}
-- 
cgit v1.2.3


From 45d8390c56bd2851097736c1c20ad958880168df Mon Sep 17 00:00:00 2001
From: "tom.leiming@gmail.com" <tom.leiming@gmail.com>
Date: Tue, 29 Dec 2015 22:40:26 +0800
Subject: bpf: hash: move select_bucket() out of htab's spinlock

The spinlock is just used for protecting the per-bucket
hlist, so it isn't needed for selecting bucket.

Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/hashtab.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 2615388009a4..d857fcb3607b 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -248,12 +248,11 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	memcpy(l_new->key + round_up(key_size, 8), value, map->value_size);
 
 	l_new->hash = htab_map_hash(l_new->key, key_size);
+	head = select_bucket(htab, l_new->hash);
 
 	/* bpf_map_update_elem() can be called in_irq() */
 	raw_spin_lock_irqsave(&htab->lock, flags);
 
-	head = select_bucket(htab, l_new->hash);
-
 	l_old = lookup_elem_raw(head, l_new->hash, key, key_size);
 
 	if (!l_old && unlikely(atomic_read(&htab->count) >= map->max_entries)) {
@@ -310,11 +309,10 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
 	key_size = map->key_size;
 
 	hash = htab_map_hash(key, key_size);
+	head = select_bucket(htab, hash);
 
 	raw_spin_lock_irqsave(&htab->lock, flags);
 
-	head = select_bucket(htab, hash);
-
 	l = lookup_elem_raw(head, hash, key, key_size);
 
 	if (l) {
-- 
cgit v1.2.3


From 688ecfe60220516e8b6707c832ec02e92522dd85 Mon Sep 17 00:00:00 2001
From: "tom.leiming@gmail.com" <tom.leiming@gmail.com>
Date: Tue, 29 Dec 2015 22:40:27 +0800
Subject: bpf: hash: use per-bucket spinlock

Both htab_map_update_elem() and htab_map_delete_elem() can be
called from eBPF program, and they may be in kernel hot path,
so it isn't efficient to use a per-hashtable lock in this two
helpers.

The per-hashtable spinlock is used for protecting bucket's
hlist, and per-bucket lock is just enough. This patch converts
the per-hashtable lock into per-bucket spinlock, so that
contention can be decreased a lot.

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/hashtab.c | 50 ++++++++++++++++++++++++++++++++------------------
 1 file changed, 32 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index d857fcb3607b..c5b30fd8a315 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -14,10 +14,14 @@
 #include <linux/filter.h>
 #include <linux/vmalloc.h>
 
+struct bucket {
+	struct hlist_head head;
+	raw_spinlock_t lock;
+};
+
 struct bpf_htab {
 	struct bpf_map map;
-	struct hlist_head *buckets;
-	raw_spinlock_t lock;
+	struct bucket *buckets;
 	atomic_t count;	/* number of elements in this hashtable */
 	u32 n_buckets;	/* number of hash buckets */
 	u32 elem_size;	/* size of each element in bytes */
@@ -79,33 +83,34 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 
 	/* prevent zero size kmalloc and check for u32 overflow */
 	if (htab->n_buckets == 0 ||
-	    htab->n_buckets > U32_MAX / sizeof(struct hlist_head))
+	    htab->n_buckets > U32_MAX / sizeof(struct bucket))
 		goto free_htab;
 
-	if ((u64) htab->n_buckets * sizeof(struct hlist_head) +
+	if ((u64) htab->n_buckets * sizeof(struct bucket) +
 	    (u64) htab->elem_size * htab->map.max_entries >=
 	    U32_MAX - PAGE_SIZE)
 		/* make sure page count doesn't overflow */
 		goto free_htab;
 
-	htab->map.pages = round_up(htab->n_buckets * sizeof(struct hlist_head) +
+	htab->map.pages = round_up(htab->n_buckets * sizeof(struct bucket) +
 				   htab->elem_size * htab->map.max_entries,
 				   PAGE_SIZE) >> PAGE_SHIFT;
 
 	err = -ENOMEM;
-	htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head),
+	htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket),
 				      GFP_USER | __GFP_NOWARN);
 
 	if (!htab->buckets) {
-		htab->buckets = vmalloc(htab->n_buckets * sizeof(struct hlist_head));
+		htab->buckets = vmalloc(htab->n_buckets * sizeof(struct bucket));
 		if (!htab->buckets)
 			goto free_htab;
 	}
 
-	for (i = 0; i < htab->n_buckets; i++)
-		INIT_HLIST_HEAD(&htab->buckets[i]);
+	for (i = 0; i < htab->n_buckets; i++) {
+		INIT_HLIST_HEAD(&htab->buckets[i].head);
+		raw_spin_lock_init(&htab->buckets[i].lock);
+	}
 
-	raw_spin_lock_init(&htab->lock);
 	atomic_set(&htab->count, 0);
 
 	return &htab->map;
@@ -120,11 +125,16 @@ static inline u32 htab_map_hash(const void *key, u32 key_len)
 	return jhash(key, key_len, 0);
 }
 
-static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
 {
 	return &htab->buckets[hash & (htab->n_buckets - 1)];
 }
 
+static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+{
+	return &__select_bucket(htab, hash)->head;
+}
+
 static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
 					 void *key, u32 key_size)
 {
@@ -227,6 +237,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	struct htab_elem *l_new, *l_old;
 	struct hlist_head *head;
+	struct bucket *b;
 	unsigned long flags;
 	u32 key_size;
 	int ret;
@@ -248,10 +259,11 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	memcpy(l_new->key + round_up(key_size, 8), value, map->value_size);
 
 	l_new->hash = htab_map_hash(l_new->key, key_size);
-	head = select_bucket(htab, l_new->hash);
+	b = __select_bucket(htab, l_new->hash);
+	head = &b->head;
 
 	/* bpf_map_update_elem() can be called in_irq() */
-	raw_spin_lock_irqsave(&htab->lock, flags);
+	raw_spin_lock_irqsave(&b->lock, flags);
 
 	l_old = lookup_elem_raw(head, l_new->hash, key, key_size);
 
@@ -285,11 +297,11 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	} else {
 		atomic_inc(&htab->count);
 	}
-	raw_spin_unlock_irqrestore(&htab->lock, flags);
+	raw_spin_unlock_irqrestore(&b->lock, flags);
 
 	return 0;
 err:
-	raw_spin_unlock_irqrestore(&htab->lock, flags);
+	raw_spin_unlock_irqrestore(&b->lock, flags);
 	kfree(l_new);
 	return ret;
 }
@@ -299,6 +311,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	struct hlist_head *head;
+	struct bucket *b;
 	struct htab_elem *l;
 	unsigned long flags;
 	u32 hash, key_size;
@@ -309,9 +322,10 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
 	key_size = map->key_size;
 
 	hash = htab_map_hash(key, key_size);
-	head = select_bucket(htab, hash);
+	b = __select_bucket(htab, hash);
+	head = &b->head;
 
-	raw_spin_lock_irqsave(&htab->lock, flags);
+	raw_spin_lock_irqsave(&b->lock, flags);
 
 	l = lookup_elem_raw(head, hash, key, key_size);
 
@@ -322,7 +336,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
 		ret = 0;
 	}
 
-	raw_spin_unlock_irqrestore(&htab->lock, flags);
+	raw_spin_unlock_irqrestore(&b->lock, flags);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 229394e8e62a4191d592842cf67e80c62a492937 Mon Sep 17 00:00:00 2001
From: Rabin Vincent <rabin@rab.in>
Date: Tue, 12 Jan 2016 20:17:08 +0100
Subject: net: bpf: reject invalid shifts

On ARM64, a BUG() is triggered in the eBPF JIT if a filter with a
constant shift that can't be encoded in the immediate field of the
UBFM/SBFM instructions is passed to the JIT.  Since these shifts
amounts, which are negative or >= regsize, are invalid, reject them in
the eBPF verifier and the classic BPF filter checker, for all
architectures.

Signed-off-by: Rabin Vincent <rabin@rab.in>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a7945d10b378..d1d3e8f57de9 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1121,6 +1121,16 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
 			return -EINVAL;
 		}
 
+		if ((opcode == BPF_LSH || opcode == BPF_RSH ||
+		     opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) {
+			int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32;
+
+			if (insn->imm < 0 || insn->imm >= size) {
+				verbose("invalid shift %d\n", insn->imm);
+				return -EINVAL;
+			}
+		}
+
 		/* pattern match 'bpf_add Rx, imm' instruction */
 		if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 &&
 		    regs[insn->dst_reg].type == FRAME_PTR &&
-- 
cgit v1.2.3