From 60a3b2253c413cf601783b070507d7dd6620c954 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Tue, 2 Sep 2014 22:53:44 +0200
Subject: net: bpf: make eBPF interpreter images read-only

With eBPF getting more extended and exposure to user space is on it's way,
hardening the memory range the interpreter uses to steer its command flow
seems appropriate.  This patch moves the to be interpreted bytecode to
read-only pages.

In case we execute a corrupted BPF interpreter image for some reason e.g.
caused by an attacker which got past a verifier stage, it would not only
provide arbitrary read/write memory access but arbitrary function calls
as well. After setting up the BPF interpreter image, its contents do not
change until destruction time, thus we can setup the image on immutable
made pages in order to mitigate modifications to that code. The idea
is derived from commit 314beb9bcabf ("x86: bpf_jit_comp: secure bpf jit
against spraying attacks").

This is possible because bpf_prog is not part of sk_filter anymore.
After setup bpf_prog cannot be altered during its life-time. This prevents
any modifications to the entire bpf_prog structure (incl. function/JIT
image pointer).

Every eBPF program (including classic BPF that are migrated) have to call
bpf_prog_select_runtime() to select either interpreter or a JIT image
as a last setup step, and they all are being freed via bpf_prog_free(),
including non-JIT. Therefore, we can easily integrate this into the
eBPF life-time, plus since we directly allocate a bpf_prog, we have no
performance penalty.

Tested with seccomp and test_bpf testsuite in JIT/non-JIT mode and manual
inspection of kernel_page_tables.  Brad Spengler proposed the same idea
via Twitter during development of this patch.

Joint work with Hannes Frederic Sowa.

Suggested-by: Brad Spengler <spender@grsecurity.net>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Kees Cook <keescook@chromium.org>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/core.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 78 insertions(+), 2 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 7f0dbcbb34af..b54bb2c2e494 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -22,6 +22,7 @@
  */
 #include <linux/filter.h>
 #include <linux/skbuff.h>
+#include <linux/vmalloc.h>
 #include <asm/unaligned.h>
 
 /* Registers */
@@ -63,6 +64,67 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns
 	return NULL;
 }
 
+struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
+{
+	gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO |
+			  gfp_extra_flags;
+	struct bpf_work_struct *ws;
+	struct bpf_prog *fp;
+
+	size = round_up(size, PAGE_SIZE);
+	fp = __vmalloc(size, gfp_flags, PAGE_KERNEL);
+	if (fp == NULL)
+		return NULL;
+
+	ws = kmalloc(sizeof(*ws), GFP_KERNEL | gfp_extra_flags);
+	if (ws == NULL) {
+		vfree(fp);
+		return NULL;
+	}
+
+	fp->pages = size / PAGE_SIZE;
+	fp->work = ws;
+
+	return fp;
+}
+EXPORT_SYMBOL_GPL(bpf_prog_alloc);
+
+struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
+				  gfp_t gfp_extra_flags)
+{
+	gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO |
+			  gfp_extra_flags;
+	struct bpf_prog *fp;
+
+	BUG_ON(fp_old == NULL);
+
+	size = round_up(size, PAGE_SIZE);
+	if (size <= fp_old->pages * PAGE_SIZE)
+		return fp_old;
+
+	fp = __vmalloc(size, gfp_flags, PAGE_KERNEL);
+	if (fp != NULL) {
+		memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE);
+		fp->pages = size / PAGE_SIZE;
+
+		/* We keep fp->work from fp_old around in the new
+		 * reallocated structure.
+		 */
+		fp_old->work = NULL;
+		__bpf_prog_free(fp_old);
+	}
+
+	return fp;
+}
+EXPORT_SYMBOL_GPL(bpf_prog_realloc);
+
+void __bpf_prog_free(struct bpf_prog *fp)
+{
+	kfree(fp->work);
+	vfree(fp);
+}
+EXPORT_SYMBOL_GPL(__bpf_prog_free);
+
 /* Base function for offset calculation. Needs to go into .text section,
  * therefore keeping it non-static as well; will also be used by JITs
  * anyway later on, so do not let the compiler omit it.
@@ -523,12 +585,26 @@ void bpf_prog_select_runtime(struct bpf_prog *fp)
 
 	/* Probe if internal BPF can be JITed */
 	bpf_int_jit_compile(fp);
+	/* Lock whole bpf_prog as read-only */
+	bpf_prog_lock_ro(fp);
 }
 EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
 
-/* free internal BPF program */
+static void bpf_prog_free_deferred(struct work_struct *work)
+{
+	struct bpf_work_struct *ws;
+
+	ws = container_of(work, struct bpf_work_struct, work);
+	bpf_jit_free(ws->prog);
+}
+
+/* Free internal BPF program */
 void bpf_prog_free(struct bpf_prog *fp)
 {
-	bpf_jit_free(fp);
+	struct bpf_work_struct *ws = fp->work;
+
+	INIT_WORK(&ws->work, bpf_prog_free_deferred);
+	ws->prog = fp;
+	schedule_work(&ws->work);
 }
 EXPORT_SYMBOL_GPL(bpf_prog_free);
-- 
cgit v1.2.3


From 02ab695bb37ee9ad515df0d0790d5977505dd04a Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Thu, 4 Sep 2014 22:17:17 -0700
Subject: net: filter: add "load 64-bit immediate" eBPF instruction

add BPF_LD_IMM64 instruction to load 64-bit immediate value into a register.
All previous instructions were 8-byte. This is first 16-byte instruction.
Two consecutive 'struct bpf_insn' blocks are interpreted as single instruction:
insn[0].code = BPF_LD | BPF_DW | BPF_IMM
insn[0].dst_reg = destination register
insn[0].imm = lower 32-bit
insn[1].code = 0
insn[1].imm = upper 32-bit
All unused fields must be zero.

Classic BPF has similar instruction: BPF_LD | BPF_W | BPF_IMM
which loads 32-bit immediate value into a register.

x64 JITs it as single 'movabsq %rax, imm64'
arm64 may JIT as sequence of four 'movk x0, #imm16, lsl #shift' insn

Note that old eBPF programs are binary compatible with new interpreter.

It helps eBPF programs load 64-bit constant into a register with one
instruction instead of using two registers and 4 instructions:
BPF_MOV32_IMM(R1, imm32)
BPF_ALU64_IMM(BPF_LSH, R1, 32)
BPF_MOV32_IMM(R2, imm32)
BPF_ALU64_REG(BPF_OR, R1, R2)

User space generated programs will use this instruction to load constants only.

To tell kernel that user space needs a pointer the _pseudo_ variant of
this instruction may be added later, which will use extra bits of encoding
to indicate what type of pointer user space is asking kernel to provide.
For example 'off' or 'src_reg' fields can be used for such purpose.
src_reg = 1 could mean that user space is asking kernel to validate and
load in-kernel map pointer.
src_reg = 2 could mean that user space needs readonly data section pointer
src_reg = 3 could mean that user space needs a pointer to per-cpu local data
All such future pseudo instructions will not be carrying the actual pointer
as part of the instruction, but rather will be treated as a request to kernel
to provide one. The kernel will verify the request_for_a_pointer, then
will drop _pseudo_ marking and will store actual internal pointer inside
the instruction, so the end result is the interpreter and JITs never
see pseudo BPF_LD_IMM64 insns and only operate on generic BPF_LD_IMM64 that
loads 64-bit immediate into a register. User space never operates on direct
pointers and verifier can easily recognize request_for_pointer vs other
instructions.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/core.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index b54bb2c2e494..2c2bfaacce66 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -242,6 +242,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
 		[BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W,
 		[BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H,
 		[BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B,
+		[BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW,
 	};
 	void *ptr;
 	int off;
@@ -301,6 +302,10 @@ select_insn:
 	ALU64_MOV_K:
 		DST = IMM;
 		CONT;
+	LD_IMM_DW:
+		DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32;
+		insn++;
+		CONT;
 	ALU64_ARSH_X:
 		(*(s64 *) &DST) >>= SRC;
 		CONT;
-- 
cgit v1.2.3


From 738cbe72adc5c8f2016c4c68aa5162631d4f27e1 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Mon, 8 Sep 2014 08:04:47 +0200
Subject: net: bpf: consolidate JIT binary allocator

Introduced in commit 314beb9bcabf ("x86: bpf_jit_comp: secure bpf jit
against spraying attacks") and later on replicated in aa2d2c73c21f
("s390/bpf,jit: address randomize and write protect jit code") for
s390 architecture, write protection for BPF JIT images got added and
a random start address of the JIT code, so that it's not on a page
boundary anymore.

Since both use a very similar allocator for the BPF binary header,
we can consolidate this code into the BPF core as it's mostly JIT
independant anyway.

This will also allow for future archs that support DEBUG_SET_MODULE_RONX
to just reuse instead of reimplementing it.

JIT tested on x86_64 and s390x with BPF test suite.

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/core.c | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 2c2bfaacce66..8ee520f0ec70 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -20,9 +20,12 @@
  * Andi Kleen - Fix a few bad bugs and races.
  * Kris Katterjohn - Added many additional checks in bpf_check_classic()
  */
+
 #include <linux/filter.h>
 #include <linux/skbuff.h>
 #include <linux/vmalloc.h>
+#include <linux/random.h>
+#include <linux/moduleloader.h>
 #include <asm/unaligned.h>
 
 /* Registers */
@@ -125,6 +128,42 @@ void __bpf_prog_free(struct bpf_prog *fp)
 }
 EXPORT_SYMBOL_GPL(__bpf_prog_free);
 
+struct bpf_binary_header *
+bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
+		     unsigned int alignment,
+		     bpf_jit_fill_hole_t bpf_fill_ill_insns)
+{
+	struct bpf_binary_header *hdr;
+	unsigned int size, hole, start;
+
+	/* Most of BPF filters are really small, but if some of them
+	 * fill a page, allow at least 128 extra bytes to insert a
+	 * random section of illegal instructions.
+	 */
+	size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);
+	hdr = module_alloc(size);
+	if (hdr == NULL)
+		return NULL;
+
+	/* Fill space with illegal/arch-dep instructions. */
+	bpf_fill_ill_insns(hdr, size);
+
+	hdr->pages = size / PAGE_SIZE;
+	hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
+		     PAGE_SIZE - sizeof(*hdr));
+	start = (prandom_u32() % hole) & ~(alignment - 1);
+
+	/* Leave a random number of instructions before BPF code. */
+	*image_ptr = &hdr->image[start];
+
+	return hdr;
+}
+
+void bpf_jit_binary_free(struct bpf_binary_header *hdr)
+{
+	module_free(NULL, hdr);
+}
+
 /* Base function for offset calculation. Needs to go into .text section,
  * therefore keeping it non-static as well; will also be used by JITs
  * anyway later on, so do not let the compiler omit it.
-- 
cgit v1.2.3


From b954d83421d51d822c42e5ab7b65069b25ad3005 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Wed, 10 Sep 2014 15:01:02 +0200
Subject: net: bpf: only build bpf_jit_binary_{alloc, free}() when jit selected

Since BPF JIT depends on the availability of module_alloc() and
module_free() helpers (HAVE_BPF_JIT and MODULES), we better build
that code only in case we have BPF_JIT in our config enabled, just
like with other JIT code. Fixes builds for arm/marzen_defconfig
and sh/rsk7269_defconfig.

====================
kernel/built-in.o: In function `bpf_jit_binary_alloc':
/home/cwang/linux/kernel/bpf/core.c:144: undefined reference to `module_alloc'
kernel/built-in.o: In function `bpf_jit_binary_free':
/home/cwang/linux/kernel/bpf/core.c:164: undefined reference to `module_free'
make: *** [vmlinux] Error 1
====================

Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Fixes: 738cbe72adc5 ("net: bpf: consolidate JIT binary allocator")
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/core.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 8ee520f0ec70..8b7002488251 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -128,6 +128,7 @@ void __bpf_prog_free(struct bpf_prog *fp)
 }
 EXPORT_SYMBOL_GPL(__bpf_prog_free);
 
+#ifdef CONFIG_BPF_JIT
 struct bpf_binary_header *
 bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
 		     unsigned int alignment,
@@ -163,6 +164,7 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr)
 {
 	module_free(NULL, hdr);
 }
+#endif /* CONFIG_BPF_JIT */
 
 /* Base function for offset calculation. Needs to go into .text section,
  * therefore keeping it non-static as well; will also be used by JITs
-- 
cgit v1.2.3


From 99c55f7d47c0dc6fc64729f37bf435abf43f4c60 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Fri, 26 Sep 2014 00:16:57 -0700
Subject: bpf: introduce BPF syscall and maps

BPF syscall is a multiplexor for a range of different operations on eBPF.
This patch introduces syscall with single command to create a map.
Next patch adds commands to access maps.

'maps' is a generic storage of different types for sharing data between kernel
and userspace.

Userspace example:
/* this syscall wrapper creates a map with given type and attributes
 * and returns map_fd on success.
 * use close(map_fd) to delete the map
 */
int bpf_create_map(enum bpf_map_type map_type, int key_size,
                   int value_size, int max_entries)
{
    union bpf_attr attr = {
        .map_type = map_type,
        .key_size = key_size,
        .value_size = value_size,
        .max_entries = max_entries
    };

    return bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
}

'union bpf_attr' is backwards compatible with future extensions.

More details in Documentation/networking/filter.txt and in manpage

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/Makefile  |   2 +-
 kernel/bpf/syscall.c | 169 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 170 insertions(+), 1 deletion(-)
 create mode 100644 kernel/bpf/syscall.c

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 6a71145e2769..e9f7334ed07a 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1 +1 @@
-obj-y := core.o
+obj-y := core.o syscall.o
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
new file mode 100644
index 000000000000..428a0e23adc0
--- /dev/null
+++ b/kernel/bpf/syscall.c
@@ -0,0 +1,169 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bpf.h>
+#include <linux/syscalls.h>
+#include <linux/slab.h>
+#include <linux/anon_inodes.h>
+
+static LIST_HEAD(bpf_map_types);
+
+static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
+{
+	struct bpf_map_type_list *tl;
+	struct bpf_map *map;
+
+	list_for_each_entry(tl, &bpf_map_types, list_node) {
+		if (tl->type == attr->map_type) {
+			map = tl->ops->map_alloc(attr);
+			if (IS_ERR(map))
+				return map;
+			map->ops = tl->ops;
+			map->map_type = attr->map_type;
+			return map;
+		}
+	}
+	return ERR_PTR(-EINVAL);
+}
+
+/* boot time registration of different map implementations */
+void bpf_register_map_type(struct bpf_map_type_list *tl)
+{
+	list_add(&tl->list_node, &bpf_map_types);
+}
+
+/* called from workqueue */
+static void bpf_map_free_deferred(struct work_struct *work)
+{
+	struct bpf_map *map = container_of(work, struct bpf_map, work);
+
+	/* implementation dependent freeing */
+	map->ops->map_free(map);
+}
+
+/* decrement map refcnt and schedule it for freeing via workqueue
+ * (unrelying map implementation ops->map_free() might sleep)
+ */
+void bpf_map_put(struct bpf_map *map)
+{
+	if (atomic_dec_and_test(&map->refcnt)) {
+		INIT_WORK(&map->work, bpf_map_free_deferred);
+		schedule_work(&map->work);
+	}
+}
+
+static int bpf_map_release(struct inode *inode, struct file *filp)
+{
+	struct bpf_map *map = filp->private_data;
+
+	bpf_map_put(map);
+	return 0;
+}
+
+static const struct file_operations bpf_map_fops = {
+	.release = bpf_map_release,
+};
+
+/* helper macro to check that unused fields 'union bpf_attr' are zero */
+#define CHECK_ATTR(CMD) \
+	memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
+		   sizeof(attr->CMD##_LAST_FIELD), 0, \
+		   sizeof(*attr) - \
+		   offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
+		   sizeof(attr->CMD##_LAST_FIELD)) != NULL
+
+#define BPF_MAP_CREATE_LAST_FIELD max_entries
+/* called via syscall */
+static int map_create(union bpf_attr *attr)
+{
+	struct bpf_map *map;
+	int err;
+
+	err = CHECK_ATTR(BPF_MAP_CREATE);
+	if (err)
+		return -EINVAL;
+
+	/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
+	map = find_and_alloc_map(attr);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+
+	atomic_set(&map->refcnt, 1);
+
+	err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC);
+
+	if (err < 0)
+		/* failed to allocate fd */
+		goto free_map;
+
+	return err;
+
+free_map:
+	map->ops->map_free(map);
+	return err;
+}
+
+SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
+{
+	union bpf_attr attr = {};
+	int err;
+
+	/* the syscall is limited to root temporarily. This restriction will be
+	 * lifted when security audit is clean. Note that eBPF+tracing must have
+	 * this restriction, since it may pass kernel data to user space
+	 */
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (!access_ok(VERIFY_READ, uattr, 1))
+		return -EFAULT;
+
+	if (size > PAGE_SIZE)	/* silly large */
+		return -E2BIG;
+
+	/* If we're handed a bigger struct than we know of,
+	 * ensure all the unknown bits are 0 - i.e. new
+	 * user-space does not rely on any kernel feature
+	 * extensions we dont know about yet.
+	 */
+	if (size > sizeof(attr)) {
+		unsigned char __user *addr;
+		unsigned char __user *end;
+		unsigned char val;
+
+		addr = (void __user *)uattr + sizeof(attr);
+		end  = (void __user *)uattr + size;
+
+		for (; addr < end; addr++) {
+			err = get_user(val, addr);
+			if (err)
+				return err;
+			if (val)
+				return -E2BIG;
+		}
+		size = sizeof(attr);
+	}
+
+	/* copy attributes from user space, may be less than sizeof(bpf_attr) */
+	if (copy_from_user(&attr, uattr, size) != 0)
+		return -EFAULT;
+
+	switch (cmd) {
+	case BPF_MAP_CREATE:
+		err = map_create(&attr);
+		break;
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	return err;
+}
-- 
cgit v1.2.3


From db20fd2b01087bdfbe30bce314a198eefedcc42e Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Fri, 26 Sep 2014 00:16:59 -0700
Subject: bpf: add lookup/update/delete/iterate methods to BPF maps

'maps' is a generic storage of different types for sharing data between kernel
and userspace.

The maps are accessed from user space via BPF syscall, which has commands:

- create a map with given type and attributes
  fd = bpf(BPF_MAP_CREATE, union bpf_attr *attr, u32 size)
  returns fd or negative error

- lookup key in a given map referenced by fd
  err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size)
  using attr->map_fd, attr->key, attr->value
  returns zero and stores found elem into value or negative error

- create or update key/value pair in a given map
  err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size)
  using attr->map_fd, attr->key, attr->value
  returns zero or negative error

- find and delete element by key in a given map
  err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size)
  using attr->map_fd, attr->key

- iterate map elements (based on input key return next_key)
  err = bpf(BPF_MAP_GET_NEXT_KEY, union bpf_attr *attr, u32 size)
  using attr->map_fd, attr->key, attr->next_key

- close(fd) deletes the map

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/syscall.c | 235 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 235 insertions(+)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 428a0e23adc0..f94349ecaf61 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -13,6 +13,7 @@
 #include <linux/syscalls.h>
 #include <linux/slab.h>
 #include <linux/anon_inodes.h>
+#include <linux/file.h>
 
 static LIST_HEAD(bpf_map_types);
 
@@ -111,6 +112,228 @@ free_map:
 	return err;
 }
 
+/* if error is returned, fd is released.
+ * On success caller should complete fd access with matching fdput()
+ */
+struct bpf_map *bpf_map_get(struct fd f)
+{
+	struct bpf_map *map;
+
+	if (!f.file)
+		return ERR_PTR(-EBADF);
+
+	if (f.file->f_op != &bpf_map_fops) {
+		fdput(f);
+		return ERR_PTR(-EINVAL);
+	}
+
+	map = f.file->private_data;
+
+	return map;
+}
+
+/* helper to convert user pointers passed inside __aligned_u64 fields */
+static void __user *u64_to_ptr(__u64 val)
+{
+	return (void __user *) (unsigned long) val;
+}
+
+/* last field in 'union bpf_attr' used by this command */
+#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value
+
+static int map_lookup_elem(union bpf_attr *attr)
+{
+	void __user *ukey = u64_to_ptr(attr->key);
+	void __user *uvalue = u64_to_ptr(attr->value);
+	int ufd = attr->map_fd;
+	struct fd f = fdget(ufd);
+	struct bpf_map *map;
+	void *key, *value;
+	int err;
+
+	if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
+		return -EINVAL;
+
+	map = bpf_map_get(f);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+
+	err = -ENOMEM;
+	key = kmalloc(map->key_size, GFP_USER);
+	if (!key)
+		goto err_put;
+
+	err = -EFAULT;
+	if (copy_from_user(key, ukey, map->key_size) != 0)
+		goto free_key;
+
+	err = -ESRCH;
+	rcu_read_lock();
+	value = map->ops->map_lookup_elem(map, key);
+	if (!value)
+		goto err_unlock;
+
+	err = -EFAULT;
+	if (copy_to_user(uvalue, value, map->value_size) != 0)
+		goto err_unlock;
+
+	err = 0;
+
+err_unlock:
+	rcu_read_unlock();
+free_key:
+	kfree(key);
+err_put:
+	fdput(f);
+	return err;
+}
+
+#define BPF_MAP_UPDATE_ELEM_LAST_FIELD value
+
+static int map_update_elem(union bpf_attr *attr)
+{
+	void __user *ukey = u64_to_ptr(attr->key);
+	void __user *uvalue = u64_to_ptr(attr->value);
+	int ufd = attr->map_fd;
+	struct fd f = fdget(ufd);
+	struct bpf_map *map;
+	void *key, *value;
+	int err;
+
+	if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
+		return -EINVAL;
+
+	map = bpf_map_get(f);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+
+	err = -ENOMEM;
+	key = kmalloc(map->key_size, GFP_USER);
+	if (!key)
+		goto err_put;
+
+	err = -EFAULT;
+	if (copy_from_user(key, ukey, map->key_size) != 0)
+		goto free_key;
+
+	err = -ENOMEM;
+	value = kmalloc(map->value_size, GFP_USER);
+	if (!value)
+		goto free_key;
+
+	err = -EFAULT;
+	if (copy_from_user(value, uvalue, map->value_size) != 0)
+		goto free_value;
+
+	/* eBPF program that use maps are running under rcu_read_lock(),
+	 * therefore all map accessors rely on this fact, so do the same here
+	 */
+	rcu_read_lock();
+	err = map->ops->map_update_elem(map, key, value);
+	rcu_read_unlock();
+
+free_value:
+	kfree(value);
+free_key:
+	kfree(key);
+err_put:
+	fdput(f);
+	return err;
+}
+
+#define BPF_MAP_DELETE_ELEM_LAST_FIELD key
+
+static int map_delete_elem(union bpf_attr *attr)
+{
+	void __user *ukey = u64_to_ptr(attr->key);
+	int ufd = attr->map_fd;
+	struct fd f = fdget(ufd);
+	struct bpf_map *map;
+	void *key;
+	int err;
+
+	if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
+		return -EINVAL;
+
+	map = bpf_map_get(f);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+
+	err = -ENOMEM;
+	key = kmalloc(map->key_size, GFP_USER);
+	if (!key)
+		goto err_put;
+
+	err = -EFAULT;
+	if (copy_from_user(key, ukey, map->key_size) != 0)
+		goto free_key;
+
+	rcu_read_lock();
+	err = map->ops->map_delete_elem(map, key);
+	rcu_read_unlock();
+
+free_key:
+	kfree(key);
+err_put:
+	fdput(f);
+	return err;
+}
+
+/* last field in 'union bpf_attr' used by this command */
+#define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key
+
+static int map_get_next_key(union bpf_attr *attr)
+{
+	void __user *ukey = u64_to_ptr(attr->key);
+	void __user *unext_key = u64_to_ptr(attr->next_key);
+	int ufd = attr->map_fd;
+	struct fd f = fdget(ufd);
+	struct bpf_map *map;
+	void *key, *next_key;
+	int err;
+
+	if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
+		return -EINVAL;
+
+	map = bpf_map_get(f);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+
+	err = -ENOMEM;
+	key = kmalloc(map->key_size, GFP_USER);
+	if (!key)
+		goto err_put;
+
+	err = -EFAULT;
+	if (copy_from_user(key, ukey, map->key_size) != 0)
+		goto free_key;
+
+	err = -ENOMEM;
+	next_key = kmalloc(map->key_size, GFP_USER);
+	if (!next_key)
+		goto free_key;
+
+	rcu_read_lock();
+	err = map->ops->map_get_next_key(map, key, next_key);
+	rcu_read_unlock();
+	if (err)
+		goto free_next_key;
+
+	err = -EFAULT;
+	if (copy_to_user(unext_key, next_key, map->key_size) != 0)
+		goto free_next_key;
+
+	err = 0;
+
+free_next_key:
+	kfree(next_key);
+free_key:
+	kfree(key);
+err_put:
+	fdput(f);
+	return err;
+}
+
 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
 {
 	union bpf_attr attr = {};
@@ -160,6 +383,18 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_MAP_CREATE:
 		err = map_create(&attr);
 		break;
+	case BPF_MAP_LOOKUP_ELEM:
+		err = map_lookup_elem(&attr);
+		break;
+	case BPF_MAP_UPDATE_ELEM:
+		err = map_update_elem(&attr);
+		break;
+	case BPF_MAP_DELETE_ELEM:
+		err = map_delete_elem(&attr);
+		break;
+	case BPF_MAP_GET_NEXT_KEY:
+		err = map_get_next_key(&attr);
+		break;
 	default:
 		err = -EINVAL;
 		break;
-- 
cgit v1.2.3


From 09756af46893c18839062976c3252e93a1beeba7 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Fri, 26 Sep 2014 00:17:00 -0700
Subject: bpf: expand BPF syscall with program load/unload

eBPF programs are similar to kernel modules. They are loaded by the user
process and automatically unloaded when process exits. Each eBPF program is
a safe run-to-completion set of instructions. eBPF verifier statically
determines that the program terminates and is safe to execute.

The following syscall wrapper can be used to load the program:
int bpf_prog_load(enum bpf_prog_type prog_type,
                  const struct bpf_insn *insns, int insn_cnt,
                  const char *license)
{
    union bpf_attr attr = {
        .prog_type = prog_type,
        .insns = ptr_to_u64(insns),
        .insn_cnt = insn_cnt,
        .license = ptr_to_u64(license),
    };

    return bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
}
where 'insns' is an array of eBPF instructions and 'license' is a string
that must be GPL compatible to call helper functions marked gpl_only

Upon succesful load the syscall returns prog_fd.
Use close(prog_fd) to unload the program.

User space tests and examples follow in the later patches

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/core.c    |  29 ++++-----
 kernel/bpf/syscall.c | 165 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 180 insertions(+), 14 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 8b7002488251..f0c30c59b317 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -27,6 +27,7 @@
 #include <linux/random.h>
 #include <linux/moduleloader.h>
 #include <asm/unaligned.h>
+#include <linux/bpf.h>
 
 /* Registers */
 #define BPF_R0	regs[BPF_REG_0]
@@ -71,7 +72,7 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
 {
 	gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO |
 			  gfp_extra_flags;
-	struct bpf_work_struct *ws;
+	struct bpf_prog_aux *aux;
 	struct bpf_prog *fp;
 
 	size = round_up(size, PAGE_SIZE);
@@ -79,14 +80,14 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
 	if (fp == NULL)
 		return NULL;
 
-	ws = kmalloc(sizeof(*ws), GFP_KERNEL | gfp_extra_flags);
-	if (ws == NULL) {
+	aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags);
+	if (aux == NULL) {
 		vfree(fp);
 		return NULL;
 	}
 
 	fp->pages = size / PAGE_SIZE;
-	fp->work = ws;
+	fp->aux = aux;
 
 	return fp;
 }
@@ -110,10 +111,10 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
 		memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE);
 		fp->pages = size / PAGE_SIZE;
 
-		/* We keep fp->work from fp_old around in the new
+		/* We keep fp->aux from fp_old around in the new
 		 * reallocated structure.
 		 */
-		fp_old->work = NULL;
+		fp_old->aux = NULL;
 		__bpf_prog_free(fp_old);
 	}
 
@@ -123,7 +124,7 @@ EXPORT_SYMBOL_GPL(bpf_prog_realloc);
 
 void __bpf_prog_free(struct bpf_prog *fp)
 {
-	kfree(fp->work);
+	kfree(fp->aux);
 	vfree(fp);
 }
 EXPORT_SYMBOL_GPL(__bpf_prog_free);
@@ -638,19 +639,19 @@ EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
 
 static void bpf_prog_free_deferred(struct work_struct *work)
 {
-	struct bpf_work_struct *ws;
+	struct bpf_prog_aux *aux;
 
-	ws = container_of(work, struct bpf_work_struct, work);
-	bpf_jit_free(ws->prog);
+	aux = container_of(work, struct bpf_prog_aux, work);
+	bpf_jit_free(aux->prog);
 }
 
 /* Free internal BPF program */
 void bpf_prog_free(struct bpf_prog *fp)
 {
-	struct bpf_work_struct *ws = fp->work;
+	struct bpf_prog_aux *aux = fp->aux;
 
-	INIT_WORK(&ws->work, bpf_prog_free_deferred);
-	ws->prog = fp;
-	schedule_work(&ws->work);
+	INIT_WORK(&aux->work, bpf_prog_free_deferred);
+	aux->prog = fp;
+	schedule_work(&aux->work);
 }
 EXPORT_SYMBOL_GPL(bpf_prog_free);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index f94349ecaf61..0afb4eaa1887 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -14,6 +14,8 @@
 #include <linux/slab.h>
 #include <linux/anon_inodes.h>
 #include <linux/file.h>
+#include <linux/license.h>
+#include <linux/filter.h>
 
 static LIST_HEAD(bpf_map_types);
 
@@ -334,6 +336,166 @@ err_put:
 	return err;
 }
 
+static LIST_HEAD(bpf_prog_types);
+
+static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
+{
+	struct bpf_prog_type_list *tl;
+
+	list_for_each_entry(tl, &bpf_prog_types, list_node) {
+		if (tl->type == type) {
+			prog->aux->ops = tl->ops;
+			prog->aux->prog_type = type;
+			return 0;
+		}
+	}
+	return -EINVAL;
+}
+
+void bpf_register_prog_type(struct bpf_prog_type_list *tl)
+{
+	list_add(&tl->list_node, &bpf_prog_types);
+}
+
+/* drop refcnt on maps used by eBPF program and free auxilary data */
+static void free_used_maps(struct bpf_prog_aux *aux)
+{
+	int i;
+
+	for (i = 0; i < aux->used_map_cnt; i++)
+		bpf_map_put(aux->used_maps[i]);
+
+	kfree(aux->used_maps);
+}
+
+void bpf_prog_put(struct bpf_prog *prog)
+{
+	if (atomic_dec_and_test(&prog->aux->refcnt)) {
+		free_used_maps(prog->aux);
+		bpf_prog_free(prog);
+	}
+}
+
+static int bpf_prog_release(struct inode *inode, struct file *filp)
+{
+	struct bpf_prog *prog = filp->private_data;
+
+	bpf_prog_put(prog);
+	return 0;
+}
+
+static const struct file_operations bpf_prog_fops = {
+        .release = bpf_prog_release,
+};
+
+static struct bpf_prog *get_prog(struct fd f)
+{
+	struct bpf_prog *prog;
+
+	if (!f.file)
+		return ERR_PTR(-EBADF);
+
+	if (f.file->f_op != &bpf_prog_fops) {
+		fdput(f);
+		return ERR_PTR(-EINVAL);
+	}
+
+	prog = f.file->private_data;
+
+	return prog;
+}
+
+/* called by sockets/tracing/seccomp before attaching program to an event
+ * pairs with bpf_prog_put()
+ */
+struct bpf_prog *bpf_prog_get(u32 ufd)
+{
+	struct fd f = fdget(ufd);
+	struct bpf_prog *prog;
+
+	prog = get_prog(f);
+
+	if (IS_ERR(prog))
+		return prog;
+
+	atomic_inc(&prog->aux->refcnt);
+	fdput(f);
+	return prog;
+}
+
+/* last field in 'union bpf_attr' used by this command */
+#define	BPF_PROG_LOAD_LAST_FIELD license
+
+static int bpf_prog_load(union bpf_attr *attr)
+{
+	enum bpf_prog_type type = attr->prog_type;
+	struct bpf_prog *prog;
+	int err;
+	char license[128];
+	bool is_gpl;
+
+	if (CHECK_ATTR(BPF_PROG_LOAD))
+		return -EINVAL;
+
+	/* copy eBPF program license from user space */
+	if (strncpy_from_user(license, u64_to_ptr(attr->license),
+			      sizeof(license) - 1) < 0)
+		return -EFAULT;
+	license[sizeof(license) - 1] = 0;
+
+	/* eBPF programs must be GPL compatible to use GPL-ed functions */
+	is_gpl = license_is_gpl_compatible(license);
+
+	if (attr->insn_cnt >= BPF_MAXINSNS)
+		return -EINVAL;
+
+	/* plain bpf_prog allocation */
+	prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
+	if (!prog)
+		return -ENOMEM;
+
+	prog->len = attr->insn_cnt;
+
+	err = -EFAULT;
+	if (copy_from_user(prog->insns, u64_to_ptr(attr->insns),
+			   prog->len * sizeof(struct bpf_insn)) != 0)
+		goto free_prog;
+
+	prog->orig_prog = NULL;
+	prog->jited = false;
+
+	atomic_set(&prog->aux->refcnt, 1);
+	prog->aux->is_gpl_compatible = is_gpl;
+
+	/* find program type: socket_filter vs tracing_filter */
+	err = find_prog_type(type, prog);
+	if (err < 0)
+		goto free_prog;
+
+	/* run eBPF verifier */
+	/* err = bpf_check(prog, tb); */
+
+	if (err < 0)
+		goto free_used_maps;
+
+	/* eBPF program is ready to be JITed */
+	bpf_prog_select_runtime(prog);
+
+	err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC);
+
+	if (err < 0)
+		/* failed to allocate fd */
+		goto free_used_maps;
+
+	return err;
+
+free_used_maps:
+	free_used_maps(prog->aux);
+free_prog:
+	bpf_prog_free(prog);
+	return err;
+}
+
 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
 {
 	union bpf_attr attr = {};
@@ -395,6 +557,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_MAP_GET_NEXT_KEY:
 		err = map_get_next_key(&attr);
 		break;
+	case BPF_PROG_LOAD:
+		err = bpf_prog_load(&attr);
+		break;
 	default:
 		err = -EINVAL;
 		break;
-- 
cgit v1.2.3


From 0a542a86d73b1577e7d4f55fc95dcffd3fe62643 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Fri, 26 Sep 2014 00:17:01 -0700
Subject: bpf: handle pseudo BPF_CALL insn

in native eBPF programs userspace is using pseudo BPF_CALL instructions
which encode one of 'enum bpf_func_id' inside insn->imm field.
Verifier checks that program using correct function arguments to given func_id.
If all checks passed, kernel needs to fixup BPF_CALL->imm fields by
replacing func_id with in-kernel function pointer.
eBPF interpreter just calls the function.

In-kernel eBPF users continue to use generic BPF_CALL.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/syscall.c | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0afb4eaa1887..b513659d120f 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -357,6 +357,40 @@ void bpf_register_prog_type(struct bpf_prog_type_list *tl)
 	list_add(&tl->list_node, &bpf_prog_types);
 }
 
+/* fixup insn->imm field of bpf_call instructions:
+ * if (insn->imm == BPF_FUNC_map_lookup_elem)
+ *      insn->imm = bpf_map_lookup_elem - __bpf_call_base;
+ * else if (insn->imm == BPF_FUNC_map_update_elem)
+ *      insn->imm = bpf_map_update_elem - __bpf_call_base;
+ * else ...
+ *
+ * this function is called after eBPF program passed verification
+ */
+static void fixup_bpf_calls(struct bpf_prog *prog)
+{
+	const struct bpf_func_proto *fn;
+	int i;
+
+	for (i = 0; i < prog->len; i++) {
+		struct bpf_insn *insn = &prog->insnsi[i];
+
+		if (insn->code == (BPF_JMP | BPF_CALL)) {
+			/* we reach here when program has bpf_call instructions
+			 * and it passed bpf_check(), means that
+			 * ops->get_func_proto must have been supplied, check it
+			 */
+			BUG_ON(!prog->aux->ops->get_func_proto);
+
+			fn = prog->aux->ops->get_func_proto(insn->imm);
+			/* all functions that have prototype and verifier allowed
+			 * programs to call them, must be real in-kernel functions
+			 */
+			BUG_ON(!fn->func);
+			insn->imm = fn->func - __bpf_call_base;
+		}
+	}
+}
+
 /* drop refcnt on maps used by eBPF program and free auxilary data */
 static void free_used_maps(struct bpf_prog_aux *aux)
 {
@@ -478,6 +512,9 @@ static int bpf_prog_load(union bpf_attr *attr)
 	if (err < 0)
 		goto free_used_maps;
 
+	/* fixup BPF_CALL->imm field */
+	fixup_bpf_calls(prog);
+
 	/* eBPF program is ready to be JITed */
 	bpf_prog_select_runtime(prog);
 
-- 
cgit v1.2.3


From 51580e798cb61b0fc63fa3aa6c5c975375aa0550 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Fri, 26 Sep 2014 00:17:02 -0700
Subject: bpf: verifier (add docs)

this patch adds all of eBPF verfier documentation and empty bpf_check()

The end goal for the verifier is to statically check safety of the program.

Verifier will catch:
- loops
- out of range jumps
- unreachable instructions
- invalid instructions
- uninitialized register access
- uninitialized stack access
- misaligned stack access
- out of range stack access
- invalid calling convention

More details in Documentation/networking/filter.txt

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/Makefile   |   2 +-
 kernel/bpf/syscall.c  |   2 +-
 kernel/bpf/verifier.c | 133 ++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 135 insertions(+), 2 deletions(-)
 create mode 100644 kernel/bpf/verifier.c

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index e9f7334ed07a..3c726b0995b7 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1 +1 @@
-obj-y := core.o syscall.o
+obj-y := core.o syscall.o verifier.o
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b513659d120f..74b3628c5fdb 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -507,7 +507,7 @@ static int bpf_prog_load(union bpf_attr *attr)
 		goto free_prog;
 
 	/* run eBPF verifier */
-	/* err = bpf_check(prog, tb); */
+	err = bpf_check(prog, attr);
 
 	if (err < 0)
 		goto free_used_maps;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
new file mode 100644
index 000000000000..d6f9c3d6b4d7
--- /dev/null
+++ b/kernel/bpf/verifier.c
@@ -0,0 +1,133 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <net/netlink.h>
+#include <linux/file.h>
+#include <linux/vmalloc.h>
+
+/* bpf_check() is a static code analyzer that walks eBPF program
+ * instruction by instruction and updates register/stack state.
+ * All paths of conditional branches are analyzed until 'bpf_exit' insn.
+ *
+ * The first pass is depth-first-search to check that the program is a DAG.
+ * It rejects the following programs:
+ * - larger than BPF_MAXINSNS insns
+ * - if loop is present (detected via back-edge)
+ * - unreachable insns exist (shouldn't be a forest. program = one function)
+ * - out of bounds or malformed jumps
+ * The second pass is all possible path descent from the 1st insn.
+ * Since it's analyzing all pathes through the program, the length of the
+ * analysis is limited to 32k insn, which may be hit even if total number of
+ * insn is less then 4K, but there are too many branches that change stack/regs.
+ * Number of 'branches to be analyzed' is limited to 1k
+ *
+ * On entry to each instruction, each register has a type, and the instruction
+ * changes the types of the registers depending on instruction semantics.
+ * If instruction is BPF_MOV64_REG(BPF_REG_1, BPF_REG_5), then type of R5 is
+ * copied to R1.
+ *
+ * All registers are 64-bit.
+ * R0 - return register
+ * R1-R5 argument passing registers
+ * R6-R9 callee saved registers
+ * R10 - frame pointer read-only
+ *
+ * At the start of BPF program the register R1 contains a pointer to bpf_context
+ * and has type PTR_TO_CTX.
+ *
+ * Verifier tracks arithmetic operations on pointers in case:
+ *    BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ *    BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -20),
+ * 1st insn copies R10 (which has FRAME_PTR) type into R1
+ * and 2nd arithmetic instruction is pattern matched to recognize
+ * that it wants to construct a pointer to some element within stack.
+ * So after 2nd insn, the register R1 has type PTR_TO_STACK
+ * (and -20 constant is saved for further stack bounds checking).
+ * Meaning that this reg is a pointer to stack plus known immediate constant.
+ *
+ * Most of the time the registers have UNKNOWN_VALUE type, which
+ * means the register has some value, but it's not a valid pointer.
+ * (like pointer plus pointer becomes UNKNOWN_VALUE type)
+ *
+ * When verifier sees load or store instructions the type of base register
+ * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, FRAME_PTR. These are three pointer
+ * types recognized by check_mem_access() function.
+ *
+ * PTR_TO_MAP_VALUE means that this register is pointing to 'map element value'
+ * and the range of [ptr, ptr + map's value_size) is accessible.
+ *
+ * registers used to pass values to function calls are checked against
+ * function argument constraints.
+ *
+ * ARG_PTR_TO_MAP_KEY is one of such argument constraints.
+ * It means that the register type passed to this function must be
+ * PTR_TO_STACK and it will be used inside the function as
+ * 'pointer to map element key'
+ *
+ * For example the argument constraints for bpf_map_lookup_elem():
+ *   .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ *   .arg1_type = ARG_CONST_MAP_PTR,
+ *   .arg2_type = ARG_PTR_TO_MAP_KEY,
+ *
+ * ret_type says that this function returns 'pointer to map elem value or null'
+ * function expects 1st argument to be a const pointer to 'struct bpf_map' and
+ * 2nd argument should be a pointer to stack, which will be used inside
+ * the helper function as a pointer to map element key.
+ *
+ * On the kernel side the helper function looks like:
+ * u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+ * {
+ *    struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+ *    void *key = (void *) (unsigned long) r2;
+ *    void *value;
+ *
+ *    here kernel can access 'key' and 'map' pointers safely, knowing that
+ *    [key, key + map->key_size) bytes are valid and were initialized on
+ *    the stack of eBPF program.
+ * }
+ *
+ * Corresponding eBPF program may look like:
+ *    BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),  // after this insn R2 type is FRAME_PTR
+ *    BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), // after this insn R2 type is PTR_TO_STACK
+ *    BPF_LD_MAP_FD(BPF_REG_1, map_fd),      // after this insn R1 type is CONST_PTR_TO_MAP
+ *    BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ * here verifier looks at prototype of map_lookup_elem() and sees:
+ * .arg1_type == ARG_CONST_MAP_PTR and R1->type == CONST_PTR_TO_MAP, which is ok,
+ * Now verifier knows that this map has key of R1->map_ptr->key_size bytes
+ *
+ * Then .arg2_type == ARG_PTR_TO_MAP_KEY and R2->type == PTR_TO_STACK, ok so far,
+ * Now verifier checks that [R2, R2 + map's key_size) are within stack limits
+ * and were initialized prior to this call.
+ * If it's ok, then verifier allows this BPF_CALL insn and looks at
+ * .ret_type which is RET_PTR_TO_MAP_VALUE_OR_NULL, so it sets
+ * R0->type = PTR_TO_MAP_VALUE_OR_NULL which means bpf_map_lookup_elem() function
+ * returns ether pointer to map value or NULL.
+ *
+ * When type PTR_TO_MAP_VALUE_OR_NULL passes through 'if (reg != 0) goto +off'
+ * insn, the register holding that pointer in the true branch changes state to
+ * PTR_TO_MAP_VALUE and the same register changes state to CONST_IMM in the false
+ * branch. See check_cond_jmp_op().
+ *
+ * After the call R0 is set to return type of the function and registers R1-R5
+ * are set to NOT_INIT to indicate that they are no longer readable.
+ */
+
+int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
+{
+	int ret = -EINVAL;
+
+	return ret;
+}
-- 
cgit v1.2.3


From cbd357008604925355ae7b54a09137dabb81b580 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Fri, 26 Sep 2014 00:17:03 -0700
Subject: bpf: verifier (add ability to receive verification log)

add optional attributes for BPF_PROG_LOAD syscall:
union bpf_attr {
    struct {
	...
	__u32         log_level; /* verbosity level of eBPF verifier */
	__u32         log_size;  /* size of user buffer */
	__aligned_u64 log_buf;   /* user supplied 'char *buffer' */
    };
};

when log_level > 0 the verifier will return its verification log in the user
supplied buffer 'log_buf' which can be used by program author to analyze why
verifier rejected given program.

'Understanding eBPF verifier messages' section of Documentation/networking/filter.txt
provides several examples of these messages, like the program:

  BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
  BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
  BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
  BPF_LD_MAP_FD(BPF_REG_1, 0),
  BPF_CALL_FUNC(BPF_FUNC_map_lookup_elem),
  BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
  BPF_ST_MEM(BPF_DW, BPF_REG_0, 4, 0),
  BPF_EXIT_INSN(),

will be rejected with the following multi-line message in log_buf:

  0: (7a) *(u64 *)(r10 -8) = 0
  1: (bf) r2 = r10
  2: (07) r2 += -8
  3: (b7) r1 = 0
  4: (85) call 1
  5: (15) if r0 == 0x0 goto pc+1
   R0=map_ptr R10=fp
  6: (7a) *(u64 *)(r0 +4) = 0
  misaligned access off 4 size 8

The format of the output can change at any time as verifier evolves.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/syscall.c  |   2 +-
 kernel/bpf/verifier.c | 235 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 236 insertions(+), 1 deletion(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 74b3628c5fdb..ba61c8c16032 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -458,7 +458,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd)
 }
 
 /* last field in 'union bpf_attr' used by this command */
-#define	BPF_PROG_LOAD_LAST_FIELD license
+#define	BPF_PROG_LOAD_LAST_FIELD log_buf
 
 static int bpf_prog_load(union bpf_attr *attr)
 {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d6f9c3d6b4d7..871edc1f2e1f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -125,9 +125,244 @@
  * are set to NOT_INIT to indicate that they are no longer readable.
  */
 
+/* single container for all structs
+ * one verifier_env per bpf_check() call
+ */
+struct verifier_env {
+};
+
+/* verbose verifier prints what it's seeing
+ * bpf_check() is called under lock, so no race to access these global vars
+ */
+static u32 log_level, log_size, log_len;
+static char *log_buf;
+
+static DEFINE_MUTEX(bpf_verifier_lock);
+
+/* log_level controls verbosity level of eBPF verifier.
+ * verbose() is used to dump the verification trace to the log, so the user
+ * can figure out what's wrong with the program
+ */
+static void verbose(const char *fmt, ...)
+{
+	va_list args;
+
+	if (log_level == 0 || log_len >= log_size - 1)
+		return;
+
+	va_start(args, fmt);
+	log_len += vscnprintf(log_buf + log_len, log_size - log_len, fmt, args);
+	va_end(args);
+}
+
+static const char *const bpf_class_string[] = {
+	[BPF_LD]    = "ld",
+	[BPF_LDX]   = "ldx",
+	[BPF_ST]    = "st",
+	[BPF_STX]   = "stx",
+	[BPF_ALU]   = "alu",
+	[BPF_JMP]   = "jmp",
+	[BPF_RET]   = "BUG",
+	[BPF_ALU64] = "alu64",
+};
+
+static const char *const bpf_alu_string[] = {
+	[BPF_ADD >> 4]  = "+=",
+	[BPF_SUB >> 4]  = "-=",
+	[BPF_MUL >> 4]  = "*=",
+	[BPF_DIV >> 4]  = "/=",
+	[BPF_OR  >> 4]  = "|=",
+	[BPF_AND >> 4]  = "&=",
+	[BPF_LSH >> 4]  = "<<=",
+	[BPF_RSH >> 4]  = ">>=",
+	[BPF_NEG >> 4]  = "neg",
+	[BPF_MOD >> 4]  = "%=",
+	[BPF_XOR >> 4]  = "^=",
+	[BPF_MOV >> 4]  = "=",
+	[BPF_ARSH >> 4] = "s>>=",
+	[BPF_END >> 4]  = "endian",
+};
+
+static const char *const bpf_ldst_string[] = {
+	[BPF_W >> 3]  = "u32",
+	[BPF_H >> 3]  = "u16",
+	[BPF_B >> 3]  = "u8",
+	[BPF_DW >> 3] = "u64",
+};
+
+static const char *const bpf_jmp_string[] = {
+	[BPF_JA >> 4]   = "jmp",
+	[BPF_JEQ >> 4]  = "==",
+	[BPF_JGT >> 4]  = ">",
+	[BPF_JGE >> 4]  = ">=",
+	[BPF_JSET >> 4] = "&",
+	[BPF_JNE >> 4]  = "!=",
+	[BPF_JSGT >> 4] = "s>",
+	[BPF_JSGE >> 4] = "s>=",
+	[BPF_CALL >> 4] = "call",
+	[BPF_EXIT >> 4] = "exit",
+};
+
+static void print_bpf_insn(struct bpf_insn *insn)
+{
+	u8 class = BPF_CLASS(insn->code);
+
+	if (class == BPF_ALU || class == BPF_ALU64) {
+		if (BPF_SRC(insn->code) == BPF_X)
+			verbose("(%02x) %sr%d %s %sr%d\n",
+				insn->code, class == BPF_ALU ? "(u32) " : "",
+				insn->dst_reg,
+				bpf_alu_string[BPF_OP(insn->code) >> 4],
+				class == BPF_ALU ? "(u32) " : "",
+				insn->src_reg);
+		else
+			verbose("(%02x) %sr%d %s %s%d\n",
+				insn->code, class == BPF_ALU ? "(u32) " : "",
+				insn->dst_reg,
+				bpf_alu_string[BPF_OP(insn->code) >> 4],
+				class == BPF_ALU ? "(u32) " : "",
+				insn->imm);
+	} else if (class == BPF_STX) {
+		if (BPF_MODE(insn->code) == BPF_MEM)
+			verbose("(%02x) *(%s *)(r%d %+d) = r%d\n",
+				insn->code,
+				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+				insn->dst_reg,
+				insn->off, insn->src_reg);
+		else if (BPF_MODE(insn->code) == BPF_XADD)
+			verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n",
+				insn->code,
+				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+				insn->dst_reg, insn->off,
+				insn->src_reg);
+		else
+			verbose("BUG_%02x\n", insn->code);
+	} else if (class == BPF_ST) {
+		if (BPF_MODE(insn->code) != BPF_MEM) {
+			verbose("BUG_st_%02x\n", insn->code);
+			return;
+		}
+		verbose("(%02x) *(%s *)(r%d %+d) = %d\n",
+			insn->code,
+			bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+			insn->dst_reg,
+			insn->off, insn->imm);
+	} else if (class == BPF_LDX) {
+		if (BPF_MODE(insn->code) != BPF_MEM) {
+			verbose("BUG_ldx_%02x\n", insn->code);
+			return;
+		}
+		verbose("(%02x) r%d = *(%s *)(r%d %+d)\n",
+			insn->code, insn->dst_reg,
+			bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+			insn->src_reg, insn->off);
+	} else if (class == BPF_LD) {
+		if (BPF_MODE(insn->code) == BPF_ABS) {
+			verbose("(%02x) r0 = *(%s *)skb[%d]\n",
+				insn->code,
+				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+				insn->imm);
+		} else if (BPF_MODE(insn->code) == BPF_IND) {
+			verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n",
+				insn->code,
+				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+				insn->src_reg, insn->imm);
+		} else if (BPF_MODE(insn->code) == BPF_IMM) {
+			verbose("(%02x) r%d = 0x%x\n",
+				insn->code, insn->dst_reg, insn->imm);
+		} else {
+			verbose("BUG_ld_%02x\n", insn->code);
+			return;
+		}
+	} else if (class == BPF_JMP) {
+		u8 opcode = BPF_OP(insn->code);
+
+		if (opcode == BPF_CALL) {
+			verbose("(%02x) call %d\n", insn->code, insn->imm);
+		} else if (insn->code == (BPF_JMP | BPF_JA)) {
+			verbose("(%02x) goto pc%+d\n",
+				insn->code, insn->off);
+		} else if (insn->code == (BPF_JMP | BPF_EXIT)) {
+			verbose("(%02x) exit\n", insn->code);
+		} else if (BPF_SRC(insn->code) == BPF_X) {
+			verbose("(%02x) if r%d %s r%d goto pc%+d\n",
+				insn->code, insn->dst_reg,
+				bpf_jmp_string[BPF_OP(insn->code) >> 4],
+				insn->src_reg, insn->off);
+		} else {
+			verbose("(%02x) if r%d %s 0x%x goto pc%+d\n",
+				insn->code, insn->dst_reg,
+				bpf_jmp_string[BPF_OP(insn->code) >> 4],
+				insn->imm, insn->off);
+		}
+	} else {
+		verbose("(%02x) %s\n", insn->code, bpf_class_string[class]);
+	}
+}
+
 int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
 {
+	char __user *log_ubuf = NULL;
+	struct verifier_env *env;
 	int ret = -EINVAL;
 
+	if (prog->len <= 0 || prog->len > BPF_MAXINSNS)
+		return -E2BIG;
+
+	/* 'struct verifier_env' can be global, but since it's not small,
+	 * allocate/free it every time bpf_check() is called
+	 */
+	env = kzalloc(sizeof(struct verifier_env), GFP_KERNEL);
+	if (!env)
+		return -ENOMEM;
+
+	/* grab the mutex to protect few globals used by verifier */
+	mutex_lock(&bpf_verifier_lock);
+
+	if (attr->log_level || attr->log_buf || attr->log_size) {
+		/* user requested verbose verifier output
+		 * and supplied buffer to store the verification trace
+		 */
+		log_level = attr->log_level;
+		log_ubuf = (char __user *) (unsigned long) attr->log_buf;
+		log_size = attr->log_size;
+		log_len = 0;
+
+		ret = -EINVAL;
+		/* log_* values have to be sane */
+		if (log_size < 128 || log_size > UINT_MAX >> 8 ||
+		    log_level == 0 || log_ubuf == NULL)
+			goto free_env;
+
+		ret = -ENOMEM;
+		log_buf = vmalloc(log_size);
+		if (!log_buf)
+			goto free_env;
+	} else {
+		log_level = 0;
+	}
+
+	/* ret = do_check(env); */
+
+	if (log_level && log_len >= log_size - 1) {
+		BUG_ON(log_len >= log_size);
+		/* verifier log exceeded user supplied buffer */
+		ret = -ENOSPC;
+		/* fall through to return what was recorded */
+	}
+
+	/* copy verifier log back to user space including trailing zero */
+	if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) {
+		ret = -EFAULT;
+		goto free_log_buf;
+	}
+
+
+free_log_buf:
+	if (log_level)
+		vfree(log_buf);
+free_env:
+	kfree(env);
+	mutex_unlock(&bpf_verifier_lock);
 	return ret;
 }
-- 
cgit v1.2.3


From 0246e64d9a5fcd4805198de59b9b5cf1f974eb41 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Fri, 26 Sep 2014 00:17:04 -0700
Subject: bpf: handle pseudo BPF_LD_IMM64 insn

eBPF programs passed from userspace are using pseudo BPF_LD_IMM64 instructions
to refer to process-local map_fd. Scan the program for such instructions and
if FDs are valid, convert them to 'struct bpf_map' pointers which will be used
by verifier to check access to maps in bpf_map_lookup/update() calls.
If program passes verifier, convert pseudo BPF_LD_IMM64 into generic by dropping
BPF_PSEUDO_MAP_FD flag.

Note that eBPF interpreter is generic and knows nothing about pseudo insns.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 147 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 147 insertions(+)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 871edc1f2e1f..7227543e474b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -125,10 +125,15 @@
  * are set to NOT_INIT to indicate that they are no longer readable.
  */
 
+#define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
+
 /* single container for all structs
  * one verifier_env per bpf_check() call
  */
 struct verifier_env {
+	struct bpf_prog *prog;		/* eBPF program being verified */
+	struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
+	u32 used_map_cnt;		/* number of used maps */
 };
 
 /* verbose verifier prints what it's seeing
@@ -300,6 +305,115 @@ static void print_bpf_insn(struct bpf_insn *insn)
 	}
 }
 
+/* return the map pointer stored inside BPF_LD_IMM64 instruction */
+static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn)
+{
+	u64 imm64 = ((u64) (u32) insn[0].imm) | ((u64) (u32) insn[1].imm) << 32;
+
+	return (struct bpf_map *) (unsigned long) imm64;
+}
+
+/* look for pseudo eBPF instructions that access map FDs and
+ * replace them with actual map pointers
+ */
+static int replace_map_fd_with_map_ptr(struct verifier_env *env)
+{
+	struct bpf_insn *insn = env->prog->insnsi;
+	int insn_cnt = env->prog->len;
+	int i, j;
+
+	for (i = 0; i < insn_cnt; i++, insn++) {
+		if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) {
+			struct bpf_map *map;
+			struct fd f;
+
+			if (i == insn_cnt - 1 || insn[1].code != 0 ||
+			    insn[1].dst_reg != 0 || insn[1].src_reg != 0 ||
+			    insn[1].off != 0) {
+				verbose("invalid bpf_ld_imm64 insn\n");
+				return -EINVAL;
+			}
+
+			if (insn->src_reg == 0)
+				/* valid generic load 64-bit imm */
+				goto next_insn;
+
+			if (insn->src_reg != BPF_PSEUDO_MAP_FD) {
+				verbose("unrecognized bpf_ld_imm64 insn\n");
+				return -EINVAL;
+			}
+
+			f = fdget(insn->imm);
+
+			map = bpf_map_get(f);
+			if (IS_ERR(map)) {
+				verbose("fd %d is not pointing to valid bpf_map\n",
+					insn->imm);
+				fdput(f);
+				return PTR_ERR(map);
+			}
+
+			/* store map pointer inside BPF_LD_IMM64 instruction */
+			insn[0].imm = (u32) (unsigned long) map;
+			insn[1].imm = ((u64) (unsigned long) map) >> 32;
+
+			/* check whether we recorded this map already */
+			for (j = 0; j < env->used_map_cnt; j++)
+				if (env->used_maps[j] == map) {
+					fdput(f);
+					goto next_insn;
+				}
+
+			if (env->used_map_cnt >= MAX_USED_MAPS) {
+				fdput(f);
+				return -E2BIG;
+			}
+
+			/* remember this map */
+			env->used_maps[env->used_map_cnt++] = map;
+
+			/* hold the map. If the program is rejected by verifier,
+			 * the map will be released by release_maps() or it
+			 * will be used by the valid program until it's unloaded
+			 * and all maps are released in free_bpf_prog_info()
+			 */
+			atomic_inc(&map->refcnt);
+
+			fdput(f);
+next_insn:
+			insn++;
+			i++;
+		}
+	}
+
+	/* now all pseudo BPF_LD_IMM64 instructions load valid
+	 * 'struct bpf_map *' into a register instead of user map_fd.
+	 * These pointers will be used later by verifier to validate map access.
+	 */
+	return 0;
+}
+
+/* drop refcnt of maps used by the rejected program */
+static void release_maps(struct verifier_env *env)
+{
+	int i;
+
+	for (i = 0; i < env->used_map_cnt; i++)
+		bpf_map_put(env->used_maps[i]);
+}
+
+/* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */
+static void convert_pseudo_ld_imm64(struct verifier_env *env)
+{
+	struct bpf_insn *insn = env->prog->insnsi;
+	int insn_cnt = env->prog->len;
+	int i;
+
+	for (i = 0; i < insn_cnt; i++, insn++)
+		if (insn->code == (BPF_LD | BPF_IMM | BPF_DW))
+			insn->src_reg = 0;
+}
+
 int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
 {
 	char __user *log_ubuf = NULL;
@@ -316,6 +430,8 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
 	if (!env)
 		return -ENOMEM;
 
+	env->prog = prog;
+
 	/* grab the mutex to protect few globals used by verifier */
 	mutex_lock(&bpf_verifier_lock);
 
@@ -342,8 +458,14 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
 		log_level = 0;
 	}
 
+	ret = replace_map_fd_with_map_ptr(env);
+	if (ret < 0)
+		goto skip_full_check;
+
 	/* ret = do_check(env); */
 
+skip_full_check:
+
 	if (log_level && log_len >= log_size - 1) {
 		BUG_ON(log_len >= log_size);
 		/* verifier log exceeded user supplied buffer */
@@ -357,11 +479,36 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
 		goto free_log_buf;
 	}
 
+	if (ret == 0 && env->used_map_cnt) {
+		/* if program passed verifier, update used_maps in bpf_prog_info */
+		prog->aux->used_maps = kmalloc_array(env->used_map_cnt,
+						     sizeof(env->used_maps[0]),
+						     GFP_KERNEL);
+
+		if (!prog->aux->used_maps) {
+			ret = -ENOMEM;
+			goto free_log_buf;
+		}
+
+		memcpy(prog->aux->used_maps, env->used_maps,
+		       sizeof(env->used_maps[0]) * env->used_map_cnt);
+		prog->aux->used_map_cnt = env->used_map_cnt;
+
+		/* program is valid. Convert pseudo bpf_ld_imm64 into generic
+		 * bpf_ld_imm64 instructions
+		 */
+		convert_pseudo_ld_imm64(env);
+	}
 
 free_log_buf:
 	if (log_level)
 		vfree(log_buf);
 free_env:
+	if (!prog->aux->used_maps)
+		/* if we didn't copy map pointers into bpf_prog_info, release
+		 * them now. Otherwise free_bpf_prog_info() will release them.
+		 */
+		release_maps(env);
 	kfree(env);
 	mutex_unlock(&bpf_verifier_lock);
 	return ret;
-- 
cgit v1.2.3


From 475fb78fbf48592ce541627c60a7b331060e31f5 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Fri, 26 Sep 2014 00:17:05 -0700
Subject: bpf: verifier (add branch/goto checks)

check that control flow graph of eBPF program is a directed acyclic graph

check_cfg() does:
- detect loops
- detect unreachable instructions
- check that program terminates with BPF_EXIT insn
- check that all branches are within program boundary

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 189 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 189 insertions(+)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 7227543e474b..c689ab8e2713 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -313,6 +313,191 @@ static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn)
 	return (struct bpf_map *) (unsigned long) imm64;
 }
 
+/* non-recursive DFS pseudo code
+ * 1  procedure DFS-iterative(G,v):
+ * 2      label v as discovered
+ * 3      let S be a stack
+ * 4      S.push(v)
+ * 5      while S is not empty
+ * 6            t <- S.pop()
+ * 7            if t is what we're looking for:
+ * 8                return t
+ * 9            for all edges e in G.adjacentEdges(t) do
+ * 10               if edge e is already labelled
+ * 11                   continue with the next edge
+ * 12               w <- G.adjacentVertex(t,e)
+ * 13               if vertex w is not discovered and not explored
+ * 14                   label e as tree-edge
+ * 15                   label w as discovered
+ * 16                   S.push(w)
+ * 17                   continue at 5
+ * 18               else if vertex w is discovered
+ * 19                   label e as back-edge
+ * 20               else
+ * 21                   // vertex w is explored
+ * 22                   label e as forward- or cross-edge
+ * 23           label t as explored
+ * 24           S.pop()
+ *
+ * convention:
+ * 0x10 - discovered
+ * 0x11 - discovered and fall-through edge labelled
+ * 0x12 - discovered and fall-through and branch edges labelled
+ * 0x20 - explored
+ */
+
+enum {
+	DISCOVERED = 0x10,
+	EXPLORED = 0x20,
+	FALLTHROUGH = 1,
+	BRANCH = 2,
+};
+
+static int *insn_stack;	/* stack of insns to process */
+static int cur_stack;	/* current stack index */
+static int *insn_state;
+
+/* t, w, e - match pseudo-code above:
+ * t - index of current instruction
+ * w - next instruction
+ * e - edge
+ */
+static int push_insn(int t, int w, int e, struct verifier_env *env)
+{
+	if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH))
+		return 0;
+
+	if (e == BRANCH && insn_state[t] >= (DISCOVERED | BRANCH))
+		return 0;
+
+	if (w < 0 || w >= env->prog->len) {
+		verbose("jump out of range from insn %d to %d\n", t, w);
+		return -EINVAL;
+	}
+
+	if (insn_state[w] == 0) {
+		/* tree-edge */
+		insn_state[t] = DISCOVERED | e;
+		insn_state[w] = DISCOVERED;
+		if (cur_stack >= env->prog->len)
+			return -E2BIG;
+		insn_stack[cur_stack++] = w;
+		return 1;
+	} else if ((insn_state[w] & 0xF0) == DISCOVERED) {
+		verbose("back-edge from insn %d to %d\n", t, w);
+		return -EINVAL;
+	} else if (insn_state[w] == EXPLORED) {
+		/* forward- or cross-edge */
+		insn_state[t] = DISCOVERED | e;
+	} else {
+		verbose("insn state internal bug\n");
+		return -EFAULT;
+	}
+	return 0;
+}
+
+/* non-recursive depth-first-search to detect loops in BPF program
+ * loop == back-edge in directed graph
+ */
+static int check_cfg(struct verifier_env *env)
+{
+	struct bpf_insn *insns = env->prog->insnsi;
+	int insn_cnt = env->prog->len;
+	int ret = 0;
+	int i, t;
+
+	insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
+	if (!insn_state)
+		return -ENOMEM;
+
+	insn_stack = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
+	if (!insn_stack) {
+		kfree(insn_state);
+		return -ENOMEM;
+	}
+
+	insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */
+	insn_stack[0] = 0; /* 0 is the first instruction */
+	cur_stack = 1;
+
+peek_stack:
+	if (cur_stack == 0)
+		goto check_state;
+	t = insn_stack[cur_stack - 1];
+
+	if (BPF_CLASS(insns[t].code) == BPF_JMP) {
+		u8 opcode = BPF_OP(insns[t].code);
+
+		if (opcode == BPF_EXIT) {
+			goto mark_explored;
+		} else if (opcode == BPF_CALL) {
+			ret = push_insn(t, t + 1, FALLTHROUGH, env);
+			if (ret == 1)
+				goto peek_stack;
+			else if (ret < 0)
+				goto err_free;
+		} else if (opcode == BPF_JA) {
+			if (BPF_SRC(insns[t].code) != BPF_K) {
+				ret = -EINVAL;
+				goto err_free;
+			}
+			/* unconditional jump with single edge */
+			ret = push_insn(t, t + insns[t].off + 1,
+					FALLTHROUGH, env);
+			if (ret == 1)
+				goto peek_stack;
+			else if (ret < 0)
+				goto err_free;
+		} else {
+			/* conditional jump with two edges */
+			ret = push_insn(t, t + 1, FALLTHROUGH, env);
+			if (ret == 1)
+				goto peek_stack;
+			else if (ret < 0)
+				goto err_free;
+
+			ret = push_insn(t, t + insns[t].off + 1, BRANCH, env);
+			if (ret == 1)
+				goto peek_stack;
+			else if (ret < 0)
+				goto err_free;
+		}
+	} else {
+		/* all other non-branch instructions with single
+		 * fall-through edge
+		 */
+		ret = push_insn(t, t + 1, FALLTHROUGH, env);
+		if (ret == 1)
+			goto peek_stack;
+		else if (ret < 0)
+			goto err_free;
+	}
+
+mark_explored:
+	insn_state[t] = EXPLORED;
+	if (cur_stack-- <= 0) {
+		verbose("pop stack internal bug\n");
+		ret = -EFAULT;
+		goto err_free;
+	}
+	goto peek_stack;
+
+check_state:
+	for (i = 0; i < insn_cnt; i++) {
+		if (insn_state[i] != EXPLORED) {
+			verbose("unreachable insn %d\n", i);
+			ret = -EINVAL;
+			goto err_free;
+		}
+	}
+	ret = 0; /* cfg looks good */
+
+err_free:
+	kfree(insn_state);
+	kfree(insn_stack);
+	return ret;
+}
+
 /* look for pseudo eBPF instructions that access map FDs and
  * replace them with actual map pointers
  */
@@ -462,6 +647,10 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
 	if (ret < 0)
 		goto skip_full_check;
 
+	ret = check_cfg(env);
+	if (ret < 0)
+		goto skip_full_check;
+
 	/* ret = do_check(env); */
 
 skip_full_check:
-- 
cgit v1.2.3


From 17a5267067f3c372fec9ffb798d6eaba6b5e6a4c Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Fri, 26 Sep 2014 00:17:06 -0700
Subject: bpf: verifier (add verifier core)

This patch adds verifier core which simulates execution of every insn and
records the state of registers and program stack. Every branch instruction seen
during simulation is pushed into state stack. When verifier reaches BPF_EXIT,
it pops the state from the stack and continues until it reaches BPF_EXIT again.
For program:
1: bpf_mov r1, xxx
2: if (r1 == 0) goto 5
3: bpf_mov r0, 1
4: goto 6
5: bpf_mov r0, 2
6: bpf_exit
The verifier will walk insns: 1, 2, 3, 4, 6
then it will pop the state recorded at insn#2 and will continue: 5, 6

This way it walks all possible paths through the program and checks all
possible values of registers. While doing so, it checks for:
- invalid instructions
- uninitialized register access
- uninitialized stack access
- misaligned stack access
- out of range stack access
- invalid calling convention
- instruction encoding is not using reserved fields

Kernel subsystem configures the verifier with two callbacks:

- bool (*is_valid_access)(int off, int size, enum bpf_access_type type);
  that provides information to the verifer which fields of 'ctx'
  are accessible (remember 'ctx' is the first argument to eBPF program)

- const struct bpf_func_proto *(*get_func_proto)(enum bpf_func_id func_id);
  returns argument constraints of kernel helper functions that eBPF program
  may call, so that verifier can checks that R1-R5 types match the prototype

More details in Documentation/networking/filter.txt and in kernel/bpf/verifier.c

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 1075 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 1074 insertions(+), 1 deletion(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c689ab8e2713..a086dd3210a8 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -125,6 +125,70 @@
  * are set to NOT_INIT to indicate that they are no longer readable.
  */
 
+/* types of values stored in eBPF registers */
+enum bpf_reg_type {
+	NOT_INIT = 0,		 /* nothing was written into register */
+	UNKNOWN_VALUE,		 /* reg doesn't contain a valid pointer */
+	PTR_TO_CTX,		 /* reg points to bpf_context */
+	CONST_PTR_TO_MAP,	 /* reg points to struct bpf_map */
+	PTR_TO_MAP_VALUE,	 /* reg points to map element value */
+	PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */
+	FRAME_PTR,		 /* reg == frame_pointer */
+	PTR_TO_STACK,		 /* reg == frame_pointer + imm */
+	CONST_IMM,		 /* constant integer value */
+};
+
+struct reg_state {
+	enum bpf_reg_type type;
+	union {
+		/* valid when type == CONST_IMM | PTR_TO_STACK */
+		int imm;
+
+		/* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE |
+		 *   PTR_TO_MAP_VALUE_OR_NULL
+		 */
+		struct bpf_map *map_ptr;
+	};
+};
+
+enum bpf_stack_slot_type {
+	STACK_INVALID,    /* nothing was stored in this stack slot */
+	STACK_SPILL,      /* 1st byte of register spilled into stack */
+	STACK_SPILL_PART, /* other 7 bytes of register spill */
+	STACK_MISC	  /* BPF program wrote some data into this slot */
+};
+
+struct bpf_stack_slot {
+	enum bpf_stack_slot_type stype;
+	struct reg_state reg_st;
+};
+
+/* state of the program:
+ * type of all registers and stack info
+ */
+struct verifier_state {
+	struct reg_state regs[MAX_BPF_REG];
+	struct bpf_stack_slot stack[MAX_BPF_STACK];
+};
+
+/* linked list of verifier states used to prune search */
+struct verifier_state_list {
+	struct verifier_state state;
+	struct verifier_state_list *next;
+};
+
+/* verifier_state + insn_idx are pushed to stack when branch is encountered */
+struct verifier_stack_elem {
+	/* verifer state is 'st'
+	 * before processing instruction 'insn_idx'
+	 * and after processing instruction 'prev_insn_idx'
+	 */
+	struct verifier_state st;
+	int insn_idx;
+	int prev_insn_idx;
+	struct verifier_stack_elem *next;
+};
+
 #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
 
 /* single container for all structs
@@ -132,6 +196,9 @@
  */
 struct verifier_env {
 	struct bpf_prog *prog;		/* eBPF program being verified */
+	struct verifier_stack_elem *head; /* stack of verifier states to be processed */
+	int stack_size;			/* number of states to be processed */
+	struct verifier_state cur_state; /* current verifier state */
 	struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
 	u32 used_map_cnt;		/* number of used maps */
 };
@@ -160,6 +227,45 @@ static void verbose(const char *fmt, ...)
 	va_end(args);
 }
 
+/* string representation of 'enum bpf_reg_type' */
+static const char * const reg_type_str[] = {
+	[NOT_INIT]		= "?",
+	[UNKNOWN_VALUE]		= "inv",
+	[PTR_TO_CTX]		= "ctx",
+	[CONST_PTR_TO_MAP]	= "map_ptr",
+	[PTR_TO_MAP_VALUE]	= "map_value",
+	[PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null",
+	[FRAME_PTR]		= "fp",
+	[PTR_TO_STACK]		= "fp",
+	[CONST_IMM]		= "imm",
+};
+
+static void print_verifier_state(struct verifier_env *env)
+{
+	enum bpf_reg_type t;
+	int i;
+
+	for (i = 0; i < MAX_BPF_REG; i++) {
+		t = env->cur_state.regs[i].type;
+		if (t == NOT_INIT)
+			continue;
+		verbose(" R%d=%s", i, reg_type_str[t]);
+		if (t == CONST_IMM || t == PTR_TO_STACK)
+			verbose("%d", env->cur_state.regs[i].imm);
+		else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE ||
+			 t == PTR_TO_MAP_VALUE_OR_NULL)
+			verbose("(ks=%d,vs=%d)",
+				env->cur_state.regs[i].map_ptr->key_size,
+				env->cur_state.regs[i].map_ptr->value_size);
+	}
+	for (i = 0; i < MAX_BPF_STACK; i++) {
+		if (env->cur_state.stack[i].stype == STACK_SPILL)
+			verbose(" fp%d=%s", -MAX_BPF_STACK + i,
+				reg_type_str[env->cur_state.stack[i].reg_st.type]);
+	}
+	verbose("\n");
+}
+
 static const char *const bpf_class_string[] = {
 	[BPF_LD]    = "ld",
 	[BPF_LDX]   = "ldx",
@@ -305,6 +411,735 @@ static void print_bpf_insn(struct bpf_insn *insn)
 	}
 }
 
+static int pop_stack(struct verifier_env *env, int *prev_insn_idx)
+{
+	struct verifier_stack_elem *elem;
+	int insn_idx;
+
+	if (env->head == NULL)
+		return -1;
+
+	memcpy(&env->cur_state, &env->head->st, sizeof(env->cur_state));
+	insn_idx = env->head->insn_idx;
+	if (prev_insn_idx)
+		*prev_insn_idx = env->head->prev_insn_idx;
+	elem = env->head->next;
+	kfree(env->head);
+	env->head = elem;
+	env->stack_size--;
+	return insn_idx;
+}
+
+static struct verifier_state *push_stack(struct verifier_env *env, int insn_idx,
+					 int prev_insn_idx)
+{
+	struct verifier_stack_elem *elem;
+
+	elem = kmalloc(sizeof(struct verifier_stack_elem), GFP_KERNEL);
+	if (!elem)
+		goto err;
+
+	memcpy(&elem->st, &env->cur_state, sizeof(env->cur_state));
+	elem->insn_idx = insn_idx;
+	elem->prev_insn_idx = prev_insn_idx;
+	elem->next = env->head;
+	env->head = elem;
+	env->stack_size++;
+	if (env->stack_size > 1024) {
+		verbose("BPF program is too complex\n");
+		goto err;
+	}
+	return &elem->st;
+err:
+	/* pop all elements and return */
+	while (pop_stack(env, NULL) >= 0);
+	return NULL;
+}
+
+#define CALLER_SAVED_REGS 6
+static const int caller_saved[CALLER_SAVED_REGS] = {
+	BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
+};
+
+static void init_reg_state(struct reg_state *regs)
+{
+	int i;
+
+	for (i = 0; i < MAX_BPF_REG; i++) {
+		regs[i].type = NOT_INIT;
+		regs[i].imm = 0;
+		regs[i].map_ptr = NULL;
+	}
+
+	/* frame pointer */
+	regs[BPF_REG_FP].type = FRAME_PTR;
+
+	/* 1st arg to a function */
+	regs[BPF_REG_1].type = PTR_TO_CTX;
+}
+
+static void mark_reg_unknown_value(struct reg_state *regs, u32 regno)
+{
+	BUG_ON(regno >= MAX_BPF_REG);
+	regs[regno].type = UNKNOWN_VALUE;
+	regs[regno].imm = 0;
+	regs[regno].map_ptr = NULL;
+}
+
+enum reg_arg_type {
+	SRC_OP,		/* register is used as source operand */
+	DST_OP,		/* register is used as destination operand */
+	DST_OP_NO_MARK	/* same as above, check only, don't mark */
+};
+
+static int check_reg_arg(struct reg_state *regs, u32 regno,
+			 enum reg_arg_type t)
+{
+	if (regno >= MAX_BPF_REG) {
+		verbose("R%d is invalid\n", regno);
+		return -EINVAL;
+	}
+
+	if (t == SRC_OP) {
+		/* check whether register used as source operand can be read */
+		if (regs[regno].type == NOT_INIT) {
+			verbose("R%d !read_ok\n", regno);
+			return -EACCES;
+		}
+	} else {
+		/* check whether register used as dest operand can be written to */
+		if (regno == BPF_REG_FP) {
+			verbose("frame pointer is read only\n");
+			return -EACCES;
+		}
+		if (t == DST_OP)
+			mark_reg_unknown_value(regs, regno);
+	}
+	return 0;
+}
+
+static int bpf_size_to_bytes(int bpf_size)
+{
+	if (bpf_size == BPF_W)
+		return 4;
+	else if (bpf_size == BPF_H)
+		return 2;
+	else if (bpf_size == BPF_B)
+		return 1;
+	else if (bpf_size == BPF_DW)
+		return 8;
+	else
+		return -EINVAL;
+}
+
+/* check_stack_read/write functions track spill/fill of registers,
+ * stack boundary and alignment are checked in check_mem_access()
+ */
+static int check_stack_write(struct verifier_state *state, int off, int size,
+			     int value_regno)
+{
+	struct bpf_stack_slot *slot;
+	int i;
+
+	if (value_regno >= 0 &&
+	    (state->regs[value_regno].type == PTR_TO_MAP_VALUE ||
+	     state->regs[value_regno].type == PTR_TO_STACK ||
+	     state->regs[value_regno].type == PTR_TO_CTX)) {
+
+		/* register containing pointer is being spilled into stack */
+		if (size != 8) {
+			verbose("invalid size of register spill\n");
+			return -EACCES;
+		}
+
+		slot = &state->stack[MAX_BPF_STACK + off];
+		slot->stype = STACK_SPILL;
+		/* save register state */
+		slot->reg_st = state->regs[value_regno];
+		for (i = 1; i < 8; i++) {
+			slot = &state->stack[MAX_BPF_STACK + off + i];
+			slot->stype = STACK_SPILL_PART;
+			slot->reg_st.type = UNKNOWN_VALUE;
+			slot->reg_st.map_ptr = NULL;
+		}
+	} else {
+
+		/* regular write of data into stack */
+		for (i = 0; i < size; i++) {
+			slot = &state->stack[MAX_BPF_STACK + off + i];
+			slot->stype = STACK_MISC;
+			slot->reg_st.type = UNKNOWN_VALUE;
+			slot->reg_st.map_ptr = NULL;
+		}
+	}
+	return 0;
+}
+
+static int check_stack_read(struct verifier_state *state, int off, int size,
+			    int value_regno)
+{
+	int i;
+	struct bpf_stack_slot *slot;
+
+	slot = &state->stack[MAX_BPF_STACK + off];
+
+	if (slot->stype == STACK_SPILL) {
+		if (size != 8) {
+			verbose("invalid size of register spill\n");
+			return -EACCES;
+		}
+		for (i = 1; i < 8; i++) {
+			if (state->stack[MAX_BPF_STACK + off + i].stype !=
+			    STACK_SPILL_PART) {
+				verbose("corrupted spill memory\n");
+				return -EACCES;
+			}
+		}
+
+		if (value_regno >= 0)
+			/* restore register state from stack */
+			state->regs[value_regno] = slot->reg_st;
+		return 0;
+	} else {
+		for (i = 0; i < size; i++) {
+			if (state->stack[MAX_BPF_STACK + off + i].stype !=
+			    STACK_MISC) {
+				verbose("invalid read from stack off %d+%d size %d\n",
+					off, i, size);
+				return -EACCES;
+			}
+		}
+		if (value_regno >= 0)
+			/* have read misc data from the stack */
+			mark_reg_unknown_value(state->regs, value_regno);
+		return 0;
+	}
+}
+
+/* check read/write into map element returned by bpf_map_lookup_elem() */
+static int check_map_access(struct verifier_env *env, u32 regno, int off,
+			    int size)
+{
+	struct bpf_map *map = env->cur_state.regs[regno].map_ptr;
+
+	if (off < 0 || off + size > map->value_size) {
+		verbose("invalid access to map value, value_size=%d off=%d size=%d\n",
+			map->value_size, off, size);
+		return -EACCES;
+	}
+	return 0;
+}
+
+/* check access to 'struct bpf_context' fields */
+static int check_ctx_access(struct verifier_env *env, int off, int size,
+			    enum bpf_access_type t)
+{
+	if (env->prog->aux->ops->is_valid_access &&
+	    env->prog->aux->ops->is_valid_access(off, size, t))
+		return 0;
+
+	verbose("invalid bpf_context access off=%d size=%d\n", off, size);
+	return -EACCES;
+}
+
+/* check whether memory at (regno + off) is accessible for t = (read | write)
+ * if t==write, value_regno is a register which value is stored into memory
+ * if t==read, value_regno is a register which will receive the value from memory
+ * if t==write && value_regno==-1, some unknown value is stored into memory
+ * if t==read && value_regno==-1, don't care what we read from memory
+ */
+static int check_mem_access(struct verifier_env *env, u32 regno, int off,
+			    int bpf_size, enum bpf_access_type t,
+			    int value_regno)
+{
+	struct verifier_state *state = &env->cur_state;
+	int size, err = 0;
+
+	size = bpf_size_to_bytes(bpf_size);
+	if (size < 0)
+		return size;
+
+	if (off % size != 0) {
+		verbose("misaligned access off %d size %d\n", off, size);
+		return -EACCES;
+	}
+
+	if (state->regs[regno].type == PTR_TO_MAP_VALUE) {
+		err = check_map_access(env, regno, off, size);
+		if (!err && t == BPF_READ && value_regno >= 0)
+			mark_reg_unknown_value(state->regs, value_regno);
+
+	} else if (state->regs[regno].type == PTR_TO_CTX) {
+		err = check_ctx_access(env, off, size, t);
+		if (!err && t == BPF_READ && value_regno >= 0)
+			mark_reg_unknown_value(state->regs, value_regno);
+
+	} else if (state->regs[regno].type == FRAME_PTR) {
+		if (off >= 0 || off < -MAX_BPF_STACK) {
+			verbose("invalid stack off=%d size=%d\n", off, size);
+			return -EACCES;
+		}
+		if (t == BPF_WRITE)
+			err = check_stack_write(state, off, size, value_regno);
+		else
+			err = check_stack_read(state, off, size, value_regno);
+	} else {
+		verbose("R%d invalid mem access '%s'\n",
+			regno, reg_type_str[state->regs[regno].type]);
+		return -EACCES;
+	}
+	return err;
+}
+
+static int check_xadd(struct verifier_env *env, struct bpf_insn *insn)
+{
+	struct reg_state *regs = env->cur_state.regs;
+	int err;
+
+	if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) ||
+	    insn->imm != 0) {
+		verbose("BPF_XADD uses reserved fields\n");
+		return -EINVAL;
+	}
+
+	/* check src1 operand */
+	err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+	if (err)
+		return err;
+
+	/* check src2 operand */
+	err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+	if (err)
+		return err;
+
+	/* check whether atomic_add can read the memory */
+	err = check_mem_access(env, insn->dst_reg, insn->off,
+			       BPF_SIZE(insn->code), BPF_READ, -1);
+	if (err)
+		return err;
+
+	/* check whether atomic_add can write into the same memory */
+	return check_mem_access(env, insn->dst_reg, insn->off,
+				BPF_SIZE(insn->code), BPF_WRITE, -1);
+}
+
+/* when register 'regno' is passed into function that will read 'access_size'
+ * bytes from that pointer, make sure that it's within stack boundary
+ * and all elements of stack are initialized
+ */
+static int check_stack_boundary(struct verifier_env *env,
+				int regno, int access_size)
+{
+	struct verifier_state *state = &env->cur_state;
+	struct reg_state *regs = state->regs;
+	int off, i;
+
+	if (regs[regno].type != PTR_TO_STACK)
+		return -EACCES;
+
+	off = regs[regno].imm;
+	if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
+	    access_size <= 0) {
+		verbose("invalid stack type R%d off=%d access_size=%d\n",
+			regno, off, access_size);
+		return -EACCES;
+	}
+
+	for (i = 0; i < access_size; i++) {
+		if (state->stack[MAX_BPF_STACK + off + i].stype != STACK_MISC) {
+			verbose("invalid indirect read from stack off %d+%d size %d\n",
+				off, i, access_size);
+			return -EACCES;
+		}
+	}
+	return 0;
+}
+
+static int check_func_arg(struct verifier_env *env, u32 regno,
+			  enum bpf_arg_type arg_type, struct bpf_map **mapp)
+{
+	struct reg_state *reg = env->cur_state.regs + regno;
+	enum bpf_reg_type expected_type;
+	int err = 0;
+
+	if (arg_type == ARG_ANYTHING)
+		return 0;
+
+	if (reg->type == NOT_INIT) {
+		verbose("R%d !read_ok\n", regno);
+		return -EACCES;
+	}
+
+	if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY ||
+	    arg_type == ARG_PTR_TO_MAP_VALUE) {
+		expected_type = PTR_TO_STACK;
+	} else if (arg_type == ARG_CONST_STACK_SIZE) {
+		expected_type = CONST_IMM;
+	} else if (arg_type == ARG_CONST_MAP_PTR) {
+		expected_type = CONST_PTR_TO_MAP;
+	} else {
+		verbose("unsupported arg_type %d\n", arg_type);
+		return -EFAULT;
+	}
+
+	if (reg->type != expected_type) {
+		verbose("R%d type=%s expected=%s\n", regno,
+			reg_type_str[reg->type], reg_type_str[expected_type]);
+		return -EACCES;
+	}
+
+	if (arg_type == ARG_CONST_MAP_PTR) {
+		/* bpf_map_xxx(map_ptr) call: remember that map_ptr */
+		*mapp = reg->map_ptr;
+
+	} else if (arg_type == ARG_PTR_TO_MAP_KEY) {
+		/* bpf_map_xxx(..., map_ptr, ..., key) call:
+		 * check that [key, key + map->key_size) are within
+		 * stack limits and initialized
+		 */
+		if (!*mapp) {
+			/* in function declaration map_ptr must come before
+			 * map_key, so that it's verified and known before
+			 * we have to check map_key here. Otherwise it means
+			 * that kernel subsystem misconfigured verifier
+			 */
+			verbose("invalid map_ptr to access map->key\n");
+			return -EACCES;
+		}
+		err = check_stack_boundary(env, regno, (*mapp)->key_size);
+
+	} else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
+		/* bpf_map_xxx(..., map_ptr, ..., value) call:
+		 * check [value, value + map->value_size) validity
+		 */
+		if (!*mapp) {
+			/* kernel subsystem misconfigured verifier */
+			verbose("invalid map_ptr to access map->value\n");
+			return -EACCES;
+		}
+		err = check_stack_boundary(env, regno, (*mapp)->value_size);
+
+	} else if (arg_type == ARG_CONST_STACK_SIZE) {
+		/* bpf_xxx(..., buf, len) call will access 'len' bytes
+		 * from stack pointer 'buf'. Check it
+		 * note: regno == len, regno - 1 == buf
+		 */
+		if (regno == 0) {
+			/* kernel subsystem misconfigured verifier */
+			verbose("ARG_CONST_STACK_SIZE cannot be first argument\n");
+			return -EACCES;
+		}
+		err = check_stack_boundary(env, regno - 1, reg->imm);
+	}
+
+	return err;
+}
+
+static int check_call(struct verifier_env *env, int func_id)
+{
+	struct verifier_state *state = &env->cur_state;
+	const struct bpf_func_proto *fn = NULL;
+	struct reg_state *regs = state->regs;
+	struct bpf_map *map = NULL;
+	struct reg_state *reg;
+	int i, err;
+
+	/* find function prototype */
+	if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) {
+		verbose("invalid func %d\n", func_id);
+		return -EINVAL;
+	}
+
+	if (env->prog->aux->ops->get_func_proto)
+		fn = env->prog->aux->ops->get_func_proto(func_id);
+
+	if (!fn) {
+		verbose("unknown func %d\n", func_id);
+		return -EINVAL;
+	}
+
+	/* eBPF programs must be GPL compatible to use GPL-ed functions */
+	if (!env->prog->aux->is_gpl_compatible && fn->gpl_only) {
+		verbose("cannot call GPL only function from proprietary program\n");
+		return -EINVAL;
+	}
+
+	/* check args */
+	err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &map);
+	if (err)
+		return err;
+	err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &map);
+	if (err)
+		return err;
+	err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &map);
+	if (err)
+		return err;
+	err = check_func_arg(env, BPF_REG_4, fn->arg4_type, &map);
+	if (err)
+		return err;
+	err = check_func_arg(env, BPF_REG_5, fn->arg5_type, &map);
+	if (err)
+		return err;
+
+	/* reset caller saved regs */
+	for (i = 0; i < CALLER_SAVED_REGS; i++) {
+		reg = regs + caller_saved[i];
+		reg->type = NOT_INIT;
+		reg->imm = 0;
+	}
+
+	/* update return register */
+	if (fn->ret_type == RET_INTEGER) {
+		regs[BPF_REG_0].type = UNKNOWN_VALUE;
+	} else if (fn->ret_type == RET_VOID) {
+		regs[BPF_REG_0].type = NOT_INIT;
+	} else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) {
+		regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
+		/* remember map_ptr, so that check_map_access()
+		 * can check 'value_size' boundary of memory access
+		 * to map element returned from bpf_map_lookup_elem()
+		 */
+		if (map == NULL) {
+			verbose("kernel subsystem misconfigured verifier\n");
+			return -EINVAL;
+		}
+		regs[BPF_REG_0].map_ptr = map;
+	} else {
+		verbose("unknown return type %d of func %d\n",
+			fn->ret_type, func_id);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/* check validity of 32-bit and 64-bit arithmetic operations */
+static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
+{
+	u8 opcode = BPF_OP(insn->code);
+	int err;
+
+	if (opcode == BPF_END || opcode == BPF_NEG) {
+		if (opcode == BPF_NEG) {
+			if (BPF_SRC(insn->code) != 0 ||
+			    insn->src_reg != BPF_REG_0 ||
+			    insn->off != 0 || insn->imm != 0) {
+				verbose("BPF_NEG uses reserved fields\n");
+				return -EINVAL;
+			}
+		} else {
+			if (insn->src_reg != BPF_REG_0 || insn->off != 0 ||
+			    (insn->imm != 16 && insn->imm != 32 && insn->imm != 64)) {
+				verbose("BPF_END uses reserved fields\n");
+				return -EINVAL;
+			}
+		}
+
+		/* check src operand */
+		err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+		if (err)
+			return err;
+
+		/* check dest operand */
+		err = check_reg_arg(regs, insn->dst_reg, DST_OP);
+		if (err)
+			return err;
+
+	} else if (opcode == BPF_MOV) {
+
+		if (BPF_SRC(insn->code) == BPF_X) {
+			if (insn->imm != 0 || insn->off != 0) {
+				verbose("BPF_MOV uses reserved fields\n");
+				return -EINVAL;
+			}
+
+			/* check src operand */
+			err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+			if (err)
+				return err;
+		} else {
+			if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
+				verbose("BPF_MOV uses reserved fields\n");
+				return -EINVAL;
+			}
+		}
+
+		/* check dest operand */
+		err = check_reg_arg(regs, insn->dst_reg, DST_OP);
+		if (err)
+			return err;
+
+		if (BPF_SRC(insn->code) == BPF_X) {
+			if (BPF_CLASS(insn->code) == BPF_ALU64) {
+				/* case: R1 = R2
+				 * copy register state to dest reg
+				 */
+				regs[insn->dst_reg] = regs[insn->src_reg];
+			} else {
+				regs[insn->dst_reg].type = UNKNOWN_VALUE;
+				regs[insn->dst_reg].map_ptr = NULL;
+			}
+		} else {
+			/* case: R = imm
+			 * remember the value we stored into this reg
+			 */
+			regs[insn->dst_reg].type = CONST_IMM;
+			regs[insn->dst_reg].imm = insn->imm;
+		}
+
+	} else if (opcode > BPF_END) {
+		verbose("invalid BPF_ALU opcode %x\n", opcode);
+		return -EINVAL;
+
+	} else {	/* all other ALU ops: and, sub, xor, add, ... */
+
+		bool stack_relative = false;
+
+		if (BPF_SRC(insn->code) == BPF_X) {
+			if (insn->imm != 0 || insn->off != 0) {
+				verbose("BPF_ALU uses reserved fields\n");
+				return -EINVAL;
+			}
+			/* check src1 operand */
+			err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+			if (err)
+				return err;
+		} else {
+			if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
+				verbose("BPF_ALU uses reserved fields\n");
+				return -EINVAL;
+			}
+		}
+
+		/* check src2 operand */
+		err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+		if (err)
+			return err;
+
+		if ((opcode == BPF_MOD || opcode == BPF_DIV) &&
+		    BPF_SRC(insn->code) == BPF_K && insn->imm == 0) {
+			verbose("div by zero\n");
+			return -EINVAL;
+		}
+
+		/* pattern match 'bpf_add Rx, imm' instruction */
+		if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 &&
+		    regs[insn->dst_reg].type == FRAME_PTR &&
+		    BPF_SRC(insn->code) == BPF_K)
+			stack_relative = true;
+
+		/* check dest operand */
+		err = check_reg_arg(regs, insn->dst_reg, DST_OP);
+		if (err)
+			return err;
+
+		if (stack_relative) {
+			regs[insn->dst_reg].type = PTR_TO_STACK;
+			regs[insn->dst_reg].imm = insn->imm;
+		}
+	}
+
+	return 0;
+}
+
+static int check_cond_jmp_op(struct verifier_env *env,
+			     struct bpf_insn *insn, int *insn_idx)
+{
+	struct reg_state *regs = env->cur_state.regs;
+	struct verifier_state *other_branch;
+	u8 opcode = BPF_OP(insn->code);
+	int err;
+
+	if (opcode > BPF_EXIT) {
+		verbose("invalid BPF_JMP opcode %x\n", opcode);
+		return -EINVAL;
+	}
+
+	if (BPF_SRC(insn->code) == BPF_X) {
+		if (insn->imm != 0) {
+			verbose("BPF_JMP uses reserved fields\n");
+			return -EINVAL;
+		}
+
+		/* check src1 operand */
+		err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+		if (err)
+			return err;
+	} else {
+		if (insn->src_reg != BPF_REG_0) {
+			verbose("BPF_JMP uses reserved fields\n");
+			return -EINVAL;
+		}
+	}
+
+	/* check src2 operand */
+	err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+	if (err)
+		return err;
+
+	/* detect if R == 0 where R was initialized to zero earlier */
+	if (BPF_SRC(insn->code) == BPF_K &&
+	    (opcode == BPF_JEQ || opcode == BPF_JNE) &&
+	    regs[insn->dst_reg].type == CONST_IMM &&
+	    regs[insn->dst_reg].imm == insn->imm) {
+		if (opcode == BPF_JEQ) {
+			/* if (imm == imm) goto pc+off;
+			 * only follow the goto, ignore fall-through
+			 */
+			*insn_idx += insn->off;
+			return 0;
+		} else {
+			/* if (imm != imm) goto pc+off;
+			 * only follow fall-through branch, since
+			 * that's where the program will go
+			 */
+			return 0;
+		}
+	}
+
+	other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx);
+	if (!other_branch)
+		return -EFAULT;
+
+	/* detect if R == 0 where R is returned value from bpf_map_lookup_elem() */
+	if (BPF_SRC(insn->code) == BPF_K &&
+	    insn->imm == 0 && (opcode == BPF_JEQ ||
+			       opcode == BPF_JNE) &&
+	    regs[insn->dst_reg].type == PTR_TO_MAP_VALUE_OR_NULL) {
+		if (opcode == BPF_JEQ) {
+			/* next fallthrough insn can access memory via
+			 * this register
+			 */
+			regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
+			/* branch targer cannot access it, since reg == 0 */
+			other_branch->regs[insn->dst_reg].type = CONST_IMM;
+			other_branch->regs[insn->dst_reg].imm = 0;
+		} else {
+			other_branch->regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
+			regs[insn->dst_reg].type = CONST_IMM;
+			regs[insn->dst_reg].imm = 0;
+		}
+	} else if (BPF_SRC(insn->code) == BPF_K &&
+		   (opcode == BPF_JEQ || opcode == BPF_JNE)) {
+
+		if (opcode == BPF_JEQ) {
+			/* detect if (R == imm) goto
+			 * and in the target state recognize that R = imm
+			 */
+			other_branch->regs[insn->dst_reg].type = CONST_IMM;
+			other_branch->regs[insn->dst_reg].imm = insn->imm;
+		} else {
+			/* detect if (R != imm) goto
+			 * and in the fall-through state recognize that R = imm
+			 */
+			regs[insn->dst_reg].type = CONST_IMM;
+			regs[insn->dst_reg].imm = insn->imm;
+		}
+	}
+	if (log_level)
+		print_verifier_state(env);
+	return 0;
+}
+
 /* return the map pointer stored inside BPF_LD_IMM64 instruction */
 static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn)
 {
@@ -313,6 +1148,37 @@ static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn)
 	return (struct bpf_map *) (unsigned long) imm64;
 }
 
+/* verify BPF_LD_IMM64 instruction */
+static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn)
+{
+	struct reg_state *regs = env->cur_state.regs;
+	int err;
+
+	if (BPF_SIZE(insn->code) != BPF_DW) {
+		verbose("invalid BPF_LD_IMM insn\n");
+		return -EINVAL;
+	}
+	if (insn->off != 0) {
+		verbose("BPF_LD_IMM64 uses reserved fields\n");
+		return -EINVAL;
+	}
+
+	err = check_reg_arg(regs, insn->dst_reg, DST_OP);
+	if (err)
+		return err;
+
+	if (insn->src_reg == 0)
+		/* generic move 64-bit immediate into a register */
+		return 0;
+
+	/* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */
+	BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD);
+
+	regs[insn->dst_reg].type = CONST_PTR_TO_MAP;
+	regs[insn->dst_reg].map_ptr = ld_imm64_to_map_ptr(insn);
+	return 0;
+}
+
 /* non-recursive DFS pseudo code
  * 1  procedure DFS-iterative(G,v):
  * 2      label v as discovered
@@ -498,6 +1364,212 @@ err_free:
 	return ret;
 }
 
+static int do_check(struct verifier_env *env)
+{
+	struct verifier_state *state = &env->cur_state;
+	struct bpf_insn *insns = env->prog->insnsi;
+	struct reg_state *regs = state->regs;
+	int insn_cnt = env->prog->len;
+	int insn_idx, prev_insn_idx = 0;
+	int insn_processed = 0;
+	bool do_print_state = false;
+
+	init_reg_state(regs);
+	insn_idx = 0;
+	for (;;) {
+		struct bpf_insn *insn;
+		u8 class;
+		int err;
+
+		if (insn_idx >= insn_cnt) {
+			verbose("invalid insn idx %d insn_cnt %d\n",
+				insn_idx, insn_cnt);
+			return -EFAULT;
+		}
+
+		insn = &insns[insn_idx];
+		class = BPF_CLASS(insn->code);
+
+		if (++insn_processed > 32768) {
+			verbose("BPF program is too large. Proccessed %d insn\n",
+				insn_processed);
+			return -E2BIG;
+		}
+
+		if (log_level && do_print_state) {
+			verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx);
+			print_verifier_state(env);
+			do_print_state = false;
+		}
+
+		if (log_level) {
+			verbose("%d: ", insn_idx);
+			print_bpf_insn(insn);
+		}
+
+		if (class == BPF_ALU || class == BPF_ALU64) {
+			err = check_alu_op(regs, insn);
+			if (err)
+				return err;
+
+		} else if (class == BPF_LDX) {
+			if (BPF_MODE(insn->code) != BPF_MEM ||
+			    insn->imm != 0) {
+				verbose("BPF_LDX uses reserved fields\n");
+				return -EINVAL;
+			}
+			/* check src operand */
+			err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+			if (err)
+				return err;
+
+			err = check_reg_arg(regs, insn->dst_reg, DST_OP_NO_MARK);
+			if (err)
+				return err;
+
+			/* check that memory (src_reg + off) is readable,
+			 * the state of dst_reg will be updated by this func
+			 */
+			err = check_mem_access(env, insn->src_reg, insn->off,
+					       BPF_SIZE(insn->code), BPF_READ,
+					       insn->dst_reg);
+			if (err)
+				return err;
+
+		} else if (class == BPF_STX) {
+			if (BPF_MODE(insn->code) == BPF_XADD) {
+				err = check_xadd(env, insn);
+				if (err)
+					return err;
+				insn_idx++;
+				continue;
+			}
+
+			if (BPF_MODE(insn->code) != BPF_MEM ||
+			    insn->imm != 0) {
+				verbose("BPF_STX uses reserved fields\n");
+				return -EINVAL;
+			}
+			/* check src1 operand */
+			err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+			if (err)
+				return err;
+			/* check src2 operand */
+			err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+			if (err)
+				return err;
+
+			/* check that memory (dst_reg + off) is writeable */
+			err = check_mem_access(env, insn->dst_reg, insn->off,
+					       BPF_SIZE(insn->code), BPF_WRITE,
+					       insn->src_reg);
+			if (err)
+				return err;
+
+		} else if (class == BPF_ST) {
+			if (BPF_MODE(insn->code) != BPF_MEM ||
+			    insn->src_reg != BPF_REG_0) {
+				verbose("BPF_ST uses reserved fields\n");
+				return -EINVAL;
+			}
+			/* check src operand */
+			err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+			if (err)
+				return err;
+
+			/* check that memory (dst_reg + off) is writeable */
+			err = check_mem_access(env, insn->dst_reg, insn->off,
+					       BPF_SIZE(insn->code), BPF_WRITE,
+					       -1);
+			if (err)
+				return err;
+
+		} else if (class == BPF_JMP) {
+			u8 opcode = BPF_OP(insn->code);
+
+			if (opcode == BPF_CALL) {
+				if (BPF_SRC(insn->code) != BPF_K ||
+				    insn->off != 0 ||
+				    insn->src_reg != BPF_REG_0 ||
+				    insn->dst_reg != BPF_REG_0) {
+					verbose("BPF_CALL uses reserved fields\n");
+					return -EINVAL;
+				}
+
+				err = check_call(env, insn->imm);
+				if (err)
+					return err;
+
+			} else if (opcode == BPF_JA) {
+				if (BPF_SRC(insn->code) != BPF_K ||
+				    insn->imm != 0 ||
+				    insn->src_reg != BPF_REG_0 ||
+				    insn->dst_reg != BPF_REG_0) {
+					verbose("BPF_JA uses reserved fields\n");
+					return -EINVAL;
+				}
+
+				insn_idx += insn->off + 1;
+				continue;
+
+			} else if (opcode == BPF_EXIT) {
+				if (BPF_SRC(insn->code) != BPF_K ||
+				    insn->imm != 0 ||
+				    insn->src_reg != BPF_REG_0 ||
+				    insn->dst_reg != BPF_REG_0) {
+					verbose("BPF_EXIT uses reserved fields\n");
+					return -EINVAL;
+				}
+
+				/* eBPF calling convetion is such that R0 is used
+				 * to return the value from eBPF program.
+				 * Make sure that it's readable at this time
+				 * of bpf_exit, which means that program wrote
+				 * something into it earlier
+				 */
+				err = check_reg_arg(regs, BPF_REG_0, SRC_OP);
+				if (err)
+					return err;
+
+				insn_idx = pop_stack(env, &prev_insn_idx);
+				if (insn_idx < 0) {
+					break;
+				} else {
+					do_print_state = true;
+					continue;
+				}
+			} else {
+				err = check_cond_jmp_op(env, insn, &insn_idx);
+				if (err)
+					return err;
+			}
+		} else if (class == BPF_LD) {
+			u8 mode = BPF_MODE(insn->code);
+
+			if (mode == BPF_ABS || mode == BPF_IND) {
+				verbose("LD_ABS is not supported yet\n");
+				return -EINVAL;
+			} else if (mode == BPF_IMM) {
+				err = check_ld_imm(env, insn);
+				if (err)
+					return err;
+
+				insn_idx++;
+			} else {
+				verbose("invalid BPF_LD mode\n");
+				return -EINVAL;
+			}
+		} else {
+			verbose("unknown insn class %d\n", class);
+			return -EINVAL;
+		}
+
+		insn_idx++;
+	}
+
+	return 0;
+}
+
 /* look for pseudo eBPF instructions that access map FDs and
  * replace them with actual map pointers
  */
@@ -651,9 +1723,10 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
 	if (ret < 0)
 		goto skip_full_check;
 
-	/* ret = do_check(env); */
+	ret = do_check(env);
 
 skip_full_check:
+	while (pop_stack(env, NULL) >= 0);
 
 	if (log_level && log_len >= log_size - 1) {
 		BUG_ON(log_len >= log_size);
-- 
cgit v1.2.3


From 3c731eba48e1b0650decfc91a839b80f0e05ce8f Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Fri, 26 Sep 2014 00:17:07 -0700
Subject: bpf: mini eBPF library, test stubs and verifier testsuite

1.
the library includes a trivial set of BPF syscall wrappers:
int bpf_create_map(int key_size, int value_size, int max_entries);
int bpf_update_elem(int fd, void *key, void *value);
int bpf_lookup_elem(int fd, void *key, void *value);
int bpf_delete_elem(int fd, void *key);
int bpf_get_next_key(int fd, void *key, void *next_key);
int bpf_prog_load(enum bpf_prog_type prog_type,
		  const struct sock_filter_int *insns, int insn_len,
		  const char *license);
bpf_prog_load() stores verifier log into global bpf_log_buf[] array

and BPF_*() macros to build instructions

2.
test stubs configure eBPF infra with 'unspec' map and program types.
These are fake types used by user space testsuite only.

3.
verifier tests valid and invalid programs and expects predefined
error log messages from kernel.
40 tests so far.

$ sudo ./test_verifier
 #0 add+sub+mul OK
 #1 unreachable OK
 #2 unreachable2 OK
 #3 out of range jump OK
 #4 out of range jump2 OK
 #5 test1 ld_imm64 OK
 ...

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/Makefile    |   4 ++
 kernel/bpf/test_stub.c | 116 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 120 insertions(+)
 create mode 100644 kernel/bpf/test_stub.c

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 3c726b0995b7..45427239f375 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1 +1,5 @@
 obj-y := core.o syscall.o verifier.o
+
+ifdef CONFIG_TEST_BPF
+obj-y += test_stub.o
+endif
diff --git a/kernel/bpf/test_stub.c b/kernel/bpf/test_stub.c
new file mode 100644
index 000000000000..fcaddff4003e
--- /dev/null
+++ b/kernel/bpf/test_stub.c
@@ -0,0 +1,116 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/bpf.h>
+
+/* test stubs for BPF_MAP_TYPE_UNSPEC and for BPF_PROG_TYPE_UNSPEC
+ * to be used by user space verifier testsuite
+ */
+struct bpf_context {
+	u64 arg1;
+	u64 arg2;
+};
+
+static u64 test_func(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	return 0;
+}
+
+static struct bpf_func_proto test_funcs[] = {
+	[BPF_FUNC_unspec] = {
+		.func = test_func,
+		.gpl_only = true,
+		.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+		.arg1_type = ARG_CONST_MAP_PTR,
+		.arg2_type = ARG_PTR_TO_MAP_KEY,
+	},
+};
+
+static const struct bpf_func_proto *test_func_proto(enum bpf_func_id func_id)
+{
+	if (func_id < 0 || func_id >= ARRAY_SIZE(test_funcs))
+		return NULL;
+	return &test_funcs[func_id];
+}
+
+static const struct bpf_context_access {
+	int size;
+	enum bpf_access_type type;
+} test_ctx_access[] = {
+	[offsetof(struct bpf_context, arg1)] = {
+		FIELD_SIZEOF(struct bpf_context, arg1),
+		BPF_READ
+	},
+	[offsetof(struct bpf_context, arg2)] = {
+		FIELD_SIZEOF(struct bpf_context, arg2),
+		BPF_READ
+	},
+};
+
+static bool test_is_valid_access(int off, int size, enum bpf_access_type type)
+{
+	const struct bpf_context_access *access;
+
+	if (off < 0 || off >= ARRAY_SIZE(test_ctx_access))
+		return false;
+
+	access = &test_ctx_access[off];
+	if (access->size == size && (access->type & type))
+		return true;
+
+	return false;
+}
+
+static struct bpf_verifier_ops test_ops = {
+	.get_func_proto = test_func_proto,
+	.is_valid_access = test_is_valid_access,
+};
+
+static struct bpf_prog_type_list tl_prog = {
+	.ops = &test_ops,
+	.type = BPF_PROG_TYPE_UNSPEC,
+};
+
+static struct bpf_map *test_map_alloc(union bpf_attr *attr)
+{
+	struct bpf_map *map;
+
+	map = kzalloc(sizeof(*map), GFP_USER);
+	if (!map)
+		return ERR_PTR(-ENOMEM);
+
+	map->key_size = attr->key_size;
+	map->value_size = attr->value_size;
+	map->max_entries = attr->max_entries;
+	return map;
+}
+
+static void test_map_free(struct bpf_map *map)
+{
+	kfree(map);
+}
+
+static struct bpf_map_ops test_map_ops = {
+	.map_alloc = test_map_alloc,
+	.map_free = test_map_free,
+};
+
+static struct bpf_map_type_list tl_map = {
+	.ops = &test_map_ops,
+	.type = BPF_MAP_TYPE_UNSPEC,
+};
+
+static int __init register_test_ops(void)
+{
+	bpf_register_map_type(&tl_map);
+	bpf_register_prog_type(&tl_prog);
+	return 0;
+}
+late_initcall(register_test_ops);
-- 
cgit v1.2.3


From f1bca824dabba4ffe8582f87ca587780befce7ad Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Mon, 29 Sep 2014 18:50:01 -0700
Subject: bpf: add search pruning optimization to verifier

consider C program represented in eBPF:
int filter(int arg)
{
    int a, b, c, *ptr;

    if (arg == 1)
        ptr = &a;
    else if (arg == 2)
        ptr = &b;
    else
        ptr = &c;

    *ptr = 0;
    return 0;
}
eBPF verifier has to follow all possible paths through the program
to recognize that '*ptr = 0' instruction would be safe to execute
in all situations.
It's doing it by picking a path towards the end and observes changes
to registers and stack at every insn until it reaches bpf_exit.
Then it comes back to one of the previous branches and goes towards
the end again with potentially different values in registers.
When program has a lot of branches, the number of possible combinations
of branches is huge, so verifer has a hard limit of walking no more
than 32k instructions. This limit can be reached and complex (but valid)
programs could be rejected. Therefore it's important to recognize equivalent
verifier states to prune this depth first search.

Basic idea can be illustrated by the program (where .. are some eBPF insns):
    1: ..
    2: if (rX == rY) goto 4
    3: ..
    4: ..
    5: ..
    6: bpf_exit
In the first pass towards bpf_exit the verifier will walk insns: 1, 2, 3, 4, 5, 6
Since insn#2 is a branch the verifier will remember its state in verifier stack
to come back to it later.
Since insn#4 is marked as 'branch target', the verifier will remember its state
in explored_states[4] linked list.
Once it reaches insn#6 successfully it will pop the state recorded at insn#2 and
will continue.
Without search pruning optimization verifier would have to walk 4, 5, 6 again,
effectively simulating execution of insns 1, 2, 4, 5, 6
With search pruning it will check whether state at #4 after jumping from #2
is equivalent to one recorded in explored_states[4] during first pass.
If there is an equivalent state, verifier can prune the search at #4 and declare
this path to be safe as well.
In other words two states at #4 are equivalent if execution of 1, 2, 3, 4 insns
and 1, 2, 4 insns produces equivalent registers and stack.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 146 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 146 insertions(+)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a086dd3210a8..801f5f3b9307 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -199,6 +199,7 @@ struct verifier_env {
 	struct verifier_stack_elem *head; /* stack of verifier states to be processed */
 	int stack_size;			/* number of states to be processed */
 	struct verifier_state cur_state; /* current verifier state */
+	struct verifier_state_list **explored_states; /* search pruning optimization */
 	struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
 	u32 used_map_cnt;		/* number of used maps */
 };
@@ -1219,6 +1220,8 @@ enum {
 	BRANCH = 2,
 };
 
+#define STATE_LIST_MARK ((struct verifier_state_list *) -1L)
+
 static int *insn_stack;	/* stack of insns to process */
 static int cur_stack;	/* current stack index */
 static int *insn_state;
@@ -1241,6 +1244,10 @@ static int push_insn(int t, int w, int e, struct verifier_env *env)
 		return -EINVAL;
 	}
 
+	if (e == BRANCH)
+		/* mark branch target for state pruning */
+		env->explored_states[w] = STATE_LIST_MARK;
+
 	if (insn_state[w] == 0) {
 		/* tree-edge */
 		insn_state[t] = DISCOVERED | e;
@@ -1314,6 +1321,10 @@ peek_stack:
 				goto peek_stack;
 			else if (ret < 0)
 				goto err_free;
+			/* tell verifier to check for equivalent states
+			 * after every call and jump
+			 */
+			env->explored_states[t + 1] = STATE_LIST_MARK;
 		} else {
 			/* conditional jump with two edges */
 			ret = push_insn(t, t + 1, FALLTHROUGH, env);
@@ -1364,6 +1375,95 @@ err_free:
 	return ret;
 }
 
+/* compare two verifier states
+ *
+ * all states stored in state_list are known to be valid, since
+ * verifier reached 'bpf_exit' instruction through them
+ *
+ * this function is called when verifier exploring different branches of
+ * execution popped from the state stack. If it sees an old state that has
+ * more strict register state and more strict stack state then this execution
+ * branch doesn't need to be explored further, since verifier already
+ * concluded that more strict state leads to valid finish.
+ *
+ * Therefore two states are equivalent if register state is more conservative
+ * and explored stack state is more conservative than the current one.
+ * Example:
+ *       explored                   current
+ * (slot1=INV slot2=MISC) == (slot1=MISC slot2=MISC)
+ * (slot1=MISC slot2=MISC) != (slot1=INV slot2=MISC)
+ *
+ * In other words if current stack state (one being explored) has more
+ * valid slots than old one that already passed validation, it means
+ * the verifier can stop exploring and conclude that current state is valid too
+ *
+ * Similarly with registers. If explored state has register type as invalid
+ * whereas register type in current state is meaningful, it means that
+ * the current state will reach 'bpf_exit' instruction safely
+ */
+static bool states_equal(struct verifier_state *old, struct verifier_state *cur)
+{
+	int i;
+
+	for (i = 0; i < MAX_BPF_REG; i++) {
+		if (memcmp(&old->regs[i], &cur->regs[i],
+			   sizeof(old->regs[0])) != 0) {
+			if (old->regs[i].type == NOT_INIT ||
+			    old->regs[i].type == UNKNOWN_VALUE)
+				continue;
+			return false;
+		}
+	}
+
+	for (i = 0; i < MAX_BPF_STACK; i++) {
+		if (memcmp(&old->stack[i], &cur->stack[i],
+			   sizeof(old->stack[0])) != 0) {
+			if (old->stack[i].stype == STACK_INVALID)
+				continue;
+			return false;
+		}
+	}
+	return true;
+}
+
+static int is_state_visited(struct verifier_env *env, int insn_idx)
+{
+	struct verifier_state_list *new_sl;
+	struct verifier_state_list *sl;
+
+	sl = env->explored_states[insn_idx];
+	if (!sl)
+		/* this 'insn_idx' instruction wasn't marked, so we will not
+		 * be doing state search here
+		 */
+		return 0;
+
+	while (sl != STATE_LIST_MARK) {
+		if (states_equal(&sl->state, &env->cur_state))
+			/* reached equivalent register/stack state,
+			 * prune the search
+			 */
+			return 1;
+		sl = sl->next;
+	}
+
+	/* there were no equivalent states, remember current one.
+	 * technically the current state is not proven to be safe yet,
+	 * but it will either reach bpf_exit (which means it's safe) or
+	 * it will be rejected. Since there are no loops, we won't be
+	 * seeing this 'insn_idx' instruction again on the way to bpf_exit
+	 */
+	new_sl = kmalloc(sizeof(struct verifier_state_list), GFP_USER);
+	if (!new_sl)
+		return -ENOMEM;
+
+	/* add new state to the head of linked list */
+	memcpy(&new_sl->state, &env->cur_state, sizeof(env->cur_state));
+	new_sl->next = env->explored_states[insn_idx];
+	env->explored_states[insn_idx] = new_sl;
+	return 0;
+}
+
 static int do_check(struct verifier_env *env)
 {
 	struct verifier_state *state = &env->cur_state;
@@ -1396,6 +1496,21 @@ static int do_check(struct verifier_env *env)
 			return -E2BIG;
 		}
 
+		err = is_state_visited(env, insn_idx);
+		if (err < 0)
+			return err;
+		if (err == 1) {
+			/* found equivalent state, can prune the search */
+			if (log_level) {
+				if (do_print_state)
+					verbose("\nfrom %d to %d: safe\n",
+						prev_insn_idx, insn_idx);
+				else
+					verbose("%d: safe\n", insn_idx);
+			}
+			goto process_bpf_exit;
+		}
+
 		if (log_level && do_print_state) {
 			verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx);
 			print_verifier_state(env);
@@ -1531,6 +1646,7 @@ static int do_check(struct verifier_env *env)
 				if (err)
 					return err;
 
+process_bpf_exit:
 				insn_idx = pop_stack(env, &prev_insn_idx);
 				if (insn_idx < 0) {
 					break;
@@ -1671,6 +1787,28 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env)
 			insn->src_reg = 0;
 }
 
+static void free_states(struct verifier_env *env)
+{
+	struct verifier_state_list *sl, *sln;
+	int i;
+
+	if (!env->explored_states)
+		return;
+
+	for (i = 0; i < env->prog->len; i++) {
+		sl = env->explored_states[i];
+
+		if (sl)
+			while (sl != STATE_LIST_MARK) {
+				sln = sl->next;
+				kfree(sl);
+				sl = sln;
+			}
+	}
+
+	kfree(env->explored_states);
+}
+
 int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
 {
 	char __user *log_ubuf = NULL;
@@ -1719,6 +1857,13 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
 	if (ret < 0)
 		goto skip_full_check;
 
+	env->explored_states = kcalloc(prog->len,
+				       sizeof(struct verifier_state_list *),
+				       GFP_USER);
+	ret = -ENOMEM;
+	if (!env->explored_states)
+		goto skip_full_check;
+
 	ret = check_cfg(env);
 	if (ret < 0)
 		goto skip_full_check;
@@ -1727,6 +1872,7 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
 
 skip_full_check:
 	while (pop_stack(env, NULL) >= 0);
+	free_states(env);
 
 	if (log_level && log_len >= log_size - 1) {
 		BUG_ON(log_len >= log_size);
-- 
cgit v1.2.3


From 32bf08a6257b9c7380dcd040af3c0858eee3ef05 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Mon, 20 Oct 2014 14:54:57 -0700
Subject: bpf: fix bug in eBPF verifier

while comparing for verifier state equivalency the comparison
was missing a check for uninitialized register.
Make sure it does so and add a testcase.

Fixes: f1bca824dabb ("bpf: add search pruning optimization to verifier")
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 801f5f3b9307..9f81818f2941 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1409,7 +1409,8 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur)
 		if (memcmp(&old->regs[i], &cur->regs[i],
 			   sizeof(old->regs[0])) != 0) {
 			if (old->regs[i].type == NOT_INIT ||
-			    old->regs[i].type == UNKNOWN_VALUE)
+			    (old->regs[i].type == UNKNOWN_VALUE &&
+			     cur->regs[i].type != NOT_INIT))
 				continue;
 			return false;
 		}
-- 
cgit v1.2.3


From f89b7755f517cdbb755d7543eef986ee9d54e654 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Thu, 23 Oct 2014 18:41:08 -0700
Subject: bpf: split eBPF out of NET

introduce two configs:
- hidden CONFIG_BPF to select eBPF interpreter that classic socket filters
  depend on
- visible CONFIG_BPF_SYSCALL (default off) that tracing and sockets can use

that solves several problems:
- tracing and others that wish to use eBPF don't need to depend on NET.
  They can use BPF_SYSCALL to allow loading from userspace or select BPF
  to use it directly from kernel in NET-less configs.
- in 3.18 programs cannot be attached to events yet, so don't force it on
- when the rest of eBPF infra is there in 3.19+, it's still useful to
  switch it off to minimize kernel size

bloat-o-meter on x64 shows:
add/remove: 0/60 grow/shrink: 0/2 up/down: 0/-15601 (-15601)

tested with many different config combinations. Hopefully didn't miss anything.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Acked-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/Makefile | 6 +++---
 kernel/bpf/core.c   | 9 +++++++++
 2 files changed, 12 insertions(+), 3 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 45427239f375..0daf7f6ae7df 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,5 +1,5 @@
-obj-y := core.o syscall.o verifier.o
-
+obj-y := core.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o
 ifdef CONFIG_TEST_BPF
-obj-y += test_stub.o
+obj-$(CONFIG_BPF_SYSCALL) += test_stub.o
 endif
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index f0c30c59b317..d6594e457a25 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -655,3 +655,12 @@ void bpf_prog_free(struct bpf_prog *fp)
 	schedule_work(&aux->work);
 }
 EXPORT_SYMBOL_GPL(bpf_prog_free);
+
+/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
+ * skb_copy_bits(), so provide a weak definition of it for NET-less config.
+ */
+int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
+			 int len)
+{
+	return -EFAULT;
+}
-- 
cgit v1.2.3