From e4eda884db7930cee434828759064b4711604078 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Mon, 22 May 2017 12:27:07 -0400
Subject: net: Make IP alignment calulations clearer.

The assignmnet:

	ip_align = strict ? 2 : NET_IP_ALIGN;

in compare_pkt_ptr_alignment() trips up Coverity because we can only
get to this code when strict is true, therefore ip_align will always
be 2 regardless of NET_IP_ALIGN's value.

So just assign directly to '2' and explain the situation in the
comment above.

Reported-by: "Gustavo A. R. Silva" <garsilva@embeddedor.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 1eddb713b815..c72cd41f5b8b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -808,11 +808,15 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
 		reg_off += reg->aux_off;
 	}
 
-	/* skb->data is NET_IP_ALIGN-ed, but for strict alignment checking
-	 * we force this to 2 which is universally what architectures use
-	 * when they don't set CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS.
+	/* For platforms that do not have a Kconfig enabling
+	 * CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS the value of
+	 * NET_IP_ALIGN is universally set to '2'.  And on platforms
+	 * that do set CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS, we get
+	 * to this code only in strict mode where we want to emulate
+	 * the NET_IP_ALIGN==2 checking.  Therefore use an
+	 * unconditional IP align value of '2'.
 	 */
-	ip_align = strict ? 2 : NET_IP_ALIGN;
+	ip_align = 2;
 	if ((ip_align + reg_off + off) % size != 0) {
 		verbose("misaligned packet access off %d+%d+%d size %d\n",
 			ip_align, reg_off, off, size);
-- 
cgit v1.2.3


From 1ad2f5838d345e1c102bd1cd27c4f4c1349b0dc8 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 25 May 2017 01:05:05 +0200
Subject: bpf: fix incorrect pruning decision when alignment must be tracked

Currently, when we enforce alignment tracking on direct packet access,
the verifier lets the following program pass despite doing a packet
write with unaligned access:

  0: (61) r2 = *(u32 *)(r1 +76)
  1: (61) r3 = *(u32 *)(r1 +80)
  2: (61) r7 = *(u32 *)(r1 +8)
  3: (bf) r0 = r2
  4: (07) r0 += 14
  5: (25) if r7 > 0x1 goto pc+4
   R0=pkt(id=0,off=14,r=0) R1=ctx R2=pkt(id=0,off=0,r=0)
   R3=pkt_end R7=inv,min_value=0,max_value=1 R10=fp
  6: (2d) if r0 > r3 goto pc+1
   R0=pkt(id=0,off=14,r=14) R1=ctx R2=pkt(id=0,off=0,r=14)
   R3=pkt_end R7=inv,min_value=0,max_value=1 R10=fp
  7: (63) *(u32 *)(r0 -4) = r0
  8: (b7) r0 = 0
  9: (95) exit

  from 6 to 8:
   R0=pkt(id=0,off=14,r=0) R1=ctx R2=pkt(id=0,off=0,r=0)
   R3=pkt_end R7=inv,min_value=0,max_value=1 R10=fp
  8: (b7) r0 = 0
  9: (95) exit

  from 5 to 10:
   R0=pkt(id=0,off=14,r=0) R1=ctx R2=pkt(id=0,off=0,r=0)
   R3=pkt_end R7=inv,min_value=2 R10=fp
  10: (07) r0 += 1
  11: (05) goto pc-6
  6: safe                           <----- here, wrongly found safe
  processed 15 insns

However, if we enforce a pruning mismatch by adding state into r8
which is then being mismatched in states_equal(), we find that for
the otherwise same program, the verifier detects a misaligned packet
access when actually walking that path:

  0: (61) r2 = *(u32 *)(r1 +76)
  1: (61) r3 = *(u32 *)(r1 +80)
  2: (61) r7 = *(u32 *)(r1 +8)
  3: (b7) r8 = 1
  4: (bf) r0 = r2
  5: (07) r0 += 14
  6: (25) if r7 > 0x1 goto pc+4
   R0=pkt(id=0,off=14,r=0) R1=ctx R2=pkt(id=0,off=0,r=0)
   R3=pkt_end R7=inv,min_value=0,max_value=1
   R8=imm1,min_value=1,max_value=1,min_align=1 R10=fp
  7: (2d) if r0 > r3 goto pc+1
   R0=pkt(id=0,off=14,r=14) R1=ctx R2=pkt(id=0,off=0,r=14)
   R3=pkt_end R7=inv,min_value=0,max_value=1
   R8=imm1,min_value=1,max_value=1,min_align=1 R10=fp
  8: (63) *(u32 *)(r0 -4) = r0
  9: (b7) r0 = 0
  10: (95) exit

  from 7 to 9:
   R0=pkt(id=0,off=14,r=0) R1=ctx R2=pkt(id=0,off=0,r=0)
   R3=pkt_end R7=inv,min_value=0,max_value=1
   R8=imm1,min_value=1,max_value=1,min_align=1 R10=fp
  9: (b7) r0 = 0
  10: (95) exit

  from 6 to 11:
   R0=pkt(id=0,off=14,r=0) R1=ctx R2=pkt(id=0,off=0,r=0)
   R3=pkt_end R7=inv,min_value=2
   R8=imm1,min_value=1,max_value=1,min_align=1 R10=fp
  11: (07) r0 += 1
  12: (b7) r8 = 0
  13: (05) goto pc-7                <----- mismatch due to r8
  7: (2d) if r0 > r3 goto pc+1
   R0=pkt(id=0,off=15,r=15) R1=ctx R2=pkt(id=0,off=0,r=15)
   R3=pkt_end R7=inv,min_value=2
   R8=imm0,min_value=0,max_value=0,min_align=2147483648 R10=fp
  8: (63) *(u32 *)(r0 -4) = r0
  misaligned packet access off 2+15+-4 size 4

The reason why we fail to see it in states_equal() is that the
third test in compare_ptrs_to_packet() ...

  if (old->off <= cur->off &&
      old->off >= old->range && cur->off >= cur->range)
          return true;

... will let the above pass. The situation we run into is that
old->off <= cur->off (14 <= 15), meaning that prior walked paths
went with smaller offset, which was later used in the packet
access after successful packet range check and found to be safe
already.

For example: Given is R0=pkt(id=0,off=0,r=0). Adding offset 14
as in above program to it, results in R0=pkt(id=0,off=14,r=0)
before the packet range test. Now, testing this against R3=pkt_end
with 'if r0 > r3 goto out' will transform R0 into R0=pkt(id=0,off=14,r=14)
for the case when we're within bounds. A write into the packet
at offset *(u32 *)(r0 -4), that is, 2 + 14 -4, is valid and
aligned (2 is for NET_IP_ALIGN). After processing this with
all fall-through paths, we later on check paths from branches.
When the above skb->mark test is true, then we jump near the
end of the program, perform r0 += 1, and jump back to the
'if r0 > r3 goto out' test we've visited earlier already. This
time, R0 is of type R0=pkt(id=0,off=15,r=0), and we'll prune
that part because this time we'll have a larger safe packet
range, and we already found that with off=14 all further insn
were already safe, so it's safe as well with a larger off.
However, the problem is that the subsequent write into the packet
with 2 + 15 -4 is then unaligned, and not caught by the alignment
tracking. Note that min_align, aux_off, and aux_off_align were
all 0 in this example.

Since we cannot tell at this time what kind of packet access was
performed in the prior walk and what minimal requirements it has
(we might do so in the future, but that requires more complexity),
fix it to disable this pruning case for strict alignment for now,
and let the verifier do check such paths instead. With that applied,
the test cases pass and reject the program due to misalignment.

Fixes: d1174416747d ("bpf: Track alignment of register values in the verifier.")
Reference: http://patchwork.ozlabs.org/patch/761909/
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c72cd41f5b8b..e37e06b1229d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -843,9 +843,6 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
 {
 	bool strict = env->strict_alignment;
 
-	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
-		strict = true;
-
 	switch (reg->type) {
 	case PTR_TO_PACKET:
 		return check_pkt_ptr_alignment(reg, off, size, strict);
@@ -2696,7 +2693,8 @@ err_free:
 /* the following conditions reduce the number of explored insns
  * from ~140k to ~80k for ultra large programs that use a lot of ptr_to_packet
  */
-static bool compare_ptrs_to_packet(struct bpf_reg_state *old,
+static bool compare_ptrs_to_packet(struct bpf_verifier_env *env,
+				   struct bpf_reg_state *old,
 				   struct bpf_reg_state *cur)
 {
 	if (old->id != cur->id)
@@ -2739,7 +2737,7 @@ static bool compare_ptrs_to_packet(struct bpf_reg_state *old,
 	 * 'if (R4 > data_end)' and all further insn were already good with r=20,
 	 * so they will be good with r=30 and we can prune the search.
 	 */
-	if (old->off <= cur->off &&
+	if (!env->strict_alignment && old->off <= cur->off &&
 	    old->off >= old->range && cur->off >= cur->range)
 		return true;
 
@@ -2810,7 +2808,7 @@ static bool states_equal(struct bpf_verifier_env *env,
 			continue;
 
 		if (rold->type == PTR_TO_PACKET && rcur->type == PTR_TO_PACKET &&
-		    compare_ptrs_to_packet(rold, rcur))
+		    compare_ptrs_to_packet(env, rold, rcur))
 			continue;
 
 		return false;
@@ -3588,10 +3586,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 	} else {
 		log_level = 0;
 	}
-	if (attr->prog_flags & BPF_F_STRICT_ALIGNMENT)
+
+	env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT);
+	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
 		env->strict_alignment = true;
-	else
-		env->strict_alignment = false;
 
 	ret = replace_map_fd_with_map_ptr(env);
 	if (ret < 0)
@@ -3697,7 +3695,10 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
 	mutex_lock(&bpf_verifier_lock);
 
 	log_level = 0;
+
 	env->strict_alignment = false;
+	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
+		env->strict_alignment = true;
 
 	env->explored_states = kcalloc(env->prog->len,
 				       sizeof(struct bpf_verifier_state_list *),
-- 
cgit v1.2.3


From a9789ef9afcb4fb0193f8dd94f2665ba3ad71e79 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 25 May 2017 01:05:06 +0200
Subject: bpf: properly reset caller saved regs after helper call and
 ld_abs/ind

Currently, after performing helper calls, we clear all caller saved
registers, that is r0 - r5 and fill r0 depending on struct bpf_func_proto
specification. The way we reset these regs can affect pruning decisions
in later paths, since we only reset register's imm to 0 and type to
NOT_INIT. However, we leave out clearing of other variables such as id,
min_value, max_value, etc, which can later on lead to pruning mismatches
due to stale data.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 37 ++++++++++++++++---------------------
 1 file changed, 16 insertions(+), 21 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e37e06b1229d..339c8a1371de 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -463,19 +463,22 @@ static const int caller_saved[CALLER_SAVED_REGS] = {
 	BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
 };
 
+static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno)
+{
+	BUG_ON(regno >= MAX_BPF_REG);
+
+	memset(&regs[regno], 0, sizeof(regs[regno]));
+	regs[regno].type = NOT_INIT;
+	regs[regno].min_value = BPF_REGISTER_MIN_RANGE;
+	regs[regno].max_value = BPF_REGISTER_MAX_RANGE;
+}
+
 static void init_reg_state(struct bpf_reg_state *regs)
 {
 	int i;
 
-	for (i = 0; i < MAX_BPF_REG; i++) {
-		regs[i].type = NOT_INIT;
-		regs[i].imm = 0;
-		regs[i].min_value = BPF_REGISTER_MIN_RANGE;
-		regs[i].max_value = BPF_REGISTER_MAX_RANGE;
-		regs[i].min_align = 0;
-		regs[i].aux_off = 0;
-		regs[i].aux_off_align = 0;
-	}
+	for (i = 0; i < MAX_BPF_REG; i++)
+		mark_reg_not_init(regs, i);
 
 	/* frame pointer */
 	regs[BPF_REG_FP].type = FRAME_PTR;
@@ -1346,7 +1349,6 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 	struct bpf_verifier_state *state = &env->cur_state;
 	const struct bpf_func_proto *fn = NULL;
 	struct bpf_reg_state *regs = state->regs;
-	struct bpf_reg_state *reg;
 	struct bpf_call_arg_meta meta;
 	bool changes_data;
 	int i, err;
@@ -1413,11 +1415,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 	}
 
 	/* reset caller saved regs */
-	for (i = 0; i < CALLER_SAVED_REGS; i++) {
-		reg = regs + caller_saved[i];
-		reg->type = NOT_INIT;
-		reg->imm = 0;
-	}
+	for (i = 0; i < CALLER_SAVED_REGS; i++)
+		mark_reg_not_init(regs, caller_saved[i]);
 
 	/* update return register */
 	if (fn->ret_type == RET_INTEGER) {
@@ -2445,7 +2444,6 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 {
 	struct bpf_reg_state *regs = env->cur_state.regs;
 	u8 mode = BPF_MODE(insn->code);
-	struct bpf_reg_state *reg;
 	int i, err;
 
 	if (!may_access_skb(env->prog->type)) {
@@ -2478,11 +2476,8 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 	}
 
 	/* reset caller saved regs to unreadable */
-	for (i = 0; i < CALLER_SAVED_REGS; i++) {
-		reg = regs + caller_saved[i];
-		reg->type = NOT_INIT;
-		reg->imm = 0;
-	}
+	for (i = 0; i < CALLER_SAVED_REGS; i++)
+		mark_reg_not_init(regs, caller_saved[i]);
 
 	/* mark destination R0 register as readable, since it contains
 	 * the value fetched from the packet
-- 
cgit v1.2.3


From a316338cb71a3260201490e615f2f6d5c0d8fb2c Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 25 May 2017 01:05:08 +0200
Subject: bpf: fix wrong exposure of map_flags into fdinfo for lpm

trie_alloc() always needs to have BPF_F_NO_PREALLOC passed in via
attr->map_flags, since it does not support preallocation yet. We
check the flag, but we never copy the flag into trie->map.map_flags,
which is later on exposed into fdinfo and used by loaders such as
iproute2. Latter uses this in bpf_map_selfcheck_pinned() to test
whether a pinned map has the same spec as the one from the BPF obj
file and if not, bails out, which is currently the case for lpm
since it exposes always 0 as flags.

Also copy over flags in array_map_alloc() and stack_map_alloc().
They always have to be 0 right now, but we should make sure to not
miss to copy them over at a later point in time when we add actual
flags for them to use.

Fixes: b95a5c4db09b ("bpf: add a longest prefix match trie map implementation")
Reported-by: Jarno Rajahalme <jarno@covalent.io>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/arraymap.c | 1 +
 kernel/bpf/lpm_trie.c | 1 +
 kernel/bpf/stackmap.c | 1 +
 3 files changed, 3 insertions(+)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 5e00b2333c26..172dc8ee0e3b 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -86,6 +86,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 	array->map.key_size = attr->key_size;
 	array->map.value_size = attr->value_size;
 	array->map.max_entries = attr->max_entries;
+	array->map.map_flags = attr->map_flags;
 	array->elem_size = elem_size;
 
 	if (!percpu)
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 39cfafd895b8..b09185f0f17d 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -432,6 +432,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
 	trie->map.key_size = attr->key_size;
 	trie->map.value_size = attr->value_size;
 	trie->map.max_entries = attr->max_entries;
+	trie->map.map_flags = attr->map_flags;
 	trie->data_size = attr->key_size -
 			  offsetof(struct bpf_lpm_trie_key, data);
 	trie->max_prefixlen = trie->data_size * 8;
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 4dfd6f2ec2f9..31147d730abf 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -88,6 +88,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 	smap->map.key_size = attr->key_size;
 	smap->map.value_size = value_size;
 	smap->map.max_entries = attr->max_entries;
+	smap->map.map_flags = attr->map_flags;
 	smap->n_buckets = n_buckets;
 	smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
 
-- 
cgit v1.2.3


From 71189fa9b092ef125ee741eccb2f5fa916798afd Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Tue, 30 May 2017 13:31:27 -0700
Subject: bpf: free up BPF_JMP | BPF_CALL | BPF_X opcode

free up BPF_JMP | BPF_CALL | BPF_X opcode to be used by actual
indirect call by register and use kernel internal opcode to
mark call instruction into bpf_tail_call() helper.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/core.c     | 2 +-
 kernel/bpf/verifier.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index dedf367f59bb..339289402b96 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -824,7 +824,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
 		[BPF_ALU64 | BPF_NEG] = &&ALU64_NEG,
 		/* Call instruction */
 		[BPF_JMP | BPF_CALL] = &&JMP_CALL,
-		[BPF_JMP | BPF_CALL | BPF_X] = &&JMP_TAIL_CALL,
+		[BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL,
 		/* Jumps */
 		[BPF_JMP | BPF_JA] = &&JMP_JA,
 		[BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X,
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 339c8a1371de..28113d0e8e92 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3469,7 +3469,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 			 * that doesn't support bpf_tail_call yet
 			 */
 			insn->imm = 0;
-			insn->code |= BPF_X;
+			insn->code = BPF_JMP | BPF_TAIL_CALL;
 			continue;
 		}
 
-- 
cgit v1.2.3


From f696b8f471ec987e987e38206b8eb23c39ee5a86 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Tue, 30 May 2017 13:31:28 -0700
Subject: bpf: split bpf core interpreter

split __bpf_prog_run() interpreter into stack allocation and execution parts.
The code section shrinks which helps interpreter performance in some cases.
   text	   data	    bss	    dec	    hex	filename
  26350	  10328	    624	  37302	   91b6	kernel/bpf/core.o.before
  25777	  10328	    624	  36729	   8f79	kernel/bpf/core.o.after

Very short programs got slower (due to extra function call):
Before:
test_bpf: #89 ALU64_ADD_K: 1 + 2 = 3 jited:0 7 PASS
test_bpf: #90 ALU64_ADD_K: 3 + 0 = 3 jited:0 8 PASS
test_bpf: #91 ALU64_ADD_K: 1 + 2147483646 = 2147483647 jited:0 7 PASS
test_bpf: #92 ALU64_ADD_K: 4294967294 + 2 = 4294967296 jited:0 11 PASS
test_bpf: #93 ALU64_ADD_K: 2147483646 + -2147483647 = -1 jited:0 7 PASS
After:
test_bpf: #89 ALU64_ADD_K: 1 + 2 = 3 jited:0 11 PASS
test_bpf: #90 ALU64_ADD_K: 3 + 0 = 3 jited:0 11 PASS
test_bpf: #91 ALU64_ADD_K: 1 + 2147483646 = 2147483647 jited:0 11 PASS
test_bpf: #92 ALU64_ADD_K: 4294967294 + 2 = 4294967296 jited:0 14 PASS
test_bpf: #93 ALU64_ADD_K: 2147483646 + -2147483647 = -1 jited:0 10 PASS

Longer programs got faster:
Before:
test_bpf: #266 BPF_MAXINSNS: Ctx heavy transformations jited:0 20286 20513 PASS
test_bpf: #267 BPF_MAXINSNS: Call heavy transformations jited:0 31853 31768 PASS
test_bpf: #268 BPF_MAXINSNS: Jump heavy test jited:0 9815 PASS
test_bpf: #269 BPF_MAXINSNS: Very long jump backwards jited:0 6 PASS
test_bpf: #270 BPF_MAXINSNS: Edge hopping nuthouse jited:0 13959 PASS
test_bpf: #271 BPF_MAXINSNS: Jump, gap, jump, ... jited:0 210 PASS
test_bpf: #272 BPF_MAXINSNS: ld_abs+get_processor_id jited:0 21724 PASS
test_bpf: #273 BPF_MAXINSNS: ld_abs+vlan_push/pop jited:0 19118 PASS
After:
test_bpf: #266 BPF_MAXINSNS: Ctx heavy transformations jited:0 19008 18827 PASS
test_bpf: #267 BPF_MAXINSNS: Call heavy transformations jited:0 29238 28450 PASS
test_bpf: #268 BPF_MAXINSNS: Jump heavy test jited:0 9485 PASS
test_bpf: #269 BPF_MAXINSNS: Very long jump backwards jited:0 12 PASS
test_bpf: #270 BPF_MAXINSNS: Edge hopping nuthouse jited:0 13257 PASS
test_bpf: #271 BPF_MAXINSNS: Jump, gap, jump, ... jited:0 213 PASS
test_bpf: #272 BPF_MAXINSNS: ld_abs+get_processor_id jited:0 19389 PASS
test_bpf: #273 BPF_MAXINSNS: ld_abs+vlan_push/pop jited:0 19583 PASS

For real world production programs the difference is noise.

This patch is first step towards reducing interpreter stack consumption.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/core.c | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 339289402b96..abd410d394bc 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -763,10 +763,10 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
  *
  * Decode and execute eBPF instructions.
  */
-static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
+static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn,
+				    u64 *stack)
 {
-	u64 stack[MAX_BPF_STACK / sizeof(u64)];
-	u64 regs[MAX_BPF_REG], tmp;
+	u64 tmp;
 	static const void *jumptable[256] = {
 		[0 ... 255] = &&default_label,
 		/* Now overwrite non-defaults ... */
@@ -874,9 +874,6 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
 #define CONT	 ({ insn++; goto select_insn; })
 #define CONT_JMP ({ insn++; goto select_insn; })
 
-	FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)];
-	ARG1 = (u64) (unsigned long) ctx;
-
 select_insn:
 	goto *jumptable[insn->code];
 
@@ -1219,7 +1216,17 @@ load_byte:
 		WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code);
 		return 0;
 }
-STACK_FRAME_NON_STANDARD(__bpf_prog_run); /* jump table */
+STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */
+
+static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
+{
+	u64 stack[MAX_BPF_STACK / sizeof(u64)];
+	u64 regs[MAX_BPF_REG];
+
+	FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)];
+	ARG1 = (u64) (unsigned long) ctx;
+	return ___bpf_prog_run(regs, insn, stack);
+}
 
 bool bpf_prog_array_compatible(struct bpf_array *array,
 			       const struct bpf_prog *fp)
-- 
cgit v1.2.3


From 8726679a0fa317f8e83d0843b266453f31bff092 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Tue, 30 May 2017 13:31:29 -0700
Subject: bpf: teach verifier to track stack depth

teach verifier to track bpf program stack depth

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 28113d0e8e92..d96f27ff9f6f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -926,6 +926,10 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
 			verbose("invalid stack off=%d size=%d\n", off, size);
 			return -EACCES;
 		}
+
+		if (env->prog->aux->stack_depth < -off)
+			env->prog->aux->stack_depth = -off;
+
 		if (t == BPF_WRITE) {
 			if (!env->allow_ptr_leaks &&
 			    state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL &&
@@ -1032,6 +1036,9 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
 		return -EACCES;
 	}
 
+	if (env->prog->aux->stack_depth < -off)
+		env->prog->aux->stack_depth = -off;
+
 	if (meta && meta->raw_mode) {
 		meta->access_size = access_size;
 		meta->regno = regno;
@@ -3167,7 +3174,8 @@ process_bpf_exit:
 		insn_idx++;
 	}
 
-	verbose("processed %d insns\n", insn_processed);
+	verbose("processed %d insns, stack depth %d\n",
+		insn_processed, env->prog->aux->stack_depth);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 80a58d02559465b0ea403ff91c8bca9a733b1b0d Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Tue, 30 May 2017 13:31:30 -0700
Subject: bpf: reconcile bpf_tail_call and stack_depth

The next set of patches will take advantage of stack_depth tracking,
so make sure that the program that does bpf_tail_call() has
stack depth large enough for the callee.
We could have tracked the stack depth of the prog_array owner program
and only allow insertion of the programs with stack depth less
than the owner, but it will break existing applications.
Some of them have trivial root bpf program that only does
multiple bpf_tail_calls and at init time the prog array is empty.
In the future we may add a flag to do such tracking optionally,
but for now play simple and safe.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d96f27ff9f6f..14ccb0759fa4 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3470,6 +3470,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 			 * the program array.
 			 */
 			prog->cb_access = 1;
+			env->prog->aux->stack_depth = MAX_BPF_STACK;
 
 			/* mark bpf_tail_call as different opcode to avoid
 			 * conditional branch in the interpeter for every normal
-- 
cgit v1.2.3


From b870aa901f4be1d32c13faf9e8f40bf2a8562e19 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Tue, 30 May 2017 13:31:33 -0700
Subject: bpf: use different interpreter depending on required stack size

16 __bpf_prog_run() interpreters for various stack sizes add .text
but not a lot comparing to run-time stack savings

   text	   data	    bss	    dec	    hex	filename
  26350   10328     624   37302    91b6 kernel/bpf/core.o.before_split
  25777   10328     624   36729    8f79 kernel/bpf/core.o.after_split
  26970	  10328	    624	  37922	   9422	kernel/bpf/core.o.now

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/core.c | 40 +++++++++++++++++++++++++++++++---------
 1 file changed, 31 insertions(+), 9 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index abd410d394bc..774069ca18a7 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1218,16 +1218,38 @@ load_byte:
 }
 STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */
 
-static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
-{
-	u64 stack[MAX_BPF_STACK / sizeof(u64)];
-	u64 regs[MAX_BPF_REG];
-
-	FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)];
-	ARG1 = (u64) (unsigned long) ctx;
-	return ___bpf_prog_run(regs, insn, stack);
+#define PROG_NAME(stack_size) __bpf_prog_run##stack_size
+#define DEFINE_BPF_PROG_RUN(stack_size) \
+static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \
+{ \
+	u64 stack[stack_size / sizeof(u64)]; \
+	u64 regs[MAX_BPF_REG]; \
+\
+	FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
+	ARG1 = (u64) (unsigned long) ctx; \
+	return ___bpf_prog_run(regs, insn, stack); \
 }
 
+#define EVAL1(FN, X) FN(X)
+#define EVAL2(FN, X, Y...) FN(X) EVAL1(FN, Y)
+#define EVAL3(FN, X, Y...) FN(X) EVAL2(FN, Y)
+#define EVAL4(FN, X, Y...) FN(X) EVAL3(FN, Y)
+#define EVAL5(FN, X, Y...) FN(X) EVAL4(FN, Y)
+#define EVAL6(FN, X, Y...) FN(X) EVAL5(FN, Y)
+
+EVAL6(DEFINE_BPF_PROG_RUN, 32, 64, 96, 128, 160, 192);
+EVAL6(DEFINE_BPF_PROG_RUN, 224, 256, 288, 320, 352, 384);
+EVAL4(DEFINE_BPF_PROG_RUN, 416, 448, 480, 512);
+
+#define PROG_NAME_LIST(stack_size) PROG_NAME(stack_size),
+
+static unsigned int (*interpreters[])(const void *ctx,
+				      const struct bpf_insn *insn) = {
+EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192)
+EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384)
+EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
+};
+
 bool bpf_prog_array_compatible(struct bpf_array *array,
 			       const struct bpf_prog *fp)
 {
@@ -1275,7 +1297,7 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
  */
 struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
 {
-	fp->bpf_func = (void *) __bpf_prog_run;
+	fp->bpf_func = interpreters[round_down(fp->aux->stack_depth, 32) / 32];
 
 	/* eBPF JITs can rewrite the program in case constant
 	 * blinding is active. However, in case of error during
-- 
cgit v1.2.3


From fb9a307d11d62749d75b404f15517d73f5d6e148 Mon Sep 17 00:00:00 2001
From: Chenbo Feng <fengc@google.com>
Date: Wed, 31 May 2017 18:15:59 -0700
Subject: bpf: Allow CGROUP_SKB eBPF program to access sk_buff

This allows cgroup eBPF program to classify packet based on their
protocol or other detail information. Currently program need
CAP_NET_ADMIN privilege to attach a cgroup eBPF program, and A
process with CAP_NET_ADMIN can already see all packets on the system,
for example, by creating an iptables rules that causes the packet to
be passed to userspace via NFLOG.

Signed-off-by: Chenbo Feng <fengc@google.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 14ccb0759fa4..8acae64df255 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2426,6 +2426,7 @@ static bool may_access_skb(enum bpf_prog_type type)
 	case BPF_PROG_TYPE_SOCKET_FILTER:
 	case BPF_PROG_TYPE_SCHED_CLS:
 	case BPF_PROG_TYPE_SCHED_ACT:
+	case BPF_PROG_TYPE_CGROUP_SKB:
 		return true;
 	default:
 		return false;
-- 
cgit v1.2.3


From 80b7d81912d807f161d55e9c2c9cc81061666f83 Mon Sep 17 00:00:00 2001
From: Chenbo Feng <fengc@google.com>
Date: Wed, 31 May 2017 18:16:00 -0700
Subject: bpf: Remove the capability check for cgroup skb eBPF program

Currently loading a cgroup skb eBPF program require a CAP_SYS_ADMIN
capability while attaching the program to a cgroup only requires the
user have CAP_NET_ADMIN privilege. We can escape the capability
check when load the program just like socket filter program to make
the capability requirement consistent.

Change since v1:
Change the code style in order to be compliant with checkpatch.pl
preference

Signed-off-by: Chenbo Feng <fengc@google.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/syscall.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 265a0d854e33..59da103adb85 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -815,7 +815,9 @@ static int bpf_prog_load(union bpf_attr *attr)
 	    attr->kern_version != LINUX_VERSION_CODE)
 		return -EINVAL;
 
-	if (type != BPF_PROG_TYPE_SOCKET_FILTER && !capable(CAP_SYS_ADMIN))
+	if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
+	    type != BPF_PROG_TYPE_CGROUP_SKB &&
+	    !capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
 	/* plain bpf_prog allocation */
-- 
cgit v1.2.3


From f91840a32deef5cb1bf73338bc5010f843b01426 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Fri, 2 Jun 2017 21:03:52 -0700
Subject: perf, bpf: Add BPF support to all perf_event types

Allow BPF_PROG_TYPE_PERF_EVENT program types to attach to all
perf_event types, including HW_CACHE, RAW, and dynamic pmu events.
Only tracepoint/kprobe events are treated differently which require
BPF_PROG_TYPE_TRACEPOINT/BPF_PROG_TYPE_KPROBE program types accordingly.

Also add support for reading all event counters using
bpf_perf_event_read() helper.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/arraymap.c | 28 +++++++---------------------
 1 file changed, 7 insertions(+), 21 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 172dc8ee0e3b..ecb43542246e 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -452,38 +452,24 @@ static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee)
 static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
 					 struct file *map_file, int fd)
 {
-	const struct perf_event_attr *attr;
 	struct bpf_event_entry *ee;
 	struct perf_event *event;
 	struct file *perf_file;
+	u64 value;
 
 	perf_file = perf_event_get(fd);
 	if (IS_ERR(perf_file))
 		return perf_file;
 
+	ee = ERR_PTR(-EOPNOTSUPP);
 	event = perf_file->private_data;
-	ee = ERR_PTR(-EINVAL);
-
-	attr = perf_event_attrs(event);
-	if (IS_ERR(attr) || attr->inherit)
+	if (perf_event_read_local(event, &value) == -EOPNOTSUPP)
 		goto err_out;
 
-	switch (attr->type) {
-	case PERF_TYPE_SOFTWARE:
-		if (attr->config != PERF_COUNT_SW_BPF_OUTPUT)
-			goto err_out;
-		/* fall-through */
-	case PERF_TYPE_RAW:
-	case PERF_TYPE_HARDWARE:
-		ee = bpf_event_entry_gen(perf_file, map_file);
-		if (ee)
-			return ee;
-		ee = ERR_PTR(-ENOMEM);
-		/* fall-through */
-	default:
-		break;
-	}
-
+	ee = bpf_event_entry_gen(perf_file, map_file);
+	if (ee)
+		return ee;
+	ee = ERR_PTR(-ENOMEM);
 err_out:
 	fput(perf_file);
 	return ee;
-- 
cgit v1.2.3


From dc4bb0e2356149aee4cdae061936f3bbdd45595c Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Mon, 5 Jun 2017 12:15:46 -0700
Subject: bpf: Introduce bpf_prog ID

This patch generates an unique ID for each BPF_PROG_LOAD-ed prog.
It is worth to note that each BPF_PROG_LOAD-ed prog will have
a different ID even they have the same bpf instructions.

The ID is generated by the existing idr_alloc_cyclic().
The ID is ranged from [1, INT_MAX).  It is allocated in cyclic manner,
so an ID will get reused every 2 billion BPF_PROG_LOAD.

The bpf_prog_alloc_id() is done after bpf_prog_select_runtime()
because the jit process may have allocated a new prog.  Hence,
we need to ensure the value of pointer 'prog' will not be changed
any more before storing the prog to the prog_idr.

After bpf_prog_select_runtime(), the prog is read-only.  Hence,
the id is stored in 'struct bpf_prog_aux'.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/syscall.c | 40 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 39 insertions(+), 1 deletion(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 59da103adb85..2a1b32b470f1 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -22,8 +22,11 @@
 #include <linux/filter.h>
 #include <linux/version.h>
 #include <linux/kernel.h>
+#include <linux/idr.h>
 
 DEFINE_PER_CPU(int, bpf_prog_active);
+static DEFINE_IDR(prog_idr);
+static DEFINE_SPINLOCK(prog_idr_lock);
 
 int sysctl_unprivileged_bpf_disabled __read_mostly;
 
@@ -650,6 +653,34 @@ static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
 	free_uid(user);
 }
 
+static int bpf_prog_alloc_id(struct bpf_prog *prog)
+{
+	int id;
+
+	spin_lock_bh(&prog_idr_lock);
+	id = idr_alloc_cyclic(&prog_idr, prog, 1, INT_MAX, GFP_ATOMIC);
+	if (id > 0)
+		prog->aux->id = id;
+	spin_unlock_bh(&prog_idr_lock);
+
+	/* id is in [1, INT_MAX) */
+	if (WARN_ON_ONCE(!id))
+		return -ENOSPC;
+
+	return id > 0 ? 0 : id;
+}
+
+static void bpf_prog_free_id(struct bpf_prog *prog)
+{
+	/* cBPF to eBPF migrations are currently not in the idr store. */
+	if (!prog->aux->id)
+		return;
+
+	spin_lock_bh(&prog_idr_lock);
+	idr_remove(&prog_idr, prog->aux->id);
+	spin_unlock_bh(&prog_idr_lock);
+}
+
 static void __bpf_prog_put_rcu(struct rcu_head *rcu)
 {
 	struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
@@ -663,6 +694,7 @@ void bpf_prog_put(struct bpf_prog *prog)
 {
 	if (atomic_dec_and_test(&prog->aux->refcnt)) {
 		trace_bpf_prog_put_rcu(prog);
+		bpf_prog_free_id(prog);
 		bpf_prog_kallsyms_del(prog);
 		call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
 	}
@@ -857,15 +889,21 @@ static int bpf_prog_load(union bpf_attr *attr)
 	if (err < 0)
 		goto free_used_maps;
 
+	err = bpf_prog_alloc_id(prog);
+	if (err)
+		goto free_used_maps;
+
 	err = bpf_prog_new_fd(prog);
 	if (err < 0)
 		/* failed to allocate fd */
-		goto free_used_maps;
+		goto free_id;
 
 	bpf_prog_kallsyms_add(prog);
 	trace_bpf_prog_load(prog, err);
 	return err;
 
+free_id:
+	bpf_prog_free_id(prog);
 free_used_maps:
 	free_used_maps(prog->aux);
 free_prog:
-- 
cgit v1.2.3


From f3f1c054c288bb6e503005e6d73611151ed20e91 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Mon, 5 Jun 2017 12:15:47 -0700
Subject: bpf: Introduce bpf_map ID

This patch generates an unique ID for each created bpf_map.
The approach is similar to the earlier patch for bpf_prog ID.

It is worth to note that the bpf_map's ID and bpf_prog's ID
are in two independent ID spaces and both have the same valid range:
[1, INT_MAX).

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/syscall.c | 34 +++++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 2a1b32b470f1..4c3075b5d840 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -27,6 +27,8 @@
 DEFINE_PER_CPU(int, bpf_prog_active);
 static DEFINE_IDR(prog_idr);
 static DEFINE_SPINLOCK(prog_idr_lock);
+static DEFINE_IDR(map_idr);
+static DEFINE_SPINLOCK(map_idr_lock);
 
 int sysctl_unprivileged_bpf_disabled __read_mostly;
 
@@ -117,6 +119,29 @@ static void bpf_map_uncharge_memlock(struct bpf_map *map)
 	free_uid(user);
 }
 
+static int bpf_map_alloc_id(struct bpf_map *map)
+{
+	int id;
+
+	spin_lock_bh(&map_idr_lock);
+	id = idr_alloc_cyclic(&map_idr, map, 1, INT_MAX, GFP_ATOMIC);
+	if (id > 0)
+		map->id = id;
+	spin_unlock_bh(&map_idr_lock);
+
+	if (WARN_ON_ONCE(!id))
+		return -ENOSPC;
+
+	return id > 0 ? 0 : id;
+}
+
+static void bpf_map_free_id(struct bpf_map *map)
+{
+	spin_lock_bh(&map_idr_lock);
+	idr_remove(&map_idr, map->id);
+	spin_unlock_bh(&map_idr_lock);
+}
+
 /* called from workqueue */
 static void bpf_map_free_deferred(struct work_struct *work)
 {
@@ -141,6 +166,7 @@ static void bpf_map_put_uref(struct bpf_map *map)
 void bpf_map_put(struct bpf_map *map)
 {
 	if (atomic_dec_and_test(&map->refcnt)) {
+		bpf_map_free_id(map);
 		INIT_WORK(&map->work, bpf_map_free_deferred);
 		schedule_work(&map->work);
 	}
@@ -239,14 +265,20 @@ static int map_create(union bpf_attr *attr)
 	if (err)
 		goto free_map_nouncharge;
 
+	err = bpf_map_alloc_id(map);
+	if (err)
+		goto free_map;
+
 	err = bpf_map_new_fd(map);
 	if (err < 0)
 		/* failed to allocate fd */
-		goto free_map;
+		goto free_id;
 
 	trace_bpf_map_create(map, err);
 	return err;
 
+free_id:
+	bpf_map_free_id(map);
 free_map:
 	bpf_map_uncharge_memlock(map);
 free_map_nouncharge:
-- 
cgit v1.2.3


From 34ad5580f8f9c86cb273ebea25c149613cd1667e Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Mon, 5 Jun 2017 12:15:48 -0700
Subject: bpf: Add BPF_(PROG|MAP)_GET_NEXT_ID command

This patch adds BPF_PROG_GET_NEXT_ID and BPF_MAP_GET_NEXT_ID
to allow userspace to iterate all bpf_prog IDs and bpf_map IDs.

The API is trying to be consistent with the existing
BPF_MAP_GET_NEXT_KEY.

It is currently limited to CAP_SYS_ADMIN which we can
consider to lift it in followup patches.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/syscall.c | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 4c3075b5d840..2405feedb8c1 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -166,6 +166,7 @@ static void bpf_map_put_uref(struct bpf_map *map)
 void bpf_map_put(struct bpf_map *map)
 {
 	if (atomic_dec_and_test(&map->refcnt)) {
+		/* bpf_map_free_id() must be called first */
 		bpf_map_free_id(map);
 		INIT_WORK(&map->work, bpf_map_free_deferred);
 		schedule_work(&map->work);
@@ -726,6 +727,7 @@ void bpf_prog_put(struct bpf_prog *prog)
 {
 	if (atomic_dec_and_test(&prog->aux->refcnt)) {
 		trace_bpf_prog_put_rcu(prog);
+		/* bpf_prog_free_id() must be called first */
 		bpf_prog_free_id(prog);
 		bpf_prog_kallsyms_del(prog);
 		call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
@@ -1069,6 +1071,34 @@ static int bpf_prog_test_run(const union bpf_attr *attr,
 	return ret;
 }
 
+#define BPF_OBJ_GET_NEXT_ID_LAST_FIELD next_id
+
+static int bpf_obj_get_next_id(const union bpf_attr *attr,
+			       union bpf_attr __user *uattr,
+			       struct idr *idr,
+			       spinlock_t *lock)
+{
+	u32 next_id = attr->start_id;
+	int err = 0;
+
+	if (CHECK_ATTR(BPF_OBJ_GET_NEXT_ID) || next_id >= INT_MAX)
+		return -EINVAL;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	next_id++;
+	spin_lock_bh(lock);
+	if (!idr_get_next(idr, &next_id))
+		err = -ENOENT;
+	spin_unlock_bh(lock);
+
+	if (!err)
+		err = put_user(next_id, &uattr->next_id);
+
+	return err;
+}
+
 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
 {
 	union bpf_attr attr = {};
@@ -1146,6 +1176,14 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_PROG_TEST_RUN:
 		err = bpf_prog_test_run(&attr, uattr);
 		break;
+	case BPF_PROG_GET_NEXT_ID:
+		err = bpf_obj_get_next_id(&attr, uattr,
+					  &prog_idr, &prog_idr_lock);
+		break;
+	case BPF_MAP_GET_NEXT_ID:
+		err = bpf_obj_get_next_id(&attr, uattr,
+					  &map_idr, &map_idr_lock);
+		break;
 	default:
 		err = -EINVAL;
 		break;
-- 
cgit v1.2.3


From b16d9aa4c2b90af8d2c3201e245150f8c430c3bc Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Mon, 5 Jun 2017 12:15:49 -0700
Subject: bpf: Add BPF_PROG_GET_FD_BY_ID

Add BPF_PROG_GET_FD_BY_ID command to allow user to get a fd
from a bpf_prog's ID.

bpf_prog_inc_not_zero() is added and is called with prog_idr_lock
held.

__bpf_prog_put() is also added which has the 'bool do_idr_lock'
param to decide if the prog_idr_lock should be acquired when
freeing the prog->id.

In the error path of bpf_prog_inc_not_zero(), it may have to
call __bpf_prog_put(map, false) which does not need
to take the prog_idr_lock when freeing the prog->id.

It is currently limited to CAP_SYS_ADMIN which we can
consider to lift it in followup patches.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/syscall.c | 91 ++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 81 insertions(+), 10 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 2405feedb8c1..dc6253bb8ebb 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -703,15 +703,23 @@ static int bpf_prog_alloc_id(struct bpf_prog *prog)
 	return id > 0 ? 0 : id;
 }
 
-static void bpf_prog_free_id(struct bpf_prog *prog)
+static void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
 {
 	/* cBPF to eBPF migrations are currently not in the idr store. */
 	if (!prog->aux->id)
 		return;
 
-	spin_lock_bh(&prog_idr_lock);
+	if (do_idr_lock)
+		spin_lock_bh(&prog_idr_lock);
+	else
+		__acquire(&prog_idr_lock);
+
 	idr_remove(&prog_idr, prog->aux->id);
-	spin_unlock_bh(&prog_idr_lock);
+
+	if (do_idr_lock)
+		spin_unlock_bh(&prog_idr_lock);
+	else
+		__release(&prog_idr_lock);
 }
 
 static void __bpf_prog_put_rcu(struct rcu_head *rcu)
@@ -723,16 +731,21 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
 	bpf_prog_free(aux->prog);
 }
 
-void bpf_prog_put(struct bpf_prog *prog)
+static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
 {
 	if (atomic_dec_and_test(&prog->aux->refcnt)) {
 		trace_bpf_prog_put_rcu(prog);
 		/* bpf_prog_free_id() must be called first */
-		bpf_prog_free_id(prog);
+		bpf_prog_free_id(prog, do_idr_lock);
 		bpf_prog_kallsyms_del(prog);
 		call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
 	}
 }
+
+void bpf_prog_put(struct bpf_prog *prog)
+{
+	__bpf_prog_put(prog, true);
+}
 EXPORT_SYMBOL_GPL(bpf_prog_put);
 
 static int bpf_prog_release(struct inode *inode, struct file *filp)
@@ -814,6 +827,24 @@ struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
 }
 EXPORT_SYMBOL_GPL(bpf_prog_inc);
 
+/* prog_idr_lock should have been held */
+static struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
+{
+	int refold;
+
+	refold = __atomic_add_unless(&prog->aux->refcnt, 1, 0);
+
+	if (refold >= BPF_MAX_REFCNT) {
+		__bpf_prog_put(prog, false);
+		return ERR_PTR(-EBUSY);
+	}
+
+	if (!refold)
+		return ERR_PTR(-ENOENT);
+
+	return prog;
+}
+
 static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type)
 {
 	struct fd f = fdget(ufd);
@@ -928,16 +959,21 @@ static int bpf_prog_load(union bpf_attr *attr)
 		goto free_used_maps;
 
 	err = bpf_prog_new_fd(prog);
-	if (err < 0)
-		/* failed to allocate fd */
-		goto free_id;
+	if (err < 0) {
+		/* failed to allocate fd.
+		 * bpf_prog_put() is needed because the above
+		 * bpf_prog_alloc_id() has published the prog
+		 * to the userspace and the userspace may
+		 * have refcnt-ed it through BPF_PROG_GET_FD_BY_ID.
+		 */
+		bpf_prog_put(prog);
+		return err;
+	}
 
 	bpf_prog_kallsyms_add(prog);
 	trace_bpf_prog_load(prog, err);
 	return err;
 
-free_id:
-	bpf_prog_free_id(prog);
 free_used_maps:
 	free_used_maps(prog->aux);
 free_prog:
@@ -1099,6 +1135,38 @@ static int bpf_obj_get_next_id(const union bpf_attr *attr,
 	return err;
 }
 
+#define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id
+
+static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
+{
+	struct bpf_prog *prog;
+	u32 id = attr->prog_id;
+	int fd;
+
+	if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID))
+		return -EINVAL;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	spin_lock_bh(&prog_idr_lock);
+	prog = idr_find(&prog_idr, id);
+	if (prog)
+		prog = bpf_prog_inc_not_zero(prog);
+	else
+		prog = ERR_PTR(-ENOENT);
+	spin_unlock_bh(&prog_idr_lock);
+
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+
+	fd = bpf_prog_new_fd(prog);
+	if (fd < 0)
+		bpf_prog_put(prog);
+
+	return fd;
+}
+
 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
 {
 	union bpf_attr attr = {};
@@ -1184,6 +1252,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 		err = bpf_obj_get_next_id(&attr, uattr,
 					  &map_idr, &map_idr_lock);
 		break;
+	case BPF_PROG_GET_FD_BY_ID:
+		err = bpf_prog_get_fd_by_id(&attr);
+		break;
 	default:
 		err = -EINVAL;
 		break;
-- 
cgit v1.2.3


From bd5f5f4ecb78e2698dad655645b6d6a2f7012a8c Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Mon, 5 Jun 2017 12:15:50 -0700
Subject: bpf: Add BPF_MAP_GET_FD_BY_ID

Add BPF_MAP_GET_FD_BY_ID command to allow user to get a fd
from a bpf_map's ID.

bpf_map_inc_not_zero() is added and is called with map_idr_lock
held.

__bpf_map_put() is also added which has the 'bool do_idr_lock'
param to decide if the map_idr_lock should be acquired when
freeing the map->id.

In the error path of bpf_map_inc_not_zero(), it may have to
call __bpf_map_put(map, false) which does not need
to take the map_idr_lock when freeing the map->id.

It is currently limited to CAP_SYS_ADMIN which we can
consider to lift it in followup patches.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/syscall.c | 95 ++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 85 insertions(+), 10 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index dc6253bb8ebb..1802bb9c47d9 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -135,11 +135,19 @@ static int bpf_map_alloc_id(struct bpf_map *map)
 	return id > 0 ? 0 : id;
 }
 
-static void bpf_map_free_id(struct bpf_map *map)
+static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
 {
-	spin_lock_bh(&map_idr_lock);
+	if (do_idr_lock)
+		spin_lock_bh(&map_idr_lock);
+	else
+		__acquire(&map_idr_lock);
+
 	idr_remove(&map_idr, map->id);
-	spin_unlock_bh(&map_idr_lock);
+
+	if (do_idr_lock)
+		spin_unlock_bh(&map_idr_lock);
+	else
+		__release(&map_idr_lock);
 }
 
 /* called from workqueue */
@@ -163,16 +171,21 @@ static void bpf_map_put_uref(struct bpf_map *map)
 /* decrement map refcnt and schedule it for freeing via workqueue
  * (unrelying map implementation ops->map_free() might sleep)
  */
-void bpf_map_put(struct bpf_map *map)
+static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock)
 {
 	if (atomic_dec_and_test(&map->refcnt)) {
 		/* bpf_map_free_id() must be called first */
-		bpf_map_free_id(map);
+		bpf_map_free_id(map, do_idr_lock);
 		INIT_WORK(&map->work, bpf_map_free_deferred);
 		schedule_work(&map->work);
 	}
 }
 
+void bpf_map_put(struct bpf_map *map)
+{
+	__bpf_map_put(map, true);
+}
+
 void bpf_map_put_with_uref(struct bpf_map *map)
 {
 	bpf_map_put_uref(map);
@@ -271,15 +284,20 @@ static int map_create(union bpf_attr *attr)
 		goto free_map;
 
 	err = bpf_map_new_fd(map);
-	if (err < 0)
-		/* failed to allocate fd */
-		goto free_id;
+	if (err < 0) {
+		/* failed to allocate fd.
+		 * bpf_map_put() is needed because the above
+		 * bpf_map_alloc_id() has published the map
+		 * to the userspace and the userspace may
+		 * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID.
+		 */
+		bpf_map_put(map);
+		return err;
+	}
 
 	trace_bpf_map_create(map, err);
 	return err;
 
-free_id:
-	bpf_map_free_id(map);
 free_map:
 	bpf_map_uncharge_memlock(map);
 free_map_nouncharge:
@@ -331,6 +349,28 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd)
 	return map;
 }
 
+/* map_idr_lock should have been held */
+static struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map,
+					    bool uref)
+{
+	int refold;
+
+	refold = __atomic_add_unless(&map->refcnt, 1, 0);
+
+	if (refold >= BPF_MAX_REFCNT) {
+		__bpf_map_put(map, false);
+		return ERR_PTR(-EBUSY);
+	}
+
+	if (!refold)
+		return ERR_PTR(-ENOENT);
+
+	if (uref)
+		atomic_inc(&map->usercnt);
+
+	return map;
+}
+
 int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
 {
 	return -ENOTSUPP;
@@ -1167,6 +1207,38 @@ static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
 	return fd;
 }
 
+#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD map_id
+
+static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
+{
+	struct bpf_map *map;
+	u32 id = attr->map_id;
+	int fd;
+
+	if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID))
+		return -EINVAL;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	spin_lock_bh(&map_idr_lock);
+	map = idr_find(&map_idr, id);
+	if (map)
+		map = bpf_map_inc_not_zero(map, true);
+	else
+		map = ERR_PTR(-ENOENT);
+	spin_unlock_bh(&map_idr_lock);
+
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+
+	fd = bpf_map_new_fd(map);
+	if (fd < 0)
+		bpf_map_put(map);
+
+	return fd;
+}
+
 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
 {
 	union bpf_attr attr = {};
@@ -1255,6 +1327,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_PROG_GET_FD_BY_ID:
 		err = bpf_prog_get_fd_by_id(&attr);
 		break;
+	case BPF_MAP_GET_FD_BY_ID:
+		err = bpf_map_get_fd_by_id(&attr);
+		break;
 	default:
 		err = -EINVAL;
 		break;
-- 
cgit v1.2.3


From 1e270976908686ec25fb91b8a34145be54137976 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Mon, 5 Jun 2017 12:15:52 -0700
Subject: bpf: Add BPF_OBJ_GET_INFO_BY_FD

A single BPF_OBJ_GET_INFO_BY_FD cmd is used to obtain the info
for both bpf_prog and bpf_map.  The kernel can figure out the
fd is associated with a bpf_prog or bpf_map.

The suggested struct bpf_prog_info and struct bpf_map_info are
not meant to be a complete list and it is not the goal of this patch.
New fields can be added in the future patch.

The focus of this patch is to create the interface,
BPF_OBJ_GET_INFO_BY_FD cmd for exposing the bpf_prog's and
bpf_map's info.

The obj's info, which will be extended (and get bigger) over time, is
separated from the bpf_attr to avoid bloating the bpf_attr.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/syscall.c | 163 +++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 146 insertions(+), 17 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 1802bb9c47d9..8942c820d620 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1239,6 +1239,145 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
 	return fd;
 }
 
+static int check_uarg_tail_zero(void __user *uaddr,
+				size_t expected_size,
+				size_t actual_size)
+{
+	unsigned char __user *addr;
+	unsigned char __user *end;
+	unsigned char val;
+	int err;
+
+	if (actual_size <= expected_size)
+		return 0;
+
+	addr = uaddr + expected_size;
+	end  = uaddr + actual_size;
+
+	for (; addr < end; addr++) {
+		err = get_user(val, addr);
+		if (err)
+			return err;
+		if (val)
+			return -E2BIG;
+	}
+
+	return 0;
+}
+
+static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
+				   const union bpf_attr *attr,
+				   union bpf_attr __user *uattr)
+{
+	struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info);
+	struct bpf_prog_info info = {};
+	u32 info_len = attr->info.info_len;
+	char __user *uinsns;
+	u32 ulen;
+	int err;
+
+	err = check_uarg_tail_zero(uinfo, sizeof(info), info_len);
+	if (err)
+		return err;
+	info_len = min_t(u32, sizeof(info), info_len);
+
+	if (copy_from_user(&info, uinfo, info_len))
+		return err;
+
+	info.type = prog->type;
+	info.id = prog->aux->id;
+
+	memcpy(info.tag, prog->tag, sizeof(prog->tag));
+
+	if (!capable(CAP_SYS_ADMIN)) {
+		info.jited_prog_len = 0;
+		info.xlated_prog_len = 0;
+		goto done;
+	}
+
+	ulen = info.jited_prog_len;
+	info.jited_prog_len = prog->jited_len;
+	if (info.jited_prog_len && ulen) {
+		uinsns = u64_to_user_ptr(info.jited_prog_insns);
+		ulen = min_t(u32, info.jited_prog_len, ulen);
+		if (copy_to_user(uinsns, prog->bpf_func, ulen))
+			return -EFAULT;
+	}
+
+	ulen = info.xlated_prog_len;
+	info.xlated_prog_len = bpf_prog_size(prog->len);
+	if (info.xlated_prog_len && ulen) {
+		uinsns = u64_to_user_ptr(info.xlated_prog_insns);
+		ulen = min_t(u32, info.xlated_prog_len, ulen);
+		if (copy_to_user(uinsns, prog->insnsi, ulen))
+			return -EFAULT;
+	}
+
+done:
+	if (copy_to_user(uinfo, &info, info_len) ||
+	    put_user(info_len, &uattr->info.info_len))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int bpf_map_get_info_by_fd(struct bpf_map *map,
+				  const union bpf_attr *attr,
+				  union bpf_attr __user *uattr)
+{
+	struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info);
+	struct bpf_map_info info = {};
+	u32 info_len = attr->info.info_len;
+	int err;
+
+	err = check_uarg_tail_zero(uinfo, sizeof(info), info_len);
+	if (err)
+		return err;
+	info_len = min_t(u32, sizeof(info), info_len);
+
+	info.type = map->map_type;
+	info.id = map->id;
+	info.key_size = map->key_size;
+	info.value_size = map->value_size;
+	info.max_entries = map->max_entries;
+	info.map_flags = map->map_flags;
+
+	if (copy_to_user(uinfo, &info, info_len) ||
+	    put_user(info_len, &uattr->info.info_len))
+		return -EFAULT;
+
+	return 0;
+}
+
+#define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info
+
+static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
+				  union bpf_attr __user *uattr)
+{
+	int ufd = attr->info.bpf_fd;
+	struct fd f;
+	int err;
+
+	if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD))
+		return -EINVAL;
+
+	f = fdget(ufd);
+	if (!f.file)
+		return -EBADFD;
+
+	if (f.file->f_op == &bpf_prog_fops)
+		err = bpf_prog_get_info_by_fd(f.file->private_data, attr,
+					      uattr);
+	else if (f.file->f_op == &bpf_map_fops)
+		err = bpf_map_get_info_by_fd(f.file->private_data, attr,
+					     uattr);
+	else
+		err = -EINVAL;
+
+	fdput(f);
+	return err;
+}
+
 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
 {
 	union bpf_attr attr = {};
@@ -1258,23 +1397,10 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	 * user-space does not rely on any kernel feature
 	 * extensions we dont know about yet.
 	 */
-	if (size > sizeof(attr)) {
-		unsigned char __user *addr;
-		unsigned char __user *end;
-		unsigned char val;
-
-		addr = (void __user *)uattr + sizeof(attr);
-		end  = (void __user *)uattr + size;
-
-		for (; addr < end; addr++) {
-			err = get_user(val, addr);
-			if (err)
-				return err;
-			if (val)
-				return -E2BIG;
-		}
-		size = sizeof(attr);
-	}
+	err = check_uarg_tail_zero(uattr, sizeof(attr), size);
+	if (err)
+		return err;
+	size = min_t(u32, size, sizeof(attr));
 
 	/* copy attributes from user space, may be less than sizeof(bpf_attr) */
 	if (copy_from_user(&attr, uattr, size) != 0)
@@ -1330,6 +1456,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_MAP_GET_FD_BY_ID:
 		err = bpf_map_get_fd_by_id(&attr);
 		break;
+	case BPF_OBJ_GET_INFO_BY_FD:
+		err = bpf_obj_get_info_by_fd(&attr, uattr);
+		break;
 	default:
 		err = -EINVAL;
 		break;
-- 
cgit v1.2.3


From 92046578ac88e0a93f8ef03240e6c832b0189aa7 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Tue, 6 Jun 2017 18:38:04 +0200
Subject: bpf: cgroup skb progs cannot access ld_abs/ind

Commit fb9a307d11d6 ("bpf: Allow CGROUP_SKB eBPF program to
access sk_buff") enabled programs of BPF_PROG_TYPE_CGROUP_SKB
type to use ld_abs/ind instructions. However, at this point,
we cannot use them, since offsets relative to SKF_LL_OFF will
end up pointing skb_mac_header(skb) out of bounds since in the
egress path it is not yet set at that point in time, but only
after __dev_queue_xmit() did a general reset on the mac header.
bpf_internal_load_pointer_neg_helper() will then end up reading
data from a wrong offset.

BPF_PROG_TYPE_CGROUP_SKB programs can use bpf_skb_load_bytes()
already to access packet data, which is also more flexible than
the insns carried over from cBPF.

Fixes: fb9a307d11d6 ("bpf: Allow CGROUP_SKB eBPF program to access sk_buff")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Cc: Chenbo Feng <fengc@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8acae64df255..14ccb0759fa4 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2426,7 +2426,6 @@ static bool may_access_skb(enum bpf_prog_type type)
 	case BPF_PROG_TYPE_SOCKET_FILTER:
 	case BPF_PROG_TYPE_SCHED_CLS:
 	case BPF_PROG_TYPE_SCHED_ACT:
-	case BPF_PROG_TYPE_CGROUP_SKB:
 		return true;
 	default:
 		return false;
-- 
cgit v1.2.3


From d25da6caa2a1d6644360c40d7c5fd7c057551360 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sun, 11 Jun 2017 00:50:41 +0200
Subject: bpf: don't check spilled reg state for non-STACK_SPILLed type slots

spilled_regs[] state is only used for stack slots of type STACK_SPILL,
never for STACK_MISC. Right now, in states_equal(), even if we have
old and current stack state of type STACK_MISC, we compare spilled_regs[]
for that particular offset. Just skip these like we do everywhere else.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 14ccb0759fa4..d031b3b0752e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2828,6 +2828,8 @@ static bool states_equal(struct bpf_verifier_env *env,
 			return false;
 		if (i % BPF_REG_SIZE)
 			continue;
+		if (old->stack_slot_type[i] != STACK_SPILL)
+			continue;
 		if (memcmp(&old->spilled_regs[i / BPF_REG_SIZE],
 			   &cur->spilled_regs[i / BPF_REG_SIZE],
 			   sizeof(old->spilled_regs[0])))
-- 
cgit v1.2.3


From 4a2ff55aa4946b036b87572976cbfc6ab244c497 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sun, 11 Jun 2017 00:50:42 +0200
Subject: bpf: reset id on CONST_IMM transition

Whenever we set the register to the type CONST_IMM, we currently don't
reset the id to 0. id member is not used in CONST_IMM case, so don't
let it become stale, where pruning won't be able to match later on.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d031b3b0752e..d195d825515a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1952,6 +1952,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 			 */
 			regs[insn->dst_reg].type = CONST_IMM;
 			regs[insn->dst_reg].imm = insn->imm;
+			regs[insn->dst_reg].id = 0;
 			regs[insn->dst_reg].max_value = insn->imm;
 			regs[insn->dst_reg].min_value = insn->imm;
 			regs[insn->dst_reg].min_align = calc_align(insn->imm);
@@ -2409,6 +2410,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
 
 		regs[insn->dst_reg].type = CONST_IMM;
 		regs[insn->dst_reg].imm = imm;
+		regs[insn->dst_reg].id = 0;
 		return 0;
 	}
 
-- 
cgit v1.2.3


From 36e24c003091a11ec847291c9a1d36d2ec92b155 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sun, 11 Jun 2017 00:50:43 +0200
Subject: bpf: reset id on spilled regs in clear_all_pkt_pointers

Right now, we don't reset the id of spilled registers in case of
clear_all_pkt_pointers(). Given pkt_pointers are highly likely to
contain an id, do so by reusing __mark_reg_unknown_value().

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d195d825515a..519a6144d3d3 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1346,8 +1346,8 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
 		if (reg->type != PTR_TO_PACKET &&
 		    reg->type != PTR_TO_PACKET_END)
 			continue;
-		reg->type = UNKNOWN_VALUE;
-		reg->imm = 0;
+		__mark_reg_unknown_value(state->spilled_regs,
+					 i / BPF_REG_SIZE);
 	}
 }
 
-- 
cgit v1.2.3


From 31fd85816dbe3a714bcc3f67c17c3dd87011f79e Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Tue, 13 Jun 2017 15:52:13 -0700
Subject: bpf: permits narrower load from bpf program context fields

Currently, verifier will reject a program if it contains an
narrower load from the bpf context structure. For example,
        __u8 h = __sk_buff->hash, or
        __u16 p = __sk_buff->protocol
        __u32 sample_period = bpf_perf_event_data->sample_period
which are narrower loads of 4-byte or 8-byte field.

This patch solves the issue by:
  . Introduce a new parameter ctx_field_size to carry the
    field size of narrower load from prog type
    specific *__is_valid_access validator back to verifier.
  . The non-zero ctx_field_size for a memory access indicates
    (1). underlying prog type specific convert_ctx_accesses
         supporting non-whole-field access
    (2). the current insn is a narrower or whole field access.
  . In verifier, for such loads where load memory size is
    less than ctx_field_size, verifier transforms it
    to a full field load followed by proper masking.
  . Currently, __sk_buff and bpf_perf_event_data->sample_period
    are supporting narrowing loads.
  . Narrower stores are still not allowed as typical ctx stores
    are just normal stores.

Because of this change, some tests in verifier will fail and
these tests are removed. As a bonus, rename some out of bound
__sk_buff->cb access to proper field name and remove two
redundant "skb cb oob" tests.

Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 71 +++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 52 insertions(+), 19 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 519a6144d3d3..44b97d958fb7 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -758,15 +758,26 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
 }
 
 /* check access to 'struct bpf_context' fields */
-static int check_ctx_access(struct bpf_verifier_env *env, int off, int size,
+static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size,
 			    enum bpf_access_type t, enum bpf_reg_type *reg_type)
 {
+	int ctx_field_size = 0;
+
 	/* for analyzer ctx accesses are already validated and converted */
 	if (env->analyzer_ops)
 		return 0;
 
 	if (env->prog->aux->ops->is_valid_access &&
-	    env->prog->aux->ops->is_valid_access(off, size, t, reg_type)) {
+	    env->prog->aux->ops->is_valid_access(off, size, t, reg_type, &ctx_field_size)) {
+		/* a non zero ctx_field_size indicates:
+		 * . For this field, the prog type specific ctx conversion algorithm
+		 *   only supports whole field access.
+		 * . This ctx access is a candiate for later verifier transformation
+		 *   to load the whole field and then apply a mask to get correct result.
+		 */
+		if (ctx_field_size)
+			env->insn_aux_data[insn_idx].ctx_field_size = ctx_field_size;
+
 		/* remember the offset of last byte accessed in ctx */
 		if (env->prog->aux->max_ctx_offset < off + size)
 			env->prog->aux->max_ctx_offset = off + size;
@@ -868,7 +879,7 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
  * if t==write && value_regno==-1, some unknown value is stored into memory
  * if t==read && value_regno==-1, don't care what we read from memory
  */
-static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
+static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, int off,
 			    int bpf_size, enum bpf_access_type t,
 			    int value_regno)
 {
@@ -911,7 +922,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
 			verbose("R%d leaks addr into ctx\n", value_regno);
 			return -EACCES;
 		}
-		err = check_ctx_access(env, off, size, t, &reg_type);
+		err = check_ctx_access(env, insn_idx, off, size, t, &reg_type);
 		if (!err && t == BPF_READ && value_regno >= 0) {
 			mark_reg_unknown_value_and_range(state->regs,
 							 value_regno);
@@ -972,7 +983,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
 	return err;
 }
 
-static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn)
+static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn)
 {
 	struct bpf_reg_state *regs = env->cur_state.regs;
 	int err;
@@ -994,13 +1005,13 @@ static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn)
 		return err;
 
 	/* check whether atomic_add can read the memory */
-	err = check_mem_access(env, insn->dst_reg, insn->off,
+	err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
 			       BPF_SIZE(insn->code), BPF_READ, -1);
 	if (err)
 		return err;
 
 	/* check whether atomic_add can write into the same memory */
-	return check_mem_access(env, insn->dst_reg, insn->off,
+	return check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
 				BPF_SIZE(insn->code), BPF_WRITE, -1);
 }
 
@@ -1416,7 +1427,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 	 * is inferred from register state.
 	 */
 	for (i = 0; i < meta.access_size; i++) {
-		err = check_mem_access(env, meta.regno, i, BPF_B, BPF_WRITE, -1);
+		err = check_mem_access(env, insn_idx, meta.regno, i, BPF_B, BPF_WRITE, -1);
 		if (err)
 			return err;
 	}
@@ -2993,18 +3004,12 @@ static int do_check(struct bpf_verifier_env *env)
 			/* check that memory (src_reg + off) is readable,
 			 * the state of dst_reg will be updated by this func
 			 */
-			err = check_mem_access(env, insn->src_reg, insn->off,
+			err = check_mem_access(env, insn_idx, insn->src_reg, insn->off,
 					       BPF_SIZE(insn->code), BPF_READ,
 					       insn->dst_reg);
 			if (err)
 				return err;
 
-			if (BPF_SIZE(insn->code) != BPF_W &&
-			    BPF_SIZE(insn->code) != BPF_DW) {
-				insn_idx++;
-				continue;
-			}
-
 			prev_src_type = &env->insn_aux_data[insn_idx].ptr_type;
 
 			if (*prev_src_type == NOT_INIT) {
@@ -3032,7 +3037,7 @@ static int do_check(struct bpf_verifier_env *env)
 			enum bpf_reg_type *prev_dst_type, dst_reg_type;
 
 			if (BPF_MODE(insn->code) == BPF_XADD) {
-				err = check_xadd(env, insn);
+				err = check_xadd(env, insn_idx, insn);
 				if (err)
 					return err;
 				insn_idx++;
@@ -3051,7 +3056,7 @@ static int do_check(struct bpf_verifier_env *env)
 			dst_reg_type = regs[insn->dst_reg].type;
 
 			/* check that memory (dst_reg + off) is writeable */
-			err = check_mem_access(env, insn->dst_reg, insn->off,
+			err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
 					       BPF_SIZE(insn->code), BPF_WRITE,
 					       insn->src_reg);
 			if (err)
@@ -3080,7 +3085,7 @@ static int do_check(struct bpf_verifier_env *env)
 				return err;
 
 			/* check that memory (dst_reg + off) is writeable */
-			err = check_mem_access(env, insn->dst_reg, insn->off,
+			err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
 					       BPF_SIZE(insn->code), BPF_WRITE,
 					       -1);
 			if (err)
@@ -3383,7 +3388,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 	struct bpf_insn insn_buf[16], *insn;
 	struct bpf_prog *new_prog;
 	enum bpf_access_type type;
-	int i, cnt, delta = 0;
+	int i, cnt, off, size, ctx_field_size, is_narrower_load, delta = 0;
 
 	if (ops->gen_prologue) {
 		cnt = ops->gen_prologue(insn_buf, env->seen_direct_write,
@@ -3423,11 +3428,39 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 		if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX)
 			continue;
 
+		off = insn->off;
+		size = bpf_size_to_bytes(BPF_SIZE(insn->code));
+		ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size;
+		is_narrower_load = (type == BPF_READ && size < ctx_field_size);
+
+		/* If the read access is a narrower load of the field,
+		 * convert to a 4/8-byte load, to minimum program type specific
+		 * convert_ctx_access changes. If conversion is successful,
+		 * we will apply proper mask to the result.
+		 */
+		if (is_narrower_load) {
+			int size_code = BPF_H;
+
+			if (ctx_field_size == 4)
+				size_code = BPF_W;
+			else if (ctx_field_size == 8)
+				size_code = BPF_DW;
+			insn->off = off & ~(ctx_field_size - 1);
+			insn->code = BPF_LDX | BPF_MEM | size_code;
+		}
 		cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog);
 		if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
 			verbose("bpf verifier is misconfigured\n");
 			return -EINVAL;
 		}
+		if (is_narrower_load) {
+			if (ctx_field_size <= 4)
+				insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg,
+							(1 << size * 8) - 1);
+			else
+				insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg,
+							(1 << size * 8) - 1);
+		}
 
 		new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
 		if (!new_prog)
-- 
cgit v1.2.3


From 239946314e57711d7da546b67964d0b387a3ee42 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 22 Jun 2017 15:07:39 -0700
Subject: bpf: possibly avoid extra masking for narrower load in verifier

Commit 31fd85816dbe ("bpf: permits narrower load from bpf program
context fields") permits narrower load for certain ctx fields.
The commit however will already generate a masking even if
the prog-specific ctx conversion produces the result with
narrower size.

For example, for __sk_buff->protocol, the ctx conversion
loads the data into register with 2-byte load.
A narrower 2-byte load should not generate masking.
For __sk_buff->vlan_present, the conversion function
set the result as either 0 or 1, essentially a byte.
The narrower 2-byte or 1-byte load should not generate masking.

To avoid unnecessary masking, prog-specific *_is_valid_access
now passes converted_op_size back to verifier, which indicates
the valid data width after perceived future conversion.
Based on this information, verifier is able to avoid
unnecessary marking.

Since we want more information back from prog-specific
*_is_valid_access checking, all of them are packed into
one data structure for more clarity.

Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 29 +++++++++++++++++++++--------
 1 file changed, 21 insertions(+), 8 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 44b97d958fb7..74ea96ea391b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -761,22 +761,34 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
 static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size,
 			    enum bpf_access_type t, enum bpf_reg_type *reg_type)
 {
-	int ctx_field_size = 0;
+	struct bpf_insn_access_aux info = { .reg_type = *reg_type };
 
 	/* for analyzer ctx accesses are already validated and converted */
 	if (env->analyzer_ops)
 		return 0;
 
 	if (env->prog->aux->ops->is_valid_access &&
-	    env->prog->aux->ops->is_valid_access(off, size, t, reg_type, &ctx_field_size)) {
-		/* a non zero ctx_field_size indicates:
+	    env->prog->aux->ops->is_valid_access(off, size, t, &info)) {
+		/* a non zero info.ctx_field_size indicates:
 		 * . For this field, the prog type specific ctx conversion algorithm
 		 *   only supports whole field access.
 		 * . This ctx access is a candiate for later verifier transformation
 		 *   to load the whole field and then apply a mask to get correct result.
+		 * a non zero info.converted_op_size indicates perceived actual converted
+		 * value width in convert_ctx_access.
 		 */
-		if (ctx_field_size)
-			env->insn_aux_data[insn_idx].ctx_field_size = ctx_field_size;
+		if ((info.ctx_field_size && !info.converted_op_size) ||
+		    (!info.ctx_field_size &&  info.converted_op_size)) {
+			verbose("verifier bug in is_valid_access prog type=%u off=%d size=%d\n",
+				env->prog->type, off, size);
+			return -EACCES;
+		}
+
+		if (info.ctx_field_size) {
+			env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
+			env->insn_aux_data[insn_idx].converted_op_size = info.converted_op_size;
+		}
+		*reg_type = info.reg_type;
 
 		/* remember the offset of last byte accessed in ctx */
 		if (env->prog->aux->max_ctx_offset < off + size)
@@ -3388,7 +3400,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 	struct bpf_insn insn_buf[16], *insn;
 	struct bpf_prog *new_prog;
 	enum bpf_access_type type;
-	int i, cnt, off, size, ctx_field_size, is_narrower_load, delta = 0;
+	int i, cnt, off, size, ctx_field_size, converted_op_size, is_narrower_load, delta = 0;
 
 	if (ops->gen_prologue) {
 		cnt = ops->gen_prologue(insn_buf, env->seen_direct_write,
@@ -3431,7 +3443,8 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 		off = insn->off;
 		size = bpf_size_to_bytes(BPF_SIZE(insn->code));
 		ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size;
-		is_narrower_load = (type == BPF_READ && size < ctx_field_size);
+		converted_op_size = env->insn_aux_data[i + delta].converted_op_size;
+		is_narrower_load = type == BPF_READ && size < ctx_field_size;
 
 		/* If the read access is a narrower load of the field,
 		 * convert to a 4/8-byte load, to minimum program type specific
@@ -3453,7 +3466,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 			verbose("bpf verifier is misconfigured\n");
 			return -EINVAL;
 		}
-		if (is_narrower_load) {
+		if (is_narrower_load && size < converted_op_size) {
 			if (ctx_field_size <= 4)
 				insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg,
 							(1 << size * 8) - 1);
-- 
cgit v1.2.3


From 14dc6f04f49dc12614d7e90928b495b8d73cd471 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Tue, 27 Jun 2017 23:08:34 -0700
Subject: bpf: Add syscall lookup support for fd array and htab

This patch allows userspace to do BPF_MAP_LOOKUP_ELEM on
BPF_MAP_TYPE_PROG_ARRAY,
BPF_MAP_TYPE_ARRAY_OF_MAPS and
BPF_MAP_TYPE_HASH_OF_MAPS.

The lookup returns a prog-id or map-id to the userspace.
The userspace can then use the BPF_PROG_GET_FD_BY_ID
or BPF_MAP_GET_FD_BY_ID to get a fd.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/arraymap.c   | 27 +++++++++++++++++++++++++++
 kernel/bpf/hashtab.c    | 21 +++++++++++++++++++++
 kernel/bpf/map_in_map.c |  5 +++++
 kernel/bpf/map_in_map.h |  1 +
 kernel/bpf/syscall.c    | 16 +++++++++++++---
 5 files changed, 67 insertions(+), 3 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index ecb43542246e..d771a3872500 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -334,6 +334,26 @@ static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
 	return NULL;
 }
 
+/* only called from syscall */
+int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
+{
+	void **elem, *ptr;
+	int ret =  0;
+
+	if (!map->ops->map_fd_sys_lookup_elem)
+		return -ENOTSUPP;
+
+	rcu_read_lock();
+	elem = array_map_lookup_elem(map, key);
+	if (elem && (ptr = READ_ONCE(*elem)))
+		*value = map->ops->map_fd_sys_lookup_elem(ptr);
+	else
+		ret = -ENOENT;
+	rcu_read_unlock();
+
+	return ret;
+}
+
 /* only called from syscall */
 int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
 				 void *key, void *value, u64 map_flags)
@@ -400,6 +420,11 @@ static void prog_fd_array_put_ptr(void *ptr)
 	bpf_prog_put(ptr);
 }
 
+static u32 prog_fd_array_sys_lookup_elem(void *ptr)
+{
+	return ((struct bpf_prog *)ptr)->aux->id;
+}
+
 /* decrement refcnt of all bpf_progs that are stored in this map */
 void bpf_fd_array_map_clear(struct bpf_map *map)
 {
@@ -418,6 +443,7 @@ const struct bpf_map_ops prog_array_map_ops = {
 	.map_delete_elem = fd_array_map_delete_elem,
 	.map_fd_get_ptr = prog_fd_array_get_ptr,
 	.map_fd_put_ptr = prog_fd_array_put_ptr,
+	.map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
 };
 
 static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
@@ -585,4 +611,5 @@ const struct bpf_map_ops array_of_maps_map_ops = {
 	.map_delete_elem = fd_array_map_delete_elem,
 	.map_fd_get_ptr = bpf_map_fd_get_ptr,
 	.map_fd_put_ptr = bpf_map_fd_put_ptr,
+	.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
 };
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 004334ea13ba..4fb463172aa8 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1243,6 +1243,26 @@ static void fd_htab_map_free(struct bpf_map *map)
 	htab_map_free(map);
 }
 
+/* only called from syscall */
+int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
+{
+	void **ptr;
+	int ret = 0;
+
+	if (!map->ops->map_fd_sys_lookup_elem)
+		return -ENOTSUPP;
+
+	rcu_read_lock();
+	ptr = htab_map_lookup_elem(map, key);
+	if (ptr)
+		*value = map->ops->map_fd_sys_lookup_elem(READ_ONCE(*ptr));
+	else
+		ret = -ENOENT;
+	rcu_read_unlock();
+
+	return ret;
+}
+
 /* only called from syscall */
 int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
 				void *key, void *value, u64 map_flags)
@@ -1305,4 +1325,5 @@ const struct bpf_map_ops htab_of_maps_map_ops = {
 	.map_delete_elem = htab_map_delete_elem,
 	.map_fd_get_ptr = bpf_map_fd_get_ptr,
 	.map_fd_put_ptr = bpf_map_fd_put_ptr,
+	.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
 };
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 59bcdf821ae4..1da574612bea 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -95,3 +95,8 @@ void bpf_map_fd_put_ptr(void *ptr)
 	 */
 	bpf_map_put(ptr);
 }
+
+u32 bpf_map_fd_sys_lookup_elem(void *ptr)
+{
+	return ((struct bpf_map *)ptr)->id;
+}
diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h
index 177fadb689dc..6183db9ec08c 100644
--- a/kernel/bpf/map_in_map.h
+++ b/kernel/bpf/map_in_map.h
@@ -19,5 +19,6 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0,
 void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file,
 			 int ufd);
 void bpf_map_fd_put_ptr(void *ptr);
+u32 bpf_map_fd_sys_lookup_elem(void *ptr);
 
 #endif
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 8942c820d620..4409ccca8831 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -24,6 +24,13 @@
 #include <linux/kernel.h>
 #include <linux/idr.h>
 
+#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \
+			   (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
+			   (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
+			   (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
+#define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS)
+#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_HASH(map))
+
 DEFINE_PER_CPU(int, bpf_prog_active);
 static DEFINE_IDR(prog_idr);
 static DEFINE_SPINLOCK(prog_idr_lock);
@@ -411,6 +418,8 @@ static int map_lookup_elem(union bpf_attr *attr)
 	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
 	    map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
 		value_size = round_up(map->value_size, 8) * num_possible_cpus();
+	else if (IS_FD_MAP(map))
+		value_size = sizeof(u32);
 	else
 		value_size = map->value_size;
 
@@ -426,9 +435,10 @@ static int map_lookup_elem(union bpf_attr *attr)
 		err = bpf_percpu_array_copy(map, key, value);
 	} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
 		err = bpf_stackmap_copy(map, key, value);
-	} else if (map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS ||
-		   map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
-		err = -ENOTSUPP;
+	} else if (IS_FD_ARRAY(map)) {
+		err = bpf_fd_array_map_lookup_elem(map, key, value);
+	} else if (IS_FD_HASH(map)) {
+		err = bpf_fd_htab_map_lookup_elem(map, key, value);
 	} else {
 		rcu_read_lock();
 		ptr = map->ops->map_lookup_elem(map, key);
-- 
cgit v1.2.3


From 8007e40a24e12d35189203370268c7278f29ab74 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 28 Jun 2017 10:41:24 -0700
Subject: bpf: Fix out-of-bound access on interpreters[]

The index is off-by-one when fp->aux->stack_depth
has already been rounded up to 32.  In particular,
if stack_depth is 512, the index will be 16.

The fix is to round_up and then takes -1 instead of round_down.

[   22.318680] ==================================================================
[   22.319745] BUG: KASAN: global-out-of-bounds in bpf_prog_select_runtime+0x48a/0x670
[   22.320737] Read of size 8 at addr ffffffff82aadae0 by task sockex3/1946
[   22.321646]
[   22.321858] CPU: 1 PID: 1946 Comm: sockex3 Tainted: G        W       4.12.0-rc6-01680-g2ee87db3a287 #22
[   22.323061] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.9.3-1.el7.centos 04/01/2014
[   22.324260] Call Trace:
[   22.324612]  dump_stack+0x67/0x99
[   22.325081]  print_address_description+0x1e8/0x290
[   22.325734]  ? bpf_prog_select_runtime+0x48a/0x670
[   22.326360]  kasan_report+0x265/0x350
[   22.326860]  __asan_report_load8_noabort+0x19/0x20
[   22.327484]  bpf_prog_select_runtime+0x48a/0x670
[   22.328109]  bpf_prog_load+0x626/0xd40
[   22.328637]  ? __bpf_prog_charge+0xc0/0xc0
[   22.329222]  ? check_nnp_nosuid.isra.61+0x100/0x100
[   22.329890]  ? __might_fault+0xf6/0x1b0
[   22.330446]  ? lock_acquire+0x360/0x360
[   22.331013]  SyS_bpf+0x67c/0x24d0
[   22.331491]  ? trace_hardirqs_on+0xd/0x10
[   22.332049]  ? __getnstimeofday64+0xaf/0x1c0
[   22.332635]  ? bpf_prog_get+0x20/0x20
[   22.333135]  ? __audit_syscall_entry+0x300/0x600
[   22.333770]  ? syscall_trace_enter+0x540/0xdd0
[   22.334339]  ? exit_to_usermode_loop+0xe0/0xe0
[   22.334950]  ? do_syscall_64+0x48/0x410
[   22.335446]  ? bpf_prog_get+0x20/0x20
[   22.335954]  do_syscall_64+0x181/0x410
[   22.336454]  entry_SYSCALL64_slow_path+0x25/0x25
[   22.337121] RIP: 0033:0x7f263fe81f19
[   22.337618] RSP: 002b:00007ffd9a3440c8 EFLAGS: 00000202 ORIG_RAX: 0000000000000141
[   22.338619] RAX: ffffffffffffffda RBX: 0000000000aac5fb RCX: 00007f263fe81f19
[   22.339600] RDX: 0000000000000030 RSI: 00007ffd9a3440d0 RDI: 0000000000000005
[   22.340470] RBP: 0000000000a9a1e0 R08: 0000000000a9a1e0 R09: 0000009d00000001
[   22.341430] R10: 0000000000000000 R11: 0000000000000202 R12: 0000000000010000
[   22.342411] R13: 0000000000a9a023 R14: 0000000000000001 R15: 0000000000000003
[   22.343369]
[   22.343593] The buggy address belongs to the variable:
[   22.344241]  interpreters+0x80/0x980
[   22.344708]
[   22.344908] Memory state around the buggy address:
[   22.345556]  ffffffff82aad980: 00 00 00 04 fa fa fa fa 04 fa fa fa fa fa fa fa
[   22.346449]  ffffffff82aada00: 00 00 00 00 00 fa fa fa fa fa fa fa 00 00 00 00
[   22.347361] >ffffffff82aada80: 00 00 00 00 00 00 00 00 00 00 00 00 fa fa fa fa
[   22.348301]                                                        ^
[   22.349142]  ffffffff82aadb00: 00 01 fa fa fa fa fa fa 00 00 00 00 00 00 00 00
[   22.350058]  ffffffff82aadb80: 00 00 07 fa fa fa fa fa 00 00 05 fa fa fa fa fa
[   22.350984] ==================================================================

Fixes: b870aa901f4b ("bpf: use different interpreter depending on required stack size")
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/core.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 774069ca18a7..ad5f55922a13 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1297,7 +1297,9 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
  */
 struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
 {
-	fp->bpf_func = interpreters[round_down(fp->aux->stack_depth, 32) / 32];
+	u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1);
+
+	fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1];
 
 	/* eBPF JITs can rewrite the program in case constant
 	 * blinding is active. However, in case of error during
-- 
cgit v1.2.3


From 6bdf6abc56b53103324dfd270a86580306e1a232 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 29 Jun 2017 03:04:59 +0200
Subject: bpf: prevent leaking pointer via xadd on unpriviledged

Leaking kernel addresses on unpriviledged is generally disallowed,
for example, verifier rejects the following:

  0: (b7) r0 = 0
  1: (18) r2 = 0xffff897e82304400
  3: (7b) *(u64 *)(r1 +48) = r2
  R2 leaks addr into ctx

Doing pointer arithmetic on them is also forbidden, so that they
don't turn into unknown value and then get leaked out. However,
there's xadd as a special case, where we don't check the src reg
for being a pointer register, e.g. the following will pass:

  0: (b7) r0 = 0
  1: (7b) *(u64 *)(r1 +48) = r0
  2: (18) r2 = 0xffff897e82304400 ; map
  4: (db) lock *(u64 *)(r1 +48) += r2
  5: (95) exit

We could store the pointer into skb->cb, loose the type context,
and then read it out from there again to leak it eventually out
of a map value. Or more easily in a different variant, too:

   0: (bf) r6 = r1
   1: (7a) *(u64 *)(r10 -8) = 0
   2: (bf) r2 = r10
   3: (07) r2 += -8
   4: (18) r1 = 0x0
   6: (85) call bpf_map_lookup_elem#1
   7: (15) if r0 == 0x0 goto pc+3
   R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R6=ctx R10=fp
   8: (b7) r3 = 0
   9: (7b) *(u64 *)(r0 +0) = r3
  10: (db) lock *(u64 *)(r0 +0) += r6
  11: (b7) r0 = 0
  12: (95) exit

  from 7 to 11: R0=inv,min_value=0,max_value=0 R6=ctx R10=fp
  11: (b7) r0 = 0
  12: (95) exit

Prevent this by checking xadd src reg for pointer types. Also
add a couple of test cases related to this.

Fixes: 1be7f75d1668 ("bpf: enable non-root eBPF programs")
Fixes: 17a5267067f3 ("bpf: verifier (add verifier core)")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 339c8a1371de..a8a725697bed 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -989,6 +989,11 @@ static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn)
 	if (err)
 		return err;
 
+	if (is_pointer_value(env, insn->src_reg)) {
+		verbose("R%d leaks addr into mem\n", insn->src_reg);
+		return -EACCES;
+	}
+
 	/* check whether atomic_add can read the memory */
 	err = check_mem_access(env, insn->dst_reg, insn->off,
 			       BPF_SIZE(insn->code), BPF_READ, -1);
-- 
cgit v1.2.3


From e4448ed87ccdbacb74871736f63220642242b32f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 13 May 2017 18:43:00 -0400
Subject: bpf: don't open-code memdup_user()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/bpf/syscall.c | 45 ++++++++++++++++-----------------------------
 1 file changed, 16 insertions(+), 29 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index fd2411fd6914..4b8b10bddfde 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -322,14 +322,11 @@ static int map_lookup_elem(union bpf_attr *attr)
 	if (IS_ERR(map))
 		return PTR_ERR(map);
 
-	err = -ENOMEM;
-	key = kmalloc(map->key_size, GFP_USER);
-	if (!key)
+	key = memdup_user(ukey, map->key_size);
+	if (IS_ERR(key)) {
+		err = PTR_ERR(key);
 		goto err_put;
-
-	err = -EFAULT;
-	if (copy_from_user(key, ukey, map->key_size) != 0)
-		goto free_key;
+	}
 
 	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
 	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
@@ -402,14 +399,11 @@ static int map_update_elem(union bpf_attr *attr)
 	if (IS_ERR(map))
 		return PTR_ERR(map);
 
-	err = -ENOMEM;
-	key = kmalloc(map->key_size, GFP_USER);
-	if (!key)
+	key = memdup_user(ukey, map->key_size);
+	if (IS_ERR(key)) {
+		err = PTR_ERR(key);
 		goto err_put;
-
-	err = -EFAULT;
-	if (copy_from_user(key, ukey, map->key_size) != 0)
-		goto free_key;
+	}
 
 	if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
 	    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
@@ -488,14 +482,11 @@ static int map_delete_elem(union bpf_attr *attr)
 	if (IS_ERR(map))
 		return PTR_ERR(map);
 
-	err = -ENOMEM;
-	key = kmalloc(map->key_size, GFP_USER);
-	if (!key)
+	key = memdup_user(ukey, map->key_size);
+	if (IS_ERR(key)) {
+		err = PTR_ERR(key);
 		goto err_put;
-
-	err = -EFAULT;
-	if (copy_from_user(key, ukey, map->key_size) != 0)
-		goto free_key;
+	}
 
 	preempt_disable();
 	__this_cpu_inc(bpf_prog_active);
@@ -507,7 +498,6 @@ static int map_delete_elem(union bpf_attr *attr)
 
 	if (!err)
 		trace_bpf_map_delete_elem(map, ufd, key);
-free_key:
 	kfree(key);
 err_put:
 	fdput(f);
@@ -536,14 +526,11 @@ static int map_get_next_key(union bpf_attr *attr)
 		return PTR_ERR(map);
 
 	if (ukey) {
-		err = -ENOMEM;
-		key = kmalloc(map->key_size, GFP_USER);
-		if (!key)
+		key = memdup_user(ukey, map->key_size);
+		if (IS_ERR(key)) {
+			err = PTR_ERR(key);
 			goto err_put;
-
-		err = -EFAULT;
-		if (copy_from_user(key, ukey, map->key_size) != 0)
-			goto free_key;
+		}
 	} else {
 		key = NULL;
 	}
-- 
cgit v1.2.3


From 40304b2a1567fecc321f640ee4239556dd0f3ee0 Mon Sep 17 00:00:00 2001
From: Lawrence Brakmo <brakmo@fb.com>
Date: Fri, 30 Jun 2017 20:02:40 -0700
Subject: bpf: BPF support for sock_ops

Created a new BPF program type, BPF_PROG_TYPE_SOCK_OPS, and a corresponding
struct that allows BPF programs of this type to access some of the
socket's fields (such as IP addresses, ports, etc.). It uses the
existing bpf cgroups infrastructure so the programs can be attached per
cgroup with full inheritance support. The program will be called at
appropriate times to set relevant connections parameters such as buffer
sizes, SYN and SYN-ACK RTOs, etc., based on connection information such
as IP addresses, port numbers, etc.

Alghough there are already 3 mechanisms to set parameters (sysctls,
route metrics and setsockopts), this new mechanism provides some
distinct advantages. Unlike sysctls, it can set parameters per
connection. In contrast to route metrics, it can also use port numbers
and information provided by a user level program. In addition, it could
set parameters probabilistically for evaluation purposes (i.e. do
something different on 10% of the flows and compare results with the
other 90% of the flows). Also, in cases where IPv6 addresses contain
geographic information, the rules to make changes based on the distance
(or RTT) between the hosts are much easier than route metric rules and
can be global. Finally, unlike setsockopt, it oes not require
application changes and it can be updated easily at any time.

Although the bpf cgroup framework already contains a sock related
program type (BPF_PROG_TYPE_CGROUP_SOCK), I created the new type
(BPF_PROG_TYPE_SOCK_OPS) beccause the existing type expects to be called
only once during the connections's lifetime. In contrast, the new
program type will be called multiple times from different places in the
network stack code.  For example, before sending SYN and SYN-ACKs to set
an appropriate timeout, when the connection is established to set
congestion control, etc. As a result it has "op" field to specify the
type of operation requested.

The purpose of this new program type is to simplify setting connection
parameters, such as buffer sizes, TCP's SYN RTO, etc. For example, it is
easy to use facebook's internal IPv6 addresses to determine if both hosts
of a connection are in the same datacenter. Therefore, it is easy to
write a BPF program to choose a small SYN RTO value when both hosts are
in the same datacenter.

This patch only contains the framework to support the new BPF program
type, following patches add the functionality to set various connection
parameters.

This patch defines a new BPF program type: BPF_PROG_TYPE_SOCKET_OPS
and a new bpf syscall command to load a new program of this type:
BPF_PROG_LOAD_SOCKET_OPS.

Two new corresponding structs (one for the kernel one for the user/BPF
program):

/* kernel version */
struct bpf_sock_ops_kern {
        struct sock *sk;
        __u32  op;
        union {
                __u32 reply;
                __u32 replylong[4];
        };
};

/* user version
 * Some fields are in network byte order reflecting the sock struct
 * Use the bpf_ntohl helper macro in samples/bpf/bpf_endian.h to
 * convert them to host byte order.
 */
struct bpf_sock_ops {
        __u32 op;
        union {
                __u32 reply;
                __u32 replylong[4];
        };
        __u32 family;
        __u32 remote_ip4;     /* In network byte order */
        __u32 local_ip4;      /* In network byte order */
        __u32 remote_ip6[4];  /* In network byte order */
        __u32 local_ip6[4];   /* In network byte order */
        __u32 remote_port;    /* In network byte order */
        __u32 local_port;     /* In host byte horder */
};

Currently there are two types of ops. The first type expects the BPF
program to return a value which is then used by the caller (or a
negative value to indicate the operation is not supported). The second
type expects state changes to be done by the BPF program, for example
through a setsockopt BPF helper function, and they ignore the return
value.

The reply fields of the bpf_sockt_ops struct are there in case a bpf
program needs to return a value larger than an integer.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/cgroup.c  | 37 +++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c |  5 +++++
 2 files changed, 42 insertions(+)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index ea6033cba947..546113430049 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -236,3 +236,40 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
 	return ret;
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
+
+/**
+ * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock
+ * @sk: socket to get cgroup from
+ * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains
+ * sk with connection information (IP addresses, etc.) May not contain
+ * cgroup info if it is a req sock.
+ * @type: The type of program to be exectuted
+ *
+ * socket passed is expected to be of type INET or INET6.
+ *
+ * The program type passed in via @type must be suitable for sock_ops
+ * filtering. No further check is performed to assert that.
+ *
+ * This function will return %-EPERM if any if an attached program was found
+ * and if it returned != 1 during execution. In all other cases, 0 is returned.
+ */
+int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
+				     struct bpf_sock_ops_kern *sock_ops,
+				     enum bpf_attach_type type)
+{
+	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+	struct bpf_prog *prog;
+	int ret = 0;
+
+
+	rcu_read_lock();
+
+	prog = rcu_dereference(cgrp->bpf.effective[type]);
+	if (prog)
+		ret = BPF_PROG_RUN(prog, sock_ops) == 1 ? 0 : -EPERM;
+
+	rcu_read_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 4409ccca8831..d4d47de75bba 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1079,6 +1079,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	case BPF_CGROUP_INET_SOCK_CREATE:
 		ptype = BPF_PROG_TYPE_CGROUP_SOCK;
 		break;
+	case BPF_CGROUP_SOCK_OPS:
+		ptype = BPF_PROG_TYPE_SOCK_OPS;
+		break;
 	default:
 		return -EINVAL;
 	}
@@ -1119,6 +1122,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 	case BPF_CGROUP_INET_INGRESS:
 	case BPF_CGROUP_INET_EGRESS:
 	case BPF_CGROUP_INET_SOCK_CREATE:
+	case BPF_CGROUP_SOCK_OPS:
 		cgrp = cgroup_get_from_fd(attr->target_fd);
 		if (IS_ERR(cgrp))
 			return PTR_ERR(cgrp);
@@ -1133,6 +1137,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 
 	return ret;
 }
+
 #endif /* CONFIG_CGROUP_BPF */
 
 #define BPF_PROG_TEST_RUN_LAST_FIELD test.duration
-- 
cgit v1.2.3


From f96da09473b52c09125cc9bf7d7d4576ae8229e0 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sun, 2 Jul 2017 02:13:27 +0200
Subject: bpf: simplify narrower ctx access

This work tries to make the semantics and code around the
narrower ctx access a bit easier to follow. Right now
everything is done inside the .is_valid_access(). Offset
matching is done differently for read/write types, meaning
writes don't support narrower access and thus matching only
on offsetof(struct foo, bar) is enough whereas for read
case that supports narrower access we must check for
offsetof(struct foo, bar) + offsetof(struct foo, bar) +
sizeof(<bar>) - 1 for each of the cases. For read cases of
individual members that don't support narrower access (like
packet pointers or skb->cb[] case which has its own narrow
access logic), we check as usual only offsetof(struct foo,
bar) like in write case. Then, for the case where narrower
access is allowed, we also need to set the aux info for the
access. Meaning, ctx_field_size and converted_op_size have
to be set. First is the original field size e.g. sizeof(<bar>)
as in above example from the user facing ctx, and latter
one is the target size after actual rewrite happened, thus
for the kernel facing ctx. Also here we need the range match
and we need to keep track changing convert_ctx_access() and
converted_op_size from is_valid_access() as both are not at
the same location.

We can simplify the code a bit: check_ctx_access() becomes
simpler in that we only store ctx_field_size as a meta data
and later in convert_ctx_accesses() we fetch the target_size
right from the location where we do convert. Should the verifier
be misconfigured we do reject for BPF_WRITE cases or target_size
that are not provided. For the subsystems, we always work on
ranges in is_valid_access() and add small helpers for ranges
and narrow access, convert_ctx_accesses() sets target_size
for the relevant instruction.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Cc: Yonghong Song <yhs@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 78 ++++++++++++++++++++++-----------------------------
 1 file changed, 34 insertions(+), 44 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6ea2adcb233b..6f820a044079 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -546,20 +546,6 @@ static int check_reg_arg(struct bpf_reg_state *regs, u32 regno,
 	return 0;
 }
 
-static int bpf_size_to_bytes(int bpf_size)
-{
-	if (bpf_size == BPF_W)
-		return 4;
-	else if (bpf_size == BPF_H)
-		return 2;
-	else if (bpf_size == BPF_B)
-		return 1;
-	else if (bpf_size == BPF_DW)
-		return 8;
-	else
-		return -EINVAL;
-}
-
 static bool is_spillable_regtype(enum bpf_reg_type type)
 {
 	switch (type) {
@@ -761,7 +747,9 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
 static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size,
 			    enum bpf_access_type t, enum bpf_reg_type *reg_type)
 {
-	struct bpf_insn_access_aux info = { .reg_type = *reg_type };
+	struct bpf_insn_access_aux info = {
+		.reg_type = *reg_type,
+	};
 
 	/* for analyzer ctx accesses are already validated and converted */
 	if (env->analyzer_ops)
@@ -769,25 +757,14 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
 
 	if (env->prog->aux->ops->is_valid_access &&
 	    env->prog->aux->ops->is_valid_access(off, size, t, &info)) {
-		/* a non zero info.ctx_field_size indicates:
-		 * . For this field, the prog type specific ctx conversion algorithm
-		 *   only supports whole field access.
-		 * . This ctx access is a candiate for later verifier transformation
-		 *   to load the whole field and then apply a mask to get correct result.
-		 * a non zero info.converted_op_size indicates perceived actual converted
-		 * value width in convert_ctx_access.
+		/* A non zero info.ctx_field_size indicates that this field is a
+		 * candidate for later verifier transformation to load the whole
+		 * field and then apply a mask when accessed with a narrower
+		 * access than actual ctx access size. A zero info.ctx_field_size
+		 * will only allow for whole field access and rejects any other
+		 * type of narrower access.
 		 */
-		if ((info.ctx_field_size && !info.converted_op_size) ||
-		    (!info.ctx_field_size &&  info.converted_op_size)) {
-			verbose("verifier bug in is_valid_access prog type=%u off=%d size=%d\n",
-				env->prog->type, off, size);
-			return -EACCES;
-		}
-
-		if (info.ctx_field_size) {
-			env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
-			env->insn_aux_data[insn_idx].converted_op_size = info.converted_op_size;
-		}
+		env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
 		*reg_type = info.reg_type;
 
 		/* remember the offset of last byte accessed in ctx */
@@ -3401,11 +3378,13 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
 static int convert_ctx_accesses(struct bpf_verifier_env *env)
 {
 	const struct bpf_verifier_ops *ops = env->prog->aux->ops;
+	int i, cnt, size, ctx_field_size, delta = 0;
 	const int insn_cnt = env->prog->len;
 	struct bpf_insn insn_buf[16], *insn;
 	struct bpf_prog *new_prog;
 	enum bpf_access_type type;
-	int i, cnt, off, size, ctx_field_size, converted_op_size, is_narrower_load, delta = 0;
+	bool is_narrower_load;
+	u32 target_size;
 
 	if (ops->gen_prologue) {
 		cnt = ops->gen_prologue(insn_buf, env->seen_direct_write,
@@ -3445,39 +3424,50 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 		if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX)
 			continue;
 
-		off = insn->off;
-		size = bpf_size_to_bytes(BPF_SIZE(insn->code));
 		ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size;
-		converted_op_size = env->insn_aux_data[i + delta].converted_op_size;
-		is_narrower_load = type == BPF_READ && size < ctx_field_size;
+		size = BPF_LDST_BYTES(insn);
 
 		/* If the read access is a narrower load of the field,
 		 * convert to a 4/8-byte load, to minimum program type specific
 		 * convert_ctx_access changes. If conversion is successful,
 		 * we will apply proper mask to the result.
 		 */
+		is_narrower_load = size < ctx_field_size;
 		if (is_narrower_load) {
-			int size_code = BPF_H;
+			u32 off = insn->off;
+			u8 size_code;
+
+			if (type == BPF_WRITE) {
+				verbose("bpf verifier narrow ctx access misconfigured\n");
+				return -EINVAL;
+			}
 
+			size_code = BPF_H;
 			if (ctx_field_size == 4)
 				size_code = BPF_W;
 			else if (ctx_field_size == 8)
 				size_code = BPF_DW;
+
 			insn->off = off & ~(ctx_field_size - 1);
 			insn->code = BPF_LDX | BPF_MEM | size_code;
 		}
-		cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog);
-		if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
+
+		target_size = 0;
+		cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog,
+					      &target_size);
+		if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) ||
+		    (ctx_field_size && !target_size)) {
 			verbose("bpf verifier is misconfigured\n");
 			return -EINVAL;
 		}
-		if (is_narrower_load && size < converted_op_size) {
+
+		if (is_narrower_load && size < target_size) {
 			if (ctx_field_size <= 4)
 				insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg,
-							(1 << size * 8) - 1);
+								(1 << size * 8) - 1);
 			else
 				insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg,
-							(1 << size * 8) - 1);
+								(1 << size * 8) - 1);
 		}
 
 		new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
-- 
cgit v1.2.3


From 9780c0ab1a4e64ef6998c4d83f9df5be806a02dc Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Sun, 2 Jul 2017 02:13:28 +0200
Subject: bpf: export whether tail call has jited owner

We do export through fdinfo already whether a prog is JITed or not,
given a program load can fail in case of either prog or tail call map
has JITed property, but neither both are JITed or not JITed, we can
facilitate error reporting in loaders like iproute2 through exporting
owner_jited of tail call map. We already do export owner_prog_type
through this facility, so parser can pick up both for comparison.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/syscall.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index d4d47de75bba..18980472f5b0 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -216,10 +216,12 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
 	const struct bpf_map *map = filp->private_data;
 	const struct bpf_array *array;
 	u32 owner_prog_type = 0;
+	u32 owner_jited = 0;
 
 	if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) {
 		array = container_of(map, struct bpf_array, map);
 		owner_prog_type = array->owner_prog_type;
+		owner_jited = array->owner_jited;
 	}
 
 	seq_printf(m,
@@ -236,9 +238,12 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
 		   map->map_flags,
 		   map->pages * 1ULL << PAGE_SHIFT);
 
-	if (owner_prog_type)
+	if (owner_prog_type) {
 		seq_printf(m, "owner_prog_type:\t%u\n",
 			   owner_prog_type);
+		seq_printf(m, "owner_jited:\t%u\n",
+			   owner_jited);
+	}
 }
 #endif
 
-- 
cgit v1.2.3


From 43188702b3d98d2792969a3377a30957f05695e6 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Sun, 2 Jul 2017 02:13:30 +0200
Subject: bpf, verifier: add additional patterns to evaluate_reg_imm_alu

Currently the verifier does not track imm across alu operations when
the source register is of unknown type. This adds additional pattern
matching to catch this and track imm. We've seen LLVM generating this
pattern while working on cilium.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6f820a044079..6a86723c5b64 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1657,6 +1657,65 @@ static int evaluate_reg_alu(struct bpf_verifier_env *env, struct bpf_insn *insn)
 	return 0;
 }
 
+static int evaluate_reg_imm_alu_unknown(struct bpf_verifier_env *env,
+					struct bpf_insn *insn)
+{
+	struct bpf_reg_state *regs = env->cur_state.regs;
+	struct bpf_reg_state *dst_reg = &regs[insn->dst_reg];
+	struct bpf_reg_state *src_reg = &regs[insn->src_reg];
+	u8 opcode = BPF_OP(insn->code);
+	s64 imm_log2 = __ilog2_u64((long long)dst_reg->imm);
+
+	/* BPF_X code with src_reg->type UNKNOWN_VALUE here. */
+	if (src_reg->imm > 0 && dst_reg->imm) {
+		switch (opcode) {
+		case BPF_ADD:
+			/* dreg += sreg
+			 * where both have zero upper bits. Adding them
+			 * can only result making one more bit non-zero
+			 * in the larger value.
+			 * Ex. 0xffff (imm=48) + 1 (imm=63) = 0x10000 (imm=47)
+			 *     0xffff (imm=48) + 0xffff = 0x1fffe (imm=47)
+			 */
+			dst_reg->imm = min(src_reg->imm, 63 - imm_log2);
+			dst_reg->imm--;
+			break;
+		case BPF_AND:
+			/* dreg &= sreg
+			 * AND can not extend zero bits only shrink
+			 * Ex.  0x00..00ffffff
+			 *    & 0x0f..ffffffff
+			 *     ----------------
+			 *      0x00..00ffffff
+			 */
+			dst_reg->imm = max(src_reg->imm, 63 - imm_log2);
+			break;
+		case BPF_OR:
+			/* dreg |= sreg
+			 * OR can only extend zero bits
+			 * Ex.  0x00..00ffffff
+			 *    | 0x0f..ffffffff
+			 *     ----------------
+			 *      0x0f..00ffffff
+			 */
+			dst_reg->imm = min(src_reg->imm, 63 - imm_log2);
+			break;
+		case BPF_SUB:
+		case BPF_MUL:
+		case BPF_RSH:
+		case BPF_LSH:
+			/* These may be flushed out later */
+		default:
+			mark_reg_unknown_value(regs, insn->dst_reg);
+		}
+	} else {
+		mark_reg_unknown_value(regs, insn->dst_reg);
+	}
+
+	dst_reg->type = UNKNOWN_VALUE;
+	return 0;
+}
+
 static int evaluate_reg_imm_alu(struct bpf_verifier_env *env,
 				struct bpf_insn *insn)
 {
@@ -1666,6 +1725,9 @@ static int evaluate_reg_imm_alu(struct bpf_verifier_env *env,
 	u8 opcode = BPF_OP(insn->code);
 	u64 dst_imm = dst_reg->imm;
 
+	if (BPF_SRC(insn->code) == BPF_X && src_reg->type == UNKNOWN_VALUE)
+		return evaluate_reg_imm_alu_unknown(env, insn);
+
 	/* dst_reg->type == CONST_IMM here. Simulate execution of insns
 	 * containing ALU ops. Don't care about overflow or negative
 	 * values, just add/sub/... them; registers are in u64.
-- 
cgit v1.2.3


From 4cc7c1864bbd4cf80f6bdc8ba3217de5aa5f4688 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 5 Jul 2017 16:24:49 +0100
Subject: bpf: Implement show_options

Implement the show_options superblock op for bpf as part of a bid to get
rid of s_options and generic_show_options() to make it easier to implement
a context-based mount where the mount options can be passed individually
over a file descriptor.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Alexei Starovoitov <ast@kernel.org>
cc: Daniel Borkmann <daniel@iogearbox.net>
cc: netdev@vger.kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/bpf/inode.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 9bbd33497d3d..e833ed914358 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -377,10 +377,22 @@ static void bpf_evict_inode(struct inode *inode)
 		bpf_any_put(inode->i_private, type);
 }
 
+/*
+ * Display the mount options in /proc/mounts.
+ */
+static int bpf_show_options(struct seq_file *m, struct dentry *root)
+{
+	umode_t mode = d_inode(root)->i_mode & S_IALLUGO & ~S_ISVTX;
+
+	if (mode != S_IRWXUGO)
+		seq_printf(m, ",mode=%o", mode);
+	return 0;
+}
+
 static const struct super_operations bpf_super_ops = {
 	.statfs		= simple_statfs,
 	.drop_inode	= generic_delete_inode,
-	.show_options	= generic_show_options,
+	.show_options	= bpf_show_options,
 	.evict_inode	= bpf_evict_inode,
 };
 
@@ -434,8 +446,6 @@ static int bpf_fill_super(struct super_block *sb, void *data, int silent)
 	struct inode *inode;
 	int ret;
 
-	save_mount_options(sb, data);
-
 	ret = bpf_parse_options(data, &opts);
 	if (ret)
 		return ret;
-- 
cgit v1.2.3


From 4cabc5b186b5427b9ee5a7495172542af105f02b Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 21 Jul 2017 00:00:21 +0200
Subject: bpf: fix mixed signed/unsigned derived min/max value bounds

Edward reported that there's an issue in min/max value bounds
tracking when signed and unsigned compares both provide hints
on limits when having unknown variables. E.g. a program such
as the following should have been rejected:

   0: (7a) *(u64 *)(r10 -8) = 0
   1: (bf) r2 = r10
   2: (07) r2 += -8
   3: (18) r1 = 0xffff8a94cda93400
   5: (85) call bpf_map_lookup_elem#1
   6: (15) if r0 == 0x0 goto pc+7
  R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R10=fp
   7: (7a) *(u64 *)(r10 -16) = -8
   8: (79) r1 = *(u64 *)(r10 -16)
   9: (b7) r2 = -1
  10: (2d) if r1 > r2 goto pc+3
  R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=0
  R2=imm-1,max_value=18446744073709551615,min_align=1 R10=fp
  11: (65) if r1 s> 0x1 goto pc+2
  R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=0,max_value=1
  R2=imm-1,max_value=18446744073709551615,min_align=1 R10=fp
  12: (0f) r0 += r1
  13: (72) *(u8 *)(r0 +0) = 0
  R0=map_value_adj(ks=8,vs=8,id=0),min_value=0,max_value=1 R1=inv,min_value=0,max_value=1
  R2=imm-1,max_value=18446744073709551615,min_align=1 R10=fp
  14: (b7) r0 = 0
  15: (95) exit

What happens is that in the first part ...

   8: (79) r1 = *(u64 *)(r10 -16)
   9: (b7) r2 = -1
  10: (2d) if r1 > r2 goto pc+3

... r1 carries an unsigned value, and is compared as unsigned
against a register carrying an immediate. Verifier deduces in
reg_set_min_max() that since the compare is unsigned and operation
is greater than (>), that in the fall-through/false case, r1's
minimum bound must be 0 and maximum bound must be r2. Latter is
larger than the bound and thus max value is reset back to being
'invalid' aka BPF_REGISTER_MAX_RANGE. Thus, r1 state is now
'R1=inv,min_value=0'. The subsequent test ...

  11: (65) if r1 s> 0x1 goto pc+2

... is a signed compare of r1 with immediate value 1. Here,
verifier deduces in reg_set_min_max() that since the compare
is signed this time and operation is greater than (>), that
in the fall-through/false case, we can deduce that r1's maximum
bound must be 1, meaning with prior test, we result in r1 having
the following state: R1=inv,min_value=0,max_value=1. Given that
the actual value this holds is -8, the bounds are wrongly deduced.
When this is being added to r0 which holds the map_value(_adj)
type, then subsequent store access in above case will go through
check_mem_access() which invokes check_map_access_adj(), that
will then probe whether the map memory is in bounds based
on the min_value and max_value as well as access size since
the actual unknown value is min_value <= x <= max_value; commit
fce366a9dd0d ("bpf, verifier: fix alu ops against map_value{,
_adj} register types") provides some more explanation on the
semantics.

It's worth to note in this context that in the current code,
min_value and max_value tracking are used for two things, i)
dynamic map value access via check_map_access_adj() and since
commit 06c1c049721a ("bpf: allow helpers access to variable memory")
ii) also enforced at check_helper_mem_access() when passing a
memory address (pointer to packet, map value, stack) and length
pair to a helper and the length in this case is an unknown value
defining an access range through min_value/max_value in that
case. The min_value/max_value tracking is /not/ used in the
direct packet access case to track ranges. However, the issue
also affects case ii), for example, the following crafted program
based on the same principle must be rejected as well:

   0: (b7) r2 = 0
   1: (bf) r3 = r10
   2: (07) r3 += -512
   3: (7a) *(u64 *)(r10 -16) = -8
   4: (79) r4 = *(u64 *)(r10 -16)
   5: (b7) r6 = -1
   6: (2d) if r4 > r6 goto pc+5
  R1=ctx R2=imm0,min_value=0,max_value=0,min_align=2147483648 R3=fp-512
  R4=inv,min_value=0 R6=imm-1,max_value=18446744073709551615,min_align=1 R10=fp
   7: (65) if r4 s> 0x1 goto pc+4
  R1=ctx R2=imm0,min_value=0,max_value=0,min_align=2147483648 R3=fp-512
  R4=inv,min_value=0,max_value=1 R6=imm-1,max_value=18446744073709551615,min_align=1
  R10=fp
   8: (07) r4 += 1
   9: (b7) r5 = 0
  10: (6a) *(u16 *)(r10 -512) = 0
  11: (85) call bpf_skb_load_bytes#26
  12: (b7) r0 = 0
  13: (95) exit

Meaning, while we initialize the max_value stack slot that the
verifier thinks we access in the [1,2] range, in reality we
pass -7 as length which is interpreted as u32 in the helper.
Thus, this issue is relevant also for the case of helper ranges.
Resetting both bounds in check_reg_overflow() in case only one
of them exceeds limits is also not enough as similar test can be
created that uses values which are within range, thus also here
learned min value in r1 is incorrect when mixed with later signed
test to create a range:

   0: (7a) *(u64 *)(r10 -8) = 0
   1: (bf) r2 = r10
   2: (07) r2 += -8
   3: (18) r1 = 0xffff880ad081fa00
   5: (85) call bpf_map_lookup_elem#1
   6: (15) if r0 == 0x0 goto pc+7
  R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R10=fp
   7: (7a) *(u64 *)(r10 -16) = -8
   8: (79) r1 = *(u64 *)(r10 -16)
   9: (b7) r2 = 2
  10: (3d) if r2 >= r1 goto pc+3
  R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3
  R2=imm2,min_value=2,max_value=2,min_align=2 R10=fp
  11: (65) if r1 s> 0x4 goto pc+2
  R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0
  R1=inv,min_value=3,max_value=4 R2=imm2,min_value=2,max_value=2,min_align=2 R10=fp
  12: (0f) r0 += r1
  13: (72) *(u8 *)(r0 +0) = 0
  R0=map_value_adj(ks=8,vs=8,id=0),min_value=3,max_value=4
  R1=inv,min_value=3,max_value=4 R2=imm2,min_value=2,max_value=2,min_align=2 R10=fp
  14: (b7) r0 = 0
  15: (95) exit

This leaves us with two options for fixing this: i) to invalidate
all prior learned information once we switch signed context, ii)
to track min/max signed and unsigned boundaries separately as
done in [0]. (Given latter introduces major changes throughout
the whole verifier, it's rather net-next material, thus this
patch follows option i), meaning we can derive bounds either
from only signed tests or only unsigned tests.) There is still the
case of adjust_reg_min_max_vals(), where we adjust bounds on ALU
operations, meaning programs like the following where boundaries
on the reg get mixed in context later on when bounds are merged
on the dst reg must get rejected, too:

   0: (7a) *(u64 *)(r10 -8) = 0
   1: (bf) r2 = r10
   2: (07) r2 += -8
   3: (18) r1 = 0xffff89b2bf87ce00
   5: (85) call bpf_map_lookup_elem#1
   6: (15) if r0 == 0x0 goto pc+6
  R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R10=fp
   7: (7a) *(u64 *)(r10 -16) = -8
   8: (79) r1 = *(u64 *)(r10 -16)
   9: (b7) r2 = 2
  10: (3d) if r2 >= r1 goto pc+2
  R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3
  R2=imm2,min_value=2,max_value=2,min_align=2 R10=fp
  11: (b7) r7 = 1
  12: (65) if r7 s> 0x0 goto pc+2
  R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3
  R2=imm2,min_value=2,max_value=2,min_align=2 R7=imm1,max_value=0 R10=fp
  13: (b7) r0 = 0
  14: (95) exit

  from 12 to 15: R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0
  R1=inv,min_value=3 R2=imm2,min_value=2,max_value=2,min_align=2 R7=imm1,min_value=1 R10=fp
  15: (0f) r7 += r1
  16: (65) if r7 s> 0x4 goto pc+2
  R0=map_value(ks=8,vs=8,id=0),min_value=0,max_value=0 R1=inv,min_value=3
  R2=imm2,min_value=2,max_value=2,min_align=2 R7=inv,min_value=4,max_value=4 R10=fp
  17: (0f) r0 += r7
  18: (72) *(u8 *)(r0 +0) = 0
  R0=map_value_adj(ks=8,vs=8,id=0),min_value=4,max_value=4 R1=inv,min_value=3
  R2=imm2,min_value=2,max_value=2,min_align=2 R7=inv,min_value=4,max_value=4 R10=fp
  19: (b7) r0 = 0
  20: (95) exit

Meaning, in adjust_reg_min_max_vals() we must also reset range
values on the dst when src/dst registers have mixed signed/
unsigned derived min/max value bounds with one unbounded value
as otherwise they can be added together deducing false boundaries.
Once both boundaries are established from either ALU ops or
compare operations w/o mixing signed/unsigned insns, then they
can safely be added to other regs also having both boundaries
established. Adding regs with one unbounded side to a map value
where the bounded side has been learned w/o mixing ops is
possible, but the resulting map value won't recover from that,
meaning such op is considered invalid on the time of actual
access. Invalid bounds are set on the dst reg in case i) src reg,
or ii) in case dst reg already had them. The only way to recover
would be to perform i) ALU ops but only 'add' is allowed on map
value types or ii) comparisons, but these are disallowed on
pointers in case they span a range. This is fine as only BPF_JEQ
and BPF_JNE may be performed on PTR_TO_MAP_VALUE_OR_NULL registers
which potentially turn them into PTR_TO_MAP_VALUE type depending
on the branch, so only here min/max value cannot be invalidated
for them.

In terms of state pruning, value_from_signed is considered
as well in states_equal() when dealing with adjusted map values.
With regards to breaking existing programs, there is a small
risk, but use-cases are rather quite narrow where this could
occur and mixing compares probably unlikely.

Joint work with Josef and Edward.

  [0] https://lists.iovisor.org/pipermail/iovisor-dev/2017-June/000822.html

Fixes: 484611357c19 ("bpf: allow access into map value arrays")
Reported-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Edward Cree <ecree@solarflare.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 108 +++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 94 insertions(+), 14 deletions(-)

(limited to 'kernel/bpf')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6a86723c5b64..af9e84a4944e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -504,6 +504,7 @@ static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno)
 {
 	regs[regno].min_value = BPF_REGISTER_MIN_RANGE;
 	regs[regno].max_value = BPF_REGISTER_MAX_RANGE;
+	regs[regno].value_from_signed = false;
 	regs[regno].min_align = 0;
 }
 
@@ -777,12 +778,13 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
 	return -EACCES;
 }
 
-static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
+static bool __is_pointer_value(bool allow_ptr_leaks,
+			       const struct bpf_reg_state *reg)
 {
-	if (env->allow_ptr_leaks)
+	if (allow_ptr_leaks)
 		return false;
 
-	switch (env->cur_state.regs[regno].type) {
+	switch (reg->type) {
 	case UNKNOWN_VALUE:
 	case CONST_IMM:
 		return false;
@@ -791,6 +793,11 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
 	}
 }
 
+static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
+{
+	return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]);
+}
+
 static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
 				   int off, int size, bool strict)
 {
@@ -1832,10 +1839,24 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 	dst_align = dst_reg->min_align;
 
 	/* We don't know anything about what was done to this register, mark it
-	 * as unknown.
+	 * as unknown. Also, if both derived bounds came from signed/unsigned
+	 * mixed compares and one side is unbounded, we cannot really do anything
+	 * with them as boundaries cannot be trusted. Thus, arithmetic of two
+	 * regs of such kind will get invalidated bounds on the dst side.
 	 */
-	if (min_val == BPF_REGISTER_MIN_RANGE &&
-	    max_val == BPF_REGISTER_MAX_RANGE) {
+	if ((min_val == BPF_REGISTER_MIN_RANGE &&
+	     max_val == BPF_REGISTER_MAX_RANGE) ||
+	    (BPF_SRC(insn->code) == BPF_X &&
+	     ((min_val != BPF_REGISTER_MIN_RANGE &&
+	       max_val == BPF_REGISTER_MAX_RANGE) ||
+	      (min_val == BPF_REGISTER_MIN_RANGE &&
+	       max_val != BPF_REGISTER_MAX_RANGE) ||
+	      (dst_reg->min_value != BPF_REGISTER_MIN_RANGE &&
+	       dst_reg->max_value == BPF_REGISTER_MAX_RANGE) ||
+	      (dst_reg->min_value == BPF_REGISTER_MIN_RANGE &&
+	       dst_reg->max_value != BPF_REGISTER_MAX_RANGE)) &&
+	     regs[insn->dst_reg].value_from_signed !=
+	     regs[insn->src_reg].value_from_signed)) {
 		reset_reg_range_values(regs, insn->dst_reg);
 		return;
 	}
@@ -2023,6 +2044,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 			regs[insn->dst_reg].max_value = insn->imm;
 			regs[insn->dst_reg].min_value = insn->imm;
 			regs[insn->dst_reg].min_align = calc_align(insn->imm);
+			regs[insn->dst_reg].value_from_signed = false;
 		}
 
 	} else if (opcode > BPF_END) {
@@ -2198,40 +2220,63 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
 			    struct bpf_reg_state *false_reg, u64 val,
 			    u8 opcode)
 {
+	bool value_from_signed = true;
+	bool is_range = true;
+
 	switch (opcode) {
 	case BPF_JEQ:
 		/* If this is false then we know nothing Jon Snow, but if it is
 		 * true then we know for sure.
 		 */
 		true_reg->max_value = true_reg->min_value = val;
+		is_range = false;
 		break;
 	case BPF_JNE:
 		/* If this is true we know nothing Jon Snow, but if it is false
 		 * we know the value for sure;
 		 */
 		false_reg->max_value = false_reg->min_value = val;
+		is_range = false;
 		break;
 	case BPF_JGT:
-		/* Unsigned comparison, the minimum value is 0. */
-		false_reg->min_value = 0;
+		value_from_signed = false;
 		/* fallthrough */
 	case BPF_JSGT:
+		if (true_reg->value_from_signed != value_from_signed)
+			reset_reg_range_values(true_reg, 0);
+		if (false_reg->value_from_signed != value_from_signed)
+			reset_reg_range_values(false_reg, 0);
+		if (opcode == BPF_JGT) {
+			/* Unsigned comparison, the minimum value is 0. */
+			false_reg->min_value = 0;
+		}
 		/* If this is false then we know the maximum val is val,
 		 * otherwise we know the min val is val+1.
 		 */
 		false_reg->max_value = val;
+		false_reg->value_from_signed = value_from_signed;
 		true_reg->min_value = val + 1;
+		true_reg->value_from_signed = value_from_signed;
 		break;
 	case BPF_JGE:
-		/* Unsigned comparison, the minimum value is 0. */
-		false_reg->min_value = 0;
+		value_from_signed = false;
 		/* fallthrough */
 	case BPF_JSGE:
+		if (true_reg->value_from_signed != value_from_signed)
+			reset_reg_range_values(true_reg, 0);
+		if (false_reg->value_from_signed != value_from_signed)
+			reset_reg_range_values(false_reg, 0);
+		if (opcode == BPF_JGE) {
+			/* Unsigned comparison, the minimum value is 0. */
+			false_reg->min_value = 0;
+		}
 		/* If this is false then we know the maximum value is val - 1,
 		 * otherwise we know the mimimum value is val.
 		 */
 		false_reg->max_value = val - 1;
+		false_reg->value_from_signed = value_from_signed;
 		true_reg->min_value = val;
+		true_reg->value_from_signed = value_from_signed;
 		break;
 	default:
 		break;
@@ -2239,6 +2284,12 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
 
 	check_reg_overflow(false_reg);
 	check_reg_overflow(true_reg);
+	if (is_range) {
+		if (__is_pointer_value(false, false_reg))
+			reset_reg_range_values(false_reg, 0);
+		if (__is_pointer_value(false, true_reg))
+			reset_reg_range_values(true_reg, 0);
+	}
 }
 
 /* Same as above, but for the case that dst_reg is a CONST_IMM reg and src_reg
@@ -2248,41 +2299,64 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
 				struct bpf_reg_state *false_reg, u64 val,
 				u8 opcode)
 {
+	bool value_from_signed = true;
+	bool is_range = true;
+
 	switch (opcode) {
 	case BPF_JEQ:
 		/* If this is false then we know nothing Jon Snow, but if it is
 		 * true then we know for sure.
 		 */
 		true_reg->max_value = true_reg->min_value = val;
+		is_range = false;
 		break;
 	case BPF_JNE:
 		/* If this is true we know nothing Jon Snow, but if it is false
 		 * we know the value for sure;
 		 */
 		false_reg->max_value = false_reg->min_value = val;
+		is_range = false;
 		break;
 	case BPF_JGT:
-		/* Unsigned comparison, the minimum value is 0. */
-		true_reg->min_value = 0;
+		value_from_signed = false;
 		/* fallthrough */
 	case BPF_JSGT:
+		if (true_reg->value_from_signed != value_from_signed)
+			reset_reg_range_values(true_reg, 0);
+		if (false_reg->value_from_signed != value_from_signed)
+			reset_reg_range_values(false_reg, 0);
+		if (opcode == BPF_JGT) {
+			/* Unsigned comparison, the minimum value is 0. */
+			true_reg->min_value = 0;
+		}
 		/*
 		 * If this is false, then the val is <= the register, if it is
 		 * true the register <= to the val.
 		 */
 		false_reg->min_value = val;
+		false_reg->value_from_signed = value_from_signed;
 		true_reg->max_value = val - 1;
+		true_reg->value_from_signed = value_from_signed;
 		break;
 	case BPF_JGE:
-		/* Unsigned comparison, the minimum value is 0. */
-		true_reg->min_value = 0;
+		value_from_signed = false;
 		/* fallthrough */
 	case BPF_JSGE:
+		if (true_reg->value_from_signed != value_from_signed)
+			reset_reg_range_values(true_reg, 0);
+		if (false_reg->value_from_signed != value_from_signed)
+			reset_reg_range_values(false_reg, 0);
+		if (opcode == BPF_JGE) {
+			/* Unsigned comparison, the minimum value is 0. */
+			true_reg->min_value = 0;
+		}
 		/* If this is false then constant < register, if it is true then
 		 * the register < constant.
 		 */
 		false_reg->min_value = val + 1;
+		false_reg->value_from_signed = value_from_signed;
 		true_reg->max_value = val;
+		true_reg->value_from_signed = value_from_signed;
 		break;
 	default:
 		break;
@@ -2290,6 +2364,12 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
 
 	check_reg_overflow(false_reg);
 	check_reg_overflow(true_reg);
+	if (is_range) {
+		if (__is_pointer_value(false, false_reg))
+			reset_reg_range_values(false_reg, 0);
+		if (__is_pointer_value(false, true_reg))
+			reset_reg_range_values(true_reg, 0);
+	}
 }
 
 static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id,
-- 
cgit v1.2.3