From b9193c1b61ddb97da4713155b0d580e41fb544ac Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Sat, 24 Mar 2018 11:44:22 -0700
Subject: bpf: Rename bpf_verifer_log

bpf_verifer_log =>
bpf_verifier_log

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf_verifier.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 6b66cd1aa0b9..c30668414b22 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -153,7 +153,7 @@ struct bpf_insn_aux_data {
 
 #define BPF_VERIFIER_TMP_LOG_SIZE	1024
 
-struct bpf_verifer_log {
+struct bpf_verifier_log {
 	u32 level;
 	char kbuf[BPF_VERIFIER_TMP_LOG_SIZE];
 	char __user *ubuf;
@@ -161,7 +161,7 @@ struct bpf_verifer_log {
 	u32 len_total;
 };
 
-static inline bool bpf_verifier_log_full(const struct bpf_verifer_log *log)
+static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log)
 {
 	return log->len_used >= log->len_total - 1;
 }
@@ -185,7 +185,7 @@ struct bpf_verifier_env {
 	bool allow_ptr_leaks;
 	bool seen_direct_write;
 	struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */
-	struct bpf_verifer_log log;
+	struct bpf_verifier_log log;
 	u32 subprog_starts[BPF_MAX_SUBPROGS];
 	/* computes the stack depth of each bpf function */
 	u16 subprog_stack_depth[BPF_MAX_SUBPROGS + 1];
-- 
cgit v1.2.3


From 77d2e05abd45886dcad2b632c738cf46b9f7c19e Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Sat, 24 Mar 2018 11:44:23 -0700
Subject: bpf: Add bpf_verifier_vlog() and bpf_verifier_log_needed()

The BTF (BPF Type Format) verifier needs to reuse the current
BPF verifier log.  Hence, it requires the following changes:

(1) Expose log_write() in verifier.c for other users.
    Its name is renamed to bpf_verifier_vlog().

(2) The BTF verifier also needs to check
'log->level && log->ubuf && !bpf_verifier_log_full(log);'
independently outside of the current log_write().  It is
because the BTF verifier will do one-check before
making multiple calls to btf_verifier_vlog to log
the details of a type.

Hence, this check is also re-factored to a new function
bpf_verifier_log_needed().  Since it is re-factored,
we can check it before va_start() in the current
bpf_verifier_log_write() and verbose().

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf_verifier.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index c30668414b22..7e61c395fddf 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -166,6 +166,11 @@ static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log)
 	return log->len_used >= log->len_total - 1;
 }
 
+static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log)
+{
+	return log->level && log->ubuf && !bpf_verifier_log_full(log);
+}
+
 #define BPF_MAX_SUBPROGS 256
 
 /* single container for all structs
@@ -192,6 +197,8 @@ struct bpf_verifier_env {
 	u32 subprog_cnt;
 };
 
+void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
+		       va_list args);
 __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
 					   const char *fmt, ...);
 
-- 
cgit v1.2.3


From cf14f27f82af78e713f8a57c477cf9233faf8b30 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Wed, 28 Mar 2018 12:05:36 -0700
Subject: macro: introduce COUNT_ARGS() macro

move COUNT_ARGS() macro from apparmor to generic header and extend it
to count till twelve.

COUNT() was an alternative name for this logic, but it's used for
different purpose in many other places.

Similarly for CONCATENATE() macro.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/kernel.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 3fd291503576..293fa0677fba 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -919,6 +919,13 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
 #define swap(a, b) \
 	do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
 
+/* This counts to 12. Any more, it will return 13th argument. */
+#define __COUNT_ARGS(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _n, X...) _n
+#define COUNT_ARGS(X...) __COUNT_ARGS(, ##X, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
+
+#define __CONCAT(a, b) a ## b
+#define CONCATENATE(a, b) __CONCAT(a, b)
+
 /**
  * container_of - cast a member of a structure out to the containing structure
  * @ptr:	the pointer to the member.
-- 
cgit v1.2.3


From c4f6699dfcb8558d138fe838f741b2c10f416cf9 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@kernel.org>
Date: Wed, 28 Mar 2018 12:05:37 -0700
Subject: bpf: introduce BPF_RAW_TRACEPOINT

Introduce BPF_PROG_TYPE_RAW_TRACEPOINT bpf program type to access
kernel internal arguments of the tracepoints in their raw form.

>From bpf program point of view the access to the arguments look like:
struct bpf_raw_tracepoint_args {
       __u64 args[0];
};

int bpf_prog(struct bpf_raw_tracepoint_args *ctx)
{
  // program can read args[N] where N depends on tracepoint
  // and statically verified at program load+attach time
}

kprobe+bpf infrastructure allows programs access function arguments.
This feature allows programs access raw tracepoint arguments.

Similar to proposed 'dynamic ftrace events' there are no abi guarantees
to what the tracepoints arguments are and what their meaning is.
The program needs to type cast args properly and use bpf_probe_read()
helper to access struct fields when argument is a pointer.

For every tracepoint __bpf_trace_##call function is prepared.
In assembler it looks like:
(gdb) disassemble __bpf_trace_xdp_exception
Dump of assembler code for function __bpf_trace_xdp_exception:
   0xffffffff81132080 <+0>:     mov    %ecx,%ecx
   0xffffffff81132082 <+2>:     jmpq   0xffffffff811231f0 <bpf_trace_run3>

where

TRACE_EVENT(xdp_exception,
        TP_PROTO(const struct net_device *dev,
                 const struct bpf_prog *xdp, u32 act),

The above assembler snippet is casting 32-bit 'act' field into 'u64'
to pass into bpf_trace_run3(), while 'dev' and 'xdp' args are passed as-is.
All of ~500 of __bpf_trace_*() functions are only 5-10 byte long
and in total this approach adds 7k bytes to .text.

This approach gives the lowest possible overhead
while calling trace_xdp_exception() from kernel C code and
transitioning into bpf land.
Since tracepoint+bpf are used at speeds of 1M+ events per second
this is valuable optimization.

The new BPF_RAW_TRACEPOINT_OPEN sys_bpf command is introduced
that returns anon_inode FD of 'bpf-raw-tracepoint' object.

The user space looks like:
// load bpf prog with BPF_PROG_TYPE_RAW_TRACEPOINT type
prog_fd = bpf_prog_load(...);
// receive anon_inode fd for given bpf_raw_tracepoint with prog attached
raw_tp_fd = bpf_raw_tracepoint_open("xdp_exception", prog_fd);

Ctrl-C of tracing daemon or cmdline tool that uses this feature
will automatically detach bpf program, unload it and
unregister tracepoint probe.

On the kernel side the __bpf_raw_tp_map section of pointers to
tracepoint definition and to __bpf_trace_*() probe function is used
to find a tracepoint with "xdp_exception" name and
corresponding __bpf_trace_xdp_exception() probe function
which are passed to tracepoint_probe_register() to connect probe
with tracepoint.

Addition of bpf_raw_tracepoint doesn't interfere with ftrace and perf
tracepoint mechanisms. perf_event_open() can be used in parallel
on the same tracepoint.
Multiple bpf_raw_tracepoint_open("xdp_exception", prog_fd) are permitted.
Each with its own bpf program. The kernel will execute
all tracepoint probes and all attached bpf programs.

In the future bpf_raw_tracepoints can be extended with
query/introspection logic.

__bpf_raw_tp_map section logic was contributed by Steven Rostedt

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Acked-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf_types.h       |  1 +
 include/linux/trace_events.h    | 42 +++++++++++++++++++++++++++++++++++++++++
 include/linux/tracepoint-defs.h |  6 ++++++
 3 files changed, 49 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 5e2e8a49fb21..6d7243bfb0ff 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -19,6 +19,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_SK_MSG, sk_msg)
 BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe)
 BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint)
 BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event)
+BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT, raw_tracepoint)
 #endif
 #ifdef CONFIG_CGROUP_BPF
 BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev)
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 8a1442c4e513..b0357cd198b0 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -468,6 +468,9 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx);
 int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog);
 void perf_event_detach_bpf_prog(struct perf_event *event);
 int perf_event_query_prog_array(struct perf_event *event, void __user *info);
+int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
+int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
+struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name);
 #else
 static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
 {
@@ -487,6 +490,18 @@ perf_event_query_prog_array(struct perf_event *event, void __user *info)
 {
 	return -EOPNOTSUPP;
 }
+static inline int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *p)
+{
+	return -EOPNOTSUPP;
+}
+static inline int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *p)
+{
+	return -EOPNOTSUPP;
+}
+static inline struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name)
+{
+	return NULL;
+}
 #endif
 
 enum {
@@ -546,6 +561,33 @@ extern void ftrace_profile_free_filter(struct perf_event *event);
 void perf_trace_buf_update(void *record, u16 type);
 void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp);
 
+void bpf_trace_run1(struct bpf_prog *prog, u64 arg1);
+void bpf_trace_run2(struct bpf_prog *prog, u64 arg1, u64 arg2);
+void bpf_trace_run3(struct bpf_prog *prog, u64 arg1, u64 arg2,
+		    u64 arg3);
+void bpf_trace_run4(struct bpf_prog *prog, u64 arg1, u64 arg2,
+		    u64 arg3, u64 arg4);
+void bpf_trace_run5(struct bpf_prog *prog, u64 arg1, u64 arg2,
+		    u64 arg3, u64 arg4, u64 arg5);
+void bpf_trace_run6(struct bpf_prog *prog, u64 arg1, u64 arg2,
+		    u64 arg3, u64 arg4, u64 arg5, u64 arg6);
+void bpf_trace_run7(struct bpf_prog *prog, u64 arg1, u64 arg2,
+		    u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7);
+void bpf_trace_run8(struct bpf_prog *prog, u64 arg1, u64 arg2,
+		    u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7,
+		    u64 arg8);
+void bpf_trace_run9(struct bpf_prog *prog, u64 arg1, u64 arg2,
+		    u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7,
+		    u64 arg8, u64 arg9);
+void bpf_trace_run10(struct bpf_prog *prog, u64 arg1, u64 arg2,
+		     u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7,
+		     u64 arg8, u64 arg9, u64 arg10);
+void bpf_trace_run11(struct bpf_prog *prog, u64 arg1, u64 arg2,
+		     u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7,
+		     u64 arg8, u64 arg9, u64 arg10, u64 arg11);
+void bpf_trace_run12(struct bpf_prog *prog, u64 arg1, u64 arg2,
+		     u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7,
+		     u64 arg8, u64 arg9, u64 arg10, u64 arg11, u64 arg12);
 void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
 			       struct trace_event_call *call, u64 count,
 			       struct pt_regs *regs, struct hlist_head *head,
diff --git a/include/linux/tracepoint-defs.h b/include/linux/tracepoint-defs.h
index 64ed7064f1fa..22c5a46e9693 100644
--- a/include/linux/tracepoint-defs.h
+++ b/include/linux/tracepoint-defs.h
@@ -35,4 +35,10 @@ struct tracepoint {
 	struct tracepoint_func __rcu *funcs;
 };
 
+struct bpf_raw_event_map {
+	struct tracepoint	*tp;
+	void			*bpf_func;
+	u32			num_args;
+} __aligned(32);
+
 #endif
-- 
cgit v1.2.3


From e59ac634908f4ea90066e6db7dd7ae8ca02815ff Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 28 Mar 2018 17:48:33 -0700
Subject: bpf: add parenthesis around argument of BPF_LDST_BYTES()

BPF_LDST_BYTES() does not put it's argument in parenthesis
when referencing it.  This makes it impossible to pass pointers
obtained by address-of operator (e.g. BPF_LDST_BYTES(&insn)).
Add the parenthesis.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/filter.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 109d05ccea9a..c2f167db8bd5 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -372,7 +372,7 @@ struct xdp_rxq_info;
 
 #define BPF_LDST_BYTES(insn)					\
 	({							\
-		const int __size = bpf_size_to_bytes(BPF_SIZE(insn->code)); \
+		const int __size = bpf_size_to_bytes(BPF_SIZE((insn)->code)); \
 		WARN_ON(__size < 0);				\
 		__size;						\
 	})
-- 
cgit v1.2.3


From 8934ce2fd08171e8605f7fada91ee7619fe17ab8 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Wed, 28 Mar 2018 12:49:15 -0700
Subject: bpf: sockmap redirect ingress support

Add support for the BPF_F_INGRESS flag in sk_msg redirect helper.
To do this add a scatterlist ring for receiving socks to check
before calling into regular recvmsg call path. Additionally, because
the poll wakeup logic only checked the skb recv queue we need to
add a hook in TCP stack (similar to write side) so that we have
a way to wake up polling socks when a scatterlist is redirected
to that sock.

After this all that is needed is for the redirect helper to
push the scatterlist into the psock receive queue.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/filter.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index c2f167db8bd5..961cc5d53956 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -521,6 +521,7 @@ struct sk_msg_buff {
 	__u32 key;
 	__u32 flags;
 	struct bpf_map *map;
+	struct list_head list;
 };
 
 /* Compute the linear packet data range [data, data_end) which
-- 
cgit v1.2.3


From fa246693a111fab32bd51d20f07a347e42773ee9 Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Wed, 28 Mar 2018 12:49:25 -0700
Subject: bpf: sockmap, BPF_F_INGRESS flag for BPF_SK_SKB_STREAM_VERDICT:

Add support for the BPF_F_INGRESS flag in skb redirect helper. To
do this convert skb into a scatterlist and push into ingress queue.
This is the same logic that is used in the sk_msg redirect helper
so it should feel familiar.

Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/filter.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 961cc5d53956..897ff3d95968 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -521,6 +521,7 @@ struct sk_msg_buff {
 	__u32 key;
 	__u32 flags;
 	struct bpf_map *map;
+	struct sk_buff *skb;
 	struct list_head list;
 };
 
-- 
cgit v1.2.3


From f385178679b6561d2e717567d12e07c7f927ee59 Mon Sep 17 00:00:00 2001
From: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Date: Fri, 30 Mar 2018 09:20:59 +0900
Subject: lib/scatterlist: add sg_init_marker() helper

sg_init_marker initializes sg_magic in the sg table and calls
sg_mark_end() on the last entry of the table. This can be useful to
avoid memset in sg_init_table() when scatterlist is already zeroed out

For example: when scatterlist is embedded inside other struct and that
container struct is zeroed out

Suggested-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/scatterlist.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index 22b2131bcdcd..aa5d4eb725f5 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -248,6 +248,24 @@ static inline void *sg_virt(struct scatterlist *sg)
 	return page_address(sg_page(sg)) + sg->offset;
 }
 
+/**
+ * sg_init_marker - Initialize markers in sg table
+ * @sgl:	   The SG table
+ * @nents:	   Number of entries in table
+ *
+ **/
+static inline void sg_init_marker(struct scatterlist *sgl,
+				  unsigned int nents)
+{
+#ifdef CONFIG_DEBUG_SG
+	unsigned int i;
+
+	for (i = 0; i < nents; i++)
+		sgl[i].sg_magic = SG_MAGIC;
+#endif
+	sg_mark_end(&sgl[nents - 1]);
+}
+
 int sg_nents(struct scatterlist *sg);
 int sg_nents_for_len(struct scatterlist *sg, u64 len);
 struct scatterlist *sg_next(struct scatterlist *);
-- 
cgit v1.2.3


From 5e43f899b03a3492ce5fc44e8900becb04dae9c0 Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Fri, 30 Mar 2018 15:08:00 -0700
Subject: bpf: Check attach type at prog load time

== The problem ==

There are use-cases when a program of some type can be attached to
multiple attach points and those attach points must have different
permissions to access context or to call helpers.

E.g. context structure may have fields for both IPv4 and IPv6 but it
doesn't make sense to read from / write to IPv6 field when attach point
is somewhere in IPv4 stack.

Same applies to BPF-helpers: it may make sense to call some helper from
some attach point, but not from other for same prog type.

== The solution ==

Introduce `expected_attach_type` field in in `struct bpf_attr` for
`BPF_PROG_LOAD` command. If scenario described in "The problem" section
is the case for some prog type, the field will be checked twice:

1) At load time prog type is checked to see if attach type for it must
   be known to validate program permissions correctly. Prog will be
   rejected with EINVAL if it's the case and `expected_attach_type` is
   not specified or has invalid value.

2) At attach time `attach_type` is compared with `expected_attach_type`,
   if prog type requires to have one, and, if they differ, attach will
   be rejected with EINVAL.

The `expected_attach_type` is now available as part of `struct bpf_prog`
in both `bpf_verifier_ops->is_valid_access()` and
`bpf_verifier_ops->get_func_proto()` () and can be used to check context
accesses and calls to helpers correspondingly.

Initially the idea was discussed by Alexei Starovoitov <ast@fb.com> and
Daniel Borkmann <daniel@iogearbox.net> here:
https://marc.info/?l=linux-netdev&m=152107378717201&w=2

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h    | 5 ++++-
 include/linux/filter.h | 1 +
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 819229c80eca..95a7abd0ee92 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -208,12 +208,15 @@ struct bpf_prog_ops {
 
 struct bpf_verifier_ops {
 	/* return eBPF function prototype for verification */
-	const struct bpf_func_proto *(*get_func_proto)(enum bpf_func_id func_id);
+	const struct bpf_func_proto *
+	(*get_func_proto)(enum bpf_func_id func_id,
+			  const struct bpf_prog *prog);
 
 	/* return true if 'size' wide access at offset 'off' within bpf_context
 	 * with 'type' (read or write) is allowed
 	 */
 	bool (*is_valid_access)(int off, int size, enum bpf_access_type type,
+				const struct bpf_prog *prog,
 				struct bpf_insn_access_aux *info);
 	int (*gen_prologue)(struct bpf_insn *insn, bool direct_write,
 			    const struct bpf_prog *prog);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 897ff3d95968..13c044e4832d 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -469,6 +469,7 @@ struct bpf_prog {
 				is_func:1,	/* program is a bpf function */
 				kprobe_override:1; /* Do we override a kprobe? */
 	enum bpf_prog_type	type;		/* Type of BPF program */
+	enum bpf_attach_type	expected_attach_type; /* For some prog types */
 	u32			len;		/* Number of filter blocks */
 	u32			jited_len;	/* Size of jited insns in bytes */
 	u8			tag[BPF_TAG_SIZE];
-- 
cgit v1.2.3


From 4fbac77d2d092b475dda9eea66da674369665427 Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Fri, 30 Mar 2018 15:08:02 -0700
Subject: bpf: Hooks for sys_bind

== The problem ==

There is a use-case when all processes inside a cgroup should use one
single IP address on a host that has multiple IP configured.  Those
processes should use the IP for both ingress and egress, for TCP and UDP
traffic. So TCP/UDP servers should be bound to that IP to accept
incoming connections on it, and TCP/UDP clients should make outgoing
connections from that IP. It should not require changing application
code since it's often not possible.

Currently it's solved by intercepting glibc wrappers around syscalls
such as `bind(2)` and `connect(2)`. It's done by a shared library that
is preloaded for every process in a cgroup so that whenever TCP/UDP
server calls `bind(2)`, the library replaces IP in sockaddr before
passing arguments to syscall. When application calls `connect(2)` the
library transparently binds the local end of connection to that IP
(`bind(2)` with `IP_BIND_ADDRESS_NO_PORT` to avoid performance penalty).

Shared library approach is fragile though, e.g.:
* some applications clear env vars (incl. `LD_PRELOAD`);
* `/etc/ld.so.preload` doesn't help since some applications are linked
  with option `-z nodefaultlib`;
* other applications don't use glibc and there is nothing to intercept.

== The solution ==

The patch provides much more reliable in-kernel solution for the 1st
part of the problem: binding TCP/UDP servers on desired IP. It does not
depend on application environment and implementation details (whether
glibc is used or not).

It adds new eBPF program type `BPF_PROG_TYPE_CGROUP_SOCK_ADDR` and
attach types `BPF_CGROUP_INET4_BIND` and `BPF_CGROUP_INET6_BIND`
(similar to already existing `BPF_CGROUP_INET_SOCK_CREATE`).

The new program type is intended to be used with sockets (`struct sock`)
in a cgroup and provided by user `struct sockaddr`. Pointers to both of
them are parts of the context passed to programs of newly added types.

The new attach types provides hooks in `bind(2)` system call for both
IPv4 and IPv6 so that one can write a program to override IP addresses
and ports user program tries to bind to and apply such a program for
whole cgroup.

== Implementation notes ==

[1]
Separate attach types for `AF_INET` and `AF_INET6` are added
intentionally to prevent reading/writing to offsets that don't make
sense for corresponding socket family. E.g. if user passes `sockaddr_in`
it doesn't make sense to read from / write to `user_ip6[]` context
fields.

[2]
The write access to `struct bpf_sock_addr_kern` is implemented using
special field as an additional "register".

There are just two registers in `sock_addr_convert_ctx_access`: `src`
with value to write and `dst` with pointer to context that can't be
changed not to break later instructions. But the fields, allowed to
write to, are not available directly and to access them address of
corresponding pointer has to be loaded first. To get additional register
the 1st not used by `src` and `dst` one is taken, its content is saved
to `bpf_sock_addr_kern.tmp_reg`, then the register is used to load
address of pointer field, and finally the register's content is restored
from the temporary field after writing `src` value.

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf-cgroup.h | 21 +++++++++++++++++++++
 include/linux/bpf_types.h  |  1 +
 include/linux/filter.h     | 10 ++++++++++
 3 files changed, 32 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 8a4566691c8f..67dc4a6471ad 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -6,6 +6,7 @@
 #include <uapi/linux/bpf.h>
 
 struct sock;
+struct sockaddr;
 struct cgroup;
 struct sk_buff;
 struct bpf_sock_ops_kern;
@@ -63,6 +64,10 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
 int __cgroup_bpf_run_filter_sk(struct sock *sk,
 			       enum bpf_attach_type type);
 
+int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
+				      struct sockaddr *uaddr,
+				      enum bpf_attach_type type);
+
 int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 				     struct bpf_sock_ops_kern *sock_ops,
 				     enum bpf_attach_type type);
@@ -103,6 +108,20 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
 	__ret;								       \
 })
 
+#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, type)				       \
+({									       \
+	int __ret = 0;							       \
+	if (cgroup_bpf_enabled)						       \
+		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type);    \
+	__ret;								       \
+})
+
+#define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr)			       \
+	BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_BIND)
+
+#define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr)			       \
+	BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_BIND)
+
 #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops)				       \
 ({									       \
 	int __ret = 0;							       \
@@ -135,6 +154,8 @@ static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; }
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; })
 
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 6d7243bfb0ff..2b28fcf6f6ae 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -8,6 +8,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_ACT, tc_cls_act)
 BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp)
 BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SKB, cg_skb)
 BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock)
+BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, cg_sock_addr)
 BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_inout)
 BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout)
 BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit)
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 13c044e4832d..fc4e8f91b03d 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1021,6 +1021,16 @@ static inline int bpf_tell_extensions(void)
 	return SKF_AD_MAX;
 }
 
+struct bpf_sock_addr_kern {
+	struct sock *sk;
+	struct sockaddr *uaddr;
+	/* Temporary "register" to make indirect stores to nested structures
+	 * defined above. We need three registers to make such a store, but
+	 * only two (src and dst) are available at convert_ctx_access time
+	 */
+	u64 tmp_reg;
+};
+
 struct bpf_sock_ops_kern {
 	struct	sock *sk;
 	u32	op;
-- 
cgit v1.2.3


From d74bad4e74ee373787a9ae24197c17b7cdc428d5 Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Fri, 30 Mar 2018 15:08:05 -0700
Subject: bpf: Hooks for sys_connect

== The problem ==

See description of the problem in the initial patch of this patch set.

== The solution ==

The patch provides much more reliable in-kernel solution for the 2nd
part of the problem: making outgoing connecttion from desired IP.

It adds new attach types `BPF_CGROUP_INET4_CONNECT` and
`BPF_CGROUP_INET6_CONNECT` for program type
`BPF_PROG_TYPE_CGROUP_SOCK_ADDR` that can be used to override both
source and destination of a connection at connect(2) time.

Local end of connection can be bound to desired IP using newly
introduced BPF-helper `bpf_bind()`. It allows to bind to only IP though,
and doesn't support binding to port, i.e. leverages
`IP_BIND_ADDRESS_NO_PORT` socket option. There are two reasons for this:
* looking for a free port is expensive and can affect performance
  significantly;
* there is no use-case for port.

As for remote end (`struct sockaddr *` passed by user), both parts of it
can be overridden, remote IP and remote port. It's useful if an
application inside cgroup wants to connect to another application inside
same cgroup or to itself, but knows nothing about IP assigned to the
cgroup.

Support is added for IPv4 and IPv6, for TCP and UDP.

IPv4 and IPv6 have separate attach types for same reason as sys_bind
hooks, i.e. to prevent reading from / writing to e.g. user_ip6 fields
when user passes sockaddr_in since it'd be out-of-bound.

== Implementation notes ==

The patch introduces new field in `struct proto`: `pre_connect` that is
a pointer to a function with same signature as `connect` but is called
before it. The reason is in some cases BPF hooks should be called way
before control is passed to `sk->sk_prot->connect`. Specifically
`inet_dgram_connect` autobinds socket before calling
`sk->sk_prot->connect` and there is no way to call `bpf_bind()` from
hooks from e.g. `ip4_datagram_connect` or `ip6_datagram_connect` since
it'd cause double-bind. On the other hand `proto.pre_connect` provides a
flexible way to add BPF hooks for connect only for necessary `proto` and
call them at desired time before `connect`. Since `bpf_bind()` is
allowed to bind only to IP and autobind in `inet_dgram_connect` binds
only port there is no chance of double-bind.

bpf_bind() sets `force_bind_address_no_port` to bind to only IP despite
of value of `bind_address_no_port` socket field.

bpf_bind() sets `with_lock` to `false` when calling to __inet_bind()
and __inet6_bind() since all call-sites, where bpf_bind() is called,
already hold socket lock.

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf-cgroup.h | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 67dc4a6471ad..c6ab295e6dcb 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -116,12 +116,38 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
 	__ret;								       \
 })
 
+#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type)			       \
+({									       \
+	int __ret = 0;							       \
+	if (cgroup_bpf_enabled)	{					       \
+		lock_sock(sk);						       \
+		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type);    \
+		release_sock(sk);					       \
+	}								       \
+	__ret;								       \
+})
+
 #define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr)			       \
 	BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_BIND)
 
 #define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr)			       \
 	BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_BIND)
 
+#define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (cgroup_bpf_enabled && \
+					    sk->sk_prot->pre_connect)
+
+#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr)			       \
+	BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_CONNECT)
+
+#define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr)			       \
+	BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_CONNECT)
+
+#define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr)		       \
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_CONNECT)
+
+#define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr)		       \
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_CONNECT)
+
 #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops)				       \
 ({									       \
 	int __ret = 0;							       \
@@ -151,11 +177,16 @@ struct cgroup_bpf {};
 static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
 static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; }
 
+#define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0)
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; })
 
-- 
cgit v1.2.3


From aac3fc320d9404f2665a8b1249dc3170d5fa3caf Mon Sep 17 00:00:00 2001
From: Andrey Ignatov <rdna@fb.com>
Date: Fri, 30 Mar 2018 15:08:07 -0700
Subject: bpf: Post-hooks for sys_bind

"Post-hooks" are hooks that are called right before returning from
sys_bind. At this time IP and port are already allocated and no further
changes to `struct sock` can happen before returning from sys_bind but
BPF program has a chance to inspect the socket and change sys_bind
result.

Specifically it can e.g. inspect what port was allocated and if it
doesn't satisfy some policy, BPF program can force sys_bind to fail and
return EPERM to user.

Another example of usage is recording the IP:port pair to some map to
use it in later calls to sys_connect. E.g. if some TCP server inside
cgroup was bound to some IP:port_n, it can be recorded to a map. And
later when some TCP client inside same cgroup is trying to connect to
127.0.0.1:port_n, BPF hook for sys_connect can override the destination
and connect application to IP:port_n instead of 127.0.0.1:port_n. That
helps forcing all applications inside a cgroup to use desired IP and not
break those applications if they e.g. use localhost to communicate
between each other.

== Implementation details ==

Post-hooks are implemented as two new attach types
`BPF_CGROUP_INET4_POST_BIND` and `BPF_CGROUP_INET6_POST_BIND` for
existing prog type `BPF_PROG_TYPE_CGROUP_SOCK`.

Separate attach types for IPv4 and IPv6 are introduced to avoid access
to IPv6 field in `struct sock` from `inet_bind()` and to IPv4 field from
`inet6_bind()` since those fields might not make sense in such cases.

Signed-off-by: Andrey Ignatov <rdna@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf-cgroup.h | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index c6ab295e6dcb..30d15e64b993 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -98,16 +98,24 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
 	__ret;								       \
 })
 
-#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk)				       \
+#define BPF_CGROUP_RUN_SK_PROG(sk, type)				       \
 ({									       \
 	int __ret = 0;							       \
 	if (cgroup_bpf_enabled) {					       \
-		__ret = __cgroup_bpf_run_filter_sk(sk,			       \
-						 BPF_CGROUP_INET_SOCK_CREATE); \
+		__ret = __cgroup_bpf_run_filter_sk(sk, type);		       \
 	}								       \
 	__ret;								       \
 })
 
+#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk)				       \
+	BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET_SOCK_CREATE)
+
+#define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk)				       \
+	BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET4_POST_BIND)
+
+#define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk)				       \
+	BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET6_POST_BIND)
+
 #define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, type)				       \
 ({									       \
 	int __ret = 0;							       \
@@ -183,6 +191,8 @@ static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; }
 #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) ({ 0; })
-- 
cgit v1.2.3