From f8bbbfc3b97f4c7a6c7c23185e520b22bfc3a21d Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Fri, 28 Mar 2014 18:58:18 +0100
Subject: net: filter: add jited flag to indicate jit compiled filters

This patch adds a jited flag into sk_filter struct in order to indicate
whether a filter is currently jited or not. The size of sk_filter is
not being expanded as the 32 bit 'len' member allows upper bits to be
reused since a filter can currently only grow as large as BPF_MAXINSNS.

Therefore, there's enough room also for other in future needed flags to
reuse 'len' field if necessary. The jited flag also allows for having
alternative interpreter functions running as currently, we can only
detect jit compiled filters by testing fp->bpf_func to not equal the
address of sk_run_filter().

Joint work with Alexei Starovoitov.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Cc: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/filter.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index e568c8ef896b..e65e23087367 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -25,7 +25,8 @@ struct sock;
 struct sk_filter
 {
 	atomic_t		refcnt;
-	unsigned int         	len;	/* Number of filter blocks */
+	u32			jited:1,	/* Is our filter JIT'ed? */
+				len:31;		/* Number of filter blocks */
 	struct rcu_head		rcu;
 	unsigned int		(*bpf_func)(const struct sk_buff *skb,
 					    const struct sock_filter *filter);
-- 
cgit v1.2.3


From a3ea269b8bcdbb0c5fa2fd449a436e7987446975 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Fri, 28 Mar 2014 18:58:19 +0100
Subject: net: filter: keep original BPF program around

In order to open up the possibility to internally transform a BPF program
into an alternative and possibly non-trivial reversible representation, we
need to keep the original BPF program around, so that it can be passed back
to user space w/o the need of a complex decoder.

The reason for that use case resides in commit a8fc92778080 ("sk-filter:
Add ability to get socket filter program (v2)"), that is, the ability
to retrieve the currently attached BPF filter from a given socket used
mainly by the checkpoint-restore project, for example.

Therefore, we add two helpers sk_{store,release}_orig_filter for taking
care of that. In the sk_unattached_filter_create() case, there's no such
possibility/requirement to retrieve a loaded BPF program. Therefore, we
can spare us the work in that case.

This approach will simplify and slightly speed up both, sk_get_filter()
and sock_diag_put_filterinfo() handlers as we won't need to successively
decode filters anymore through sk_decode_filter(). As we still need
sk_decode_filter() later on, we're keeping it around.

Joint work with Alexei Starovoitov.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/filter.h | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index e65e23087367..93a9792e27bc 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -19,14 +19,19 @@ struct compat_sock_fprog {
 };
 #endif
 
+struct sock_fprog_kern {
+	u16			len;
+	struct sock_filter	*filter;
+};
+
 struct sk_buff;
 struct sock;
 
-struct sk_filter
-{
+struct sk_filter {
 	atomic_t		refcnt;
 	u32			jited:1,	/* Is our filter JIT'ed? */
 				len:31;		/* Number of filter blocks */
+	struct sock_fprog_kern	*orig_prog;	/* Original BPF program */
 	struct rcu_head		rcu;
 	unsigned int		(*bpf_func)(const struct sk_buff *skb,
 					    const struct sock_filter *filter);
@@ -42,14 +47,20 @@ static inline unsigned int sk_filter_size(unsigned int proglen)
 		   offsetof(struct sk_filter, insns[proglen]));
 }
 
+#define sk_filter_proglen(fprog)			\
+		(fprog->len * sizeof(fprog->filter[0]))
+
 extern int sk_filter(struct sock *sk, struct sk_buff *skb);
 extern unsigned int sk_run_filter(const struct sk_buff *skb,
 				  const struct sock_filter *filter);
+
 extern int sk_unattached_filter_create(struct sk_filter **pfp,
 				       struct sock_fprog *fprog);
 extern void sk_unattached_filter_destroy(struct sk_filter *fp);
+
 extern int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk);
 extern int sk_detach_filter(struct sock *sk);
+
 extern int sk_chk_filter(struct sock_filter *filter, unsigned int flen);
 extern int sk_get_filter(struct sock *sk, struct sock_filter __user *filter, unsigned len);
 extern void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to);
-- 
cgit v1.2.3


From fbc907f0b1386c02e00516aa78a0fa6b0454fd0b Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Fri, 28 Mar 2014 18:58:20 +0100
Subject: net: filter: move filter accounting to filter core

This patch basically does two things, i) removes the extern keyword
from the include/linux/filter.h file to be more consistent with the
rest of Joe's changes, and ii) moves filter accounting into the filter
core framework.

Filter accounting mainly done through sk_filter_{un,}charge() take
care of the case when sockets are being cloned through sk_clone_lock()
so that removal of the filter on one socket won't result in eviction
as it's still referenced by the other.

These functions actually belong to net/core/filter.c and not
include/net/sock.h as we want to keep all that in a central place.
It's also not in fast-path so uninlining them is fine and even allows
us to get rd of sk_filter_release_rcu()'s EXPORT_SYMBOL and a forward
declaration.

Joint work with Alexei Starovoitov.

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/filter.h | 30 +++++++++++++++++-------------
 include/net/sock.h     | 27 ---------------------------
 2 files changed, 17 insertions(+), 40 deletions(-)

(limited to 'include')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 93a9792e27bc..9bde3ed19fe6 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -50,28 +50,32 @@ static inline unsigned int sk_filter_size(unsigned int proglen)
 #define sk_filter_proglen(fprog)			\
 		(fprog->len * sizeof(fprog->filter[0]))
 
-extern int sk_filter(struct sock *sk, struct sk_buff *skb);
-extern unsigned int sk_run_filter(const struct sk_buff *skb,
-				  const struct sock_filter *filter);
+int sk_filter(struct sock *sk, struct sk_buff *skb);
+unsigned int sk_run_filter(const struct sk_buff *skb,
+			   const struct sock_filter *filter);
 
-extern int sk_unattached_filter_create(struct sk_filter **pfp,
-				       struct sock_fprog *fprog);
-extern void sk_unattached_filter_destroy(struct sk_filter *fp);
+int sk_unattached_filter_create(struct sk_filter **pfp,
+				struct sock_fprog *fprog);
+void sk_unattached_filter_destroy(struct sk_filter *fp);
 
-extern int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk);
-extern int sk_detach_filter(struct sock *sk);
+int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk);
+int sk_detach_filter(struct sock *sk);
 
-extern int sk_chk_filter(struct sock_filter *filter, unsigned int flen);
-extern int sk_get_filter(struct sock *sk, struct sock_filter __user *filter, unsigned len);
-extern void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to);
+int sk_chk_filter(struct sock_filter *filter, unsigned int flen);
+int sk_get_filter(struct sock *sk, struct sock_filter __user *filter,
+		  unsigned int len);
+void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to);
+
+void sk_filter_charge(struct sock *sk, struct sk_filter *fp);
+void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp);
 
 #ifdef CONFIG_BPF_JIT
 #include <stdarg.h>
 #include <linux/linkage.h>
 #include <linux/printk.h>
 
-extern void bpf_jit_compile(struct sk_filter *fp);
-extern void bpf_jit_free(struct sk_filter *fp);
+void bpf_jit_compile(struct sk_filter *fp);
+void bpf_jit_free(struct sk_filter *fp);
 
 static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen,
 				u32 pass, void *image)
diff --git a/include/net/sock.h b/include/net/sock.h
index 8d7c431a0660..06a5668f05c9 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1621,33 +1621,6 @@ void sk_common_release(struct sock *sk);
 /* Initialise core socket variables */
 void sock_init_data(struct socket *sock, struct sock *sk);
 
-void sk_filter_release_rcu(struct rcu_head *rcu);
-
-/**
- *	sk_filter_release - release a socket filter
- *	@fp: filter to remove
- *
- *	Remove a filter from a socket and release its resources.
- */
-
-static inline void sk_filter_release(struct sk_filter *fp)
-{
-	if (atomic_dec_and_test(&fp->refcnt))
-		call_rcu(&fp->rcu, sk_filter_release_rcu);
-}
-
-static inline void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp)
-{
-	atomic_sub(sk_filter_size(fp->len), &sk->sk_omem_alloc);
-	sk_filter_release(fp);
-}
-
-static inline void sk_filter_charge(struct sock *sk, struct sk_filter *fp)
-{
-	atomic_inc(&fp->refcnt);
-	atomic_add(sk_filter_size(fp->len), &sk->sk_omem_alloc);
-}
-
 /*
  * Socket reference counting postulates.
  *
-- 
cgit v1.2.3


From e62d2df084e2849edffb206559725fa81bb569a8 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Fri, 28 Mar 2014 18:58:21 +0100
Subject: net: ptp: use sk_unattached_filter_create() for BPF

This patch migrates an open-coded sk_run_filter() implementation with
proper use of the BPF API, that is, sk_unattached_filter_create(). This
migration is needed, as we will be internally transforming the filter
to a different representation, and therefore needs to be decoupled.

It is okay to do so as skb_timestamping_init() is called during
initialization of the network stack in core initcall via sock_init().
This would effectively also allow for PTP filters to be jit compiled if
bpf_jit_enable is set.

For better readability, there are also some newlines introduced, also
ptp_classify.h is only in kernel space.

Joint work with Alexei Starovoitov.

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Cc: Richard Cochran <richard.cochran@omicron.at>
Cc: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ptp_classify.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/ptp_classify.h b/include/linux/ptp_classify.h
index 1dc420ba213a..3decfa4d3732 100644
--- a/include/linux/ptp_classify.h
+++ b/include/linux/ptp_classify.h
@@ -27,11 +27,7 @@
 #include <linux/if_vlan.h>
 #include <linux/ip.h>
 #include <linux/filter.h>
-#ifdef __KERNEL__
 #include <linux/in.h>
-#else
-#include <netinet/in.h>
-#endif
 
 #define PTP_CLASS_NONE  0x00 /* not a PTP event message */
 #define PTP_CLASS_V1    0x01 /* protocol version 1 */
-- 
cgit v1.2.3


From 164d8c6665213c931645578310256da7b1259331 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Fri, 28 Mar 2014 18:58:22 +0100
Subject: net: ptp: do not reimplement PTP/BPF classifier

There are currently pch_gbe, cpts, and ixp4xx_eth drivers that open-code
and reimplement a BPF classifier for the PTP protocol. Since all of them
effectively do the very same thing and load the very same PTP/BPF filter,
we can just consolidate that code by introducing ptp_classify_raw() in
the time-stamping core framework which can be used in drivers.

As drivers get initialized after bootstrapping the core networking
subsystem, they can make use of ptp_insns wrapped through
ptp_classify_raw(), which allows to simplify and remove PTP classifier
setup code in drivers.

Joint work with Alexei Starovoitov.

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Cc: Richard Cochran <richard.cochran@omicron.at>
Cc: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ptp_classify.h | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/ptp_classify.h b/include/linux/ptp_classify.h
index 3decfa4d3732..6d3b0a2ef9ce 100644
--- a/include/linux/ptp_classify.h
+++ b/include/linux/ptp_classify.h
@@ -80,14 +80,6 @@
 #define OP_RETA	(BPF_RET | BPF_A)
 #define OP_RETK	(BPF_RET | BPF_K)
 
-static inline int ptp_filter_init(struct sock_filter *f, int len)
-{
-	if (OP_LDH == f[0].code)
-		return sk_chk_filter(f, len);
-	else
-		return 0;
-}
-
 #define PTP_FILTER \
 	{OP_LDH,	0,   0, OFF_ETYPE		}, /*              */ \
 	{OP_JEQ,	0,  12, ETH_P_IP		}, /* f goto L20   */ \
@@ -133,4 +125,6 @@ static inline int ptp_filter_init(struct sock_filter *f, int len)
 	{OP_RETA,	0,   0, 0			}, /*              */ \
 /*L6x*/	{OP_RETK,	0,   0, PTP_CLASS_NONE		},
 
+unsigned int ptp_classify_raw(const struct sk_buff *skb);
+
 #endif
-- 
cgit v1.2.3


From 77e0114ae9ae08685c503772a57af21d299c6701 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <dborkman@redhat.com>
Date: Fri, 28 Mar 2014 18:58:24 +0100
Subject: net: isdn: use sk_unattached_filter api

Similarly as in ppp, we need to migrate the ISDN/PPP code to make use
of the sk_unattached_filter api in order to decouple having direct
filter structure access. By using sk_unattached_filter_{create,destroy},
we can allow for the possibility to jit compile filters for faster
filter verdicts as well.

Joint work with Alexei Starovoitov.

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Cc: Karsten Keil <isdn@linux-pingi.de>
Cc: isdn4linux@listserv.isdn4linux.de
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/isdn_ppp.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/isdn_ppp.h b/include/linux/isdn_ppp.h
index d5f62bc5f4be..8e10f57f109f 100644
--- a/include/linux/isdn_ppp.h
+++ b/include/linux/isdn_ppp.h
@@ -180,9 +180,8 @@ struct ippp_struct {
   struct slcompress *slcomp;
 #endif
 #ifdef CONFIG_IPPP_FILTER
-  struct sock_filter *pass_filter;	/* filter for packets to pass */
-  struct sock_filter *active_filter;	/* filter for pkts to reset idle */
-  unsigned pass_len, active_len;
+  struct sk_filter *pass_filter;   /* filter for packets to pass */
+  struct sk_filter *active_filter; /* filter for pkts to reset idle */
 #endif
   unsigned long debug;
   struct isdn_ppp_compressor *compressor,*decompressor;
-- 
cgit v1.2.3


From bd4cf0ed331a275e9bf5a49e6d0fd55dffc551b8 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Fri, 28 Mar 2014 18:58:25 +0100
Subject: net: filter: rework/optimize internal BPF interpreter's instruction
 set

This patch replaces/reworks the kernel-internal BPF interpreter with
an optimized BPF instruction set format that is modelled closer to
mimic native instruction sets and is designed to be JITed with one to
one mapping. Thus, the new interpreter is noticeably faster than the
current implementation of sk_run_filter(); mainly for two reasons:

1. Fall-through jumps:

  BPF jump instructions are forced to go either 'true' or 'false'
  branch which causes branch-miss penalty. The new BPF jump
  instructions have only one branch and fall-through otherwise,
  which fits the CPU branch predictor logic better. `perf stat`
  shows drastic difference for branch-misses between the old and
  new code.

2. Jump-threaded implementation of interpreter vs switch
   statement:

  Instead of single table-jump at the top of 'switch' statement,
  gcc will now generate multiple table-jump instructions, which
  helps CPU branch predictor logic.

Note that the verification of filters is still being done through
sk_chk_filter() in classical BPF format, so filters from user- or
kernel space are verified in the same way as we do now, and same
restrictions/constraints hold as well.

We reuse current BPF JIT compilers in a way that this upgrade would
even be fine as is, but nevertheless allows for a successive upgrade
of BPF JIT compilers to the new format.

The internal instruction set migration is being done after the
probing for JIT compilation, so in case JIT compilers are able to
create a native opcode image, we're going to use that, and in all
other cases we're doing a follow-up migration of the BPF program's
instruction set, so that it can be transparently run in the new
interpreter.

In short, the *internal* format extends BPF in the following way (more
details can be taken from the appended documentation):

  - Number of registers increase from 2 to 10
  - Register width increases from 32-bit to 64-bit
  - Conditional jt/jf targets replaced with jt/fall-through
  - Adds signed > and >= insns
  - 16 4-byte stack slots for register spill-fill replaced
    with up to 512 bytes of multi-use stack space
  - Introduction of bpf_call insn and register passing convention
    for zero overhead calls from/to other kernel functions
  - Adds arithmetic right shift and endianness conversion insns
  - Adds atomic_add insn
  - Old tax/txa insns are replaced with 'mov dst,src' insn

Performance of two BPF filters generated by libpcap resp. bpf_asm
was measured on x86_64, i386 and arm32 (other libpcap programs
have similar performance differences):

fprog #1 is taken from Documentation/networking/filter.txt:
tcpdump -i eth0 port 22 -dd

fprog #2 is taken from 'man tcpdump':
tcpdump -i eth0 'tcp port 22 and (((ip[2:2] - ((ip[0]&0xf)<<2)) -
   ((tcp[12]&0xf0)>>2)) != 0)' -dd

Raw performance data from BPF micro-benchmark: SK_RUN_FILTER on the
same SKB (cache-hit) or 10k SKBs (cache-miss); time in ns per call,
smaller is better:

--x86_64--
         fprog #1  fprog #1   fprog #2  fprog #2
         cache-hit cache-miss cache-hit cache-miss
old BPF      90       101        192       202
new BPF      31        71         47        97
old BPF jit  12        34         17        44
new BPF jit TBD

--i386--
         fprog #1  fprog #1   fprog #2  fprog #2
         cache-hit cache-miss cache-hit cache-miss
old BPF     107       136        227       252
new BPF      40       119         69       172

--arm32--
         fprog #1  fprog #1   fprog #2  fprog #2
         cache-hit cache-miss cache-hit cache-miss
old BPF     202       300        475       540
new BPF     180       270        330       470
old BPF jit  26       182         37       202
new BPF jit TBD

Thus, without changing any userland BPF filters, applications on
top of AF_PACKET (or other families) such as libpcap/tcpdump, cls_bpf
classifier, netfilter's xt_bpf, team driver's load-balancing mode,
and many more will have better interpreter filtering performance.

While we are replacing the internal BPF interpreter, we also need
to convert seccomp BPF in the same step to make use of the new
internal structure since it makes use of lower-level API details
without being further decoupled through higher-level calls like
sk_unattached_filter_{create,destroy}(), for example.

Just as for normal socket filtering, also seccomp BPF experiences
a time-to-verdict speedup:

05-sim-long_jumps.c of libseccomp was used as micro-benchmark:

  seccomp_rule_add_exact(ctx,...
  seccomp_rule_add_exact(ctx,...

  rc = seccomp_load(ctx);

  for (i = 0; i < 10000000; i++)
     syscall(199, 100);

'short filter' has 2 rules
'large filter' has 200 rules

'short filter' performance is slightly better on x86_64/i386/arm32
'large filter' is much faster on x86_64 and i386 and shows no
               difference on arm32

--x86_64-- short filter
old BPF: 2.7 sec
 39.12%  bench  libc-2.15.so       [.] syscall
  8.10%  bench  [kernel.kallsyms]  [k] sk_run_filter
  6.31%  bench  [kernel.kallsyms]  [k] system_call
  5.59%  bench  [kernel.kallsyms]  [k] trace_hardirqs_on_caller
  4.37%  bench  [kernel.kallsyms]  [k] trace_hardirqs_off_caller
  3.70%  bench  [kernel.kallsyms]  [k] __secure_computing
  3.67%  bench  [kernel.kallsyms]  [k] lock_is_held
  3.03%  bench  [kernel.kallsyms]  [k] seccomp_bpf_load
new BPF: 2.58 sec
 42.05%  bench  libc-2.15.so       [.] syscall
  6.91%  bench  [kernel.kallsyms]  [k] system_call
  6.25%  bench  [kernel.kallsyms]  [k] trace_hardirqs_on_caller
  6.07%  bench  [kernel.kallsyms]  [k] __secure_computing
  5.08%  bench  [kernel.kallsyms]  [k] sk_run_filter_int_seccomp

--arm32-- short filter
old BPF: 4.0 sec
 39.92%  bench  [kernel.kallsyms]  [k] vector_swi
 16.60%  bench  [kernel.kallsyms]  [k] sk_run_filter
 14.66%  bench  libc-2.17.so       [.] syscall
  5.42%  bench  [kernel.kallsyms]  [k] seccomp_bpf_load
  5.10%  bench  [kernel.kallsyms]  [k] __secure_computing
new BPF: 3.7 sec
 35.93%  bench  [kernel.kallsyms]  [k] vector_swi
 21.89%  bench  libc-2.17.so       [.] syscall
 13.45%  bench  [kernel.kallsyms]  [k] sk_run_filter_int_seccomp
  6.25%  bench  [kernel.kallsyms]  [k] __secure_computing
  3.96%  bench  [kernel.kallsyms]  [k] syscall_trace_exit

--x86_64-- large filter
old BPF: 8.6 seconds
    73.38%    bench  [kernel.kallsyms]  [k] sk_run_filter
    10.70%    bench  libc-2.15.so       [.] syscall
     5.09%    bench  [kernel.kallsyms]  [k] seccomp_bpf_load
     1.97%    bench  [kernel.kallsyms]  [k] system_call
new BPF: 5.7 seconds
    66.20%    bench  [kernel.kallsyms]  [k] sk_run_filter_int_seccomp
    16.75%    bench  libc-2.15.so       [.] syscall
     3.31%    bench  [kernel.kallsyms]  [k] system_call
     2.88%    bench  [kernel.kallsyms]  [k] __secure_computing

--i386-- large filter
old BPF: 5.4 sec
new BPF: 3.8 sec

--arm32-- large filter
old BPF: 13.5 sec
 73.88%  bench  [kernel.kallsyms]  [k] sk_run_filter
 10.29%  bench  [kernel.kallsyms]  [k] vector_swi
  6.46%  bench  libc-2.17.so       [.] syscall
  2.94%  bench  [kernel.kallsyms]  [k] seccomp_bpf_load
  1.19%  bench  [kernel.kallsyms]  [k] __secure_computing
  0.87%  bench  [kernel.kallsyms]  [k] sys_getuid
new BPF: 13.5 sec
 76.08%  bench  [kernel.kallsyms]  [k] sk_run_filter_int_seccomp
 10.98%  bench  [kernel.kallsyms]  [k] vector_swi
  5.87%  bench  libc-2.17.so       [.] syscall
  1.77%  bench  [kernel.kallsyms]  [k] __secure_computing
  0.93%  bench  [kernel.kallsyms]  [k] sys_getuid

BPF filters generated by seccomp are very branchy, so the new
internal BPF performance is better than the old one. Performance
gains will be even higher when BPF JIT is committed for the
new structure, which is planned in future work (as successive
JIT migrations).

BPF has also been stress-tested with trinity's BPF fuzzer.

Joint work with Daniel Borkmann.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Cc: Hagen Paul Pfeifer <hagen@jauu.net>
Cc: Kees Cook <keescook@chromium.org>
Cc: Paul Moore <pmoore@redhat.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: H. Peter Anvin <hpa@linux.intel.com>
Cc: linux-kernel@vger.kernel.org
Acked-by: Kees Cook <keescook@chromium.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/filter.h  | 74 ++++++++++++++++++++++++++++++++++++++++++-------
 include/linux/seccomp.h |  1 -
 2 files changed, 64 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 9bde3ed19fe6..262dcbb75ffe 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -9,13 +9,58 @@
 #include <linux/workqueue.h>
 #include <uapi/linux/filter.h>
 
-#ifdef CONFIG_COMPAT
-/*
- * A struct sock_filter is architecture independent.
+/* Internally used and optimized filter representation with extended
+ * instruction set based on top of classic BPF.
  */
+
+/* instruction classes */
+#define BPF_ALU64	0x07	/* alu mode in double word width */
+
+/* ld/ldx fields */
+#define BPF_DW		0x18	/* double word */
+#define BPF_XADD	0xc0	/* exclusive add */
+
+/* alu/jmp fields */
+#define BPF_MOV		0xb0	/* mov reg to reg */
+#define BPF_ARSH	0xc0	/* sign extending arithmetic shift right */
+
+/* change endianness of a register */
+#define BPF_END		0xd0	/* flags for endianness conversion: */
+#define BPF_TO_LE	0x00	/* convert to little-endian */
+#define BPF_TO_BE	0x08	/* convert to big-endian */
+#define BPF_FROM_LE	BPF_TO_LE
+#define BPF_FROM_BE	BPF_TO_BE
+
+#define BPF_JNE		0x50	/* jump != */
+#define BPF_JSGT	0x60	/* SGT is signed '>', GT in x86 */
+#define BPF_JSGE	0x70	/* SGE is signed '>=', GE in x86 */
+#define BPF_CALL	0x80	/* function call */
+#define BPF_EXIT	0x90	/* function return */
+
+/* BPF has 10 general purpose 64-bit registers and stack frame. */
+#define MAX_BPF_REG	11
+
+/* BPF program can access up to 512 bytes of stack space. */
+#define MAX_BPF_STACK	512
+
+/* Arg1, context and stack frame pointer register positions. */
+#define ARG1_REG	1
+#define CTX_REG		6
+#define FP_REG		10
+
+struct sock_filter_int {
+	__u8	code;		/* opcode */
+	__u8	a_reg:4;	/* dest register */
+	__u8	x_reg:4;	/* source register */
+	__s16	off;		/* signed offset */
+	__s32	imm;		/* signed immediate constant */
+};
+
+#ifdef CONFIG_COMPAT
+/* A struct sock_filter is architecture independent. */
 struct compat_sock_fprog {
 	u16		len;
-	compat_uptr_t	filter;		/* struct sock_filter * */
+	compat_uptr_t	filter;	/* struct sock_filter * */
 };
 #endif
 
@@ -26,6 +71,7 @@ struct sock_fprog_kern {
 
 struct sk_buff;
 struct sock;
+struct seccomp_data;
 
 struct sk_filter {
 	atomic_t		refcnt;
@@ -34,9 +80,10 @@ struct sk_filter {
 	struct sock_fprog_kern	*orig_prog;	/* Original BPF program */
 	struct rcu_head		rcu;
 	unsigned int		(*bpf_func)(const struct sk_buff *skb,
-					    const struct sock_filter *filter);
+					    const struct sock_filter_int *filter);
 	union {
-		struct sock_filter     	insns[0];
+		struct sock_filter	insns[0];
+		struct sock_filter_int	insnsi[0];
 		struct work_struct	work;
 	};
 };
@@ -50,9 +97,18 @@ static inline unsigned int sk_filter_size(unsigned int proglen)
 #define sk_filter_proglen(fprog)			\
 		(fprog->len * sizeof(fprog->filter[0]))
 
+#define SK_RUN_FILTER(filter, ctx)			\
+		(*filter->bpf_func)(ctx, filter->insnsi)
+
 int sk_filter(struct sock *sk, struct sk_buff *skb);
-unsigned int sk_run_filter(const struct sk_buff *skb,
-			   const struct sock_filter *filter);
+
+u32 sk_run_filter_int_seccomp(const struct seccomp_data *ctx,
+			      const struct sock_filter_int *insni);
+u32 sk_run_filter_int_skb(const struct sk_buff *ctx,
+			  const struct sock_filter_int *insni);
+
+int sk_convert_filter(struct sock_filter *prog, int len,
+		      struct sock_filter_int *new_prog, int *new_len);
 
 int sk_unattached_filter_create(struct sk_filter **pfp,
 				struct sock_fprog *fprog);
@@ -86,7 +142,6 @@ static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen,
 		print_hex_dump(KERN_ERR, "JIT code: ", DUMP_PREFIX_OFFSET,
 			       16, 1, image, proglen, false);
 }
-#define SK_RUN_FILTER(FILTER, SKB) (*FILTER->bpf_func)(SKB, FILTER->insns)
 #else
 #include <linux/slab.h>
 static inline void bpf_jit_compile(struct sk_filter *fp)
@@ -96,7 +151,6 @@ static inline void bpf_jit_free(struct sk_filter *fp)
 {
 	kfree(fp);
 }
-#define SK_RUN_FILTER(FILTER, SKB) sk_run_filter(SKB, FILTER->insns)
 #endif
 
 static inline int bpf_tell_extensions(void)
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 6f19cfd1840e..4054b0994071 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -76,7 +76,6 @@ static inline int seccomp_mode(struct seccomp *s)
 #ifdef CONFIG_SECCOMP_FILTER
 extern void put_seccomp_filter(struct task_struct *tsk);
 extern void get_seccomp_filter(struct task_struct *tsk);
-extern u32 seccomp_bpf_load(int off);
 #else  /* CONFIG_SECCOMP_FILTER */
 static inline void put_seccomp_filter(struct task_struct *tsk)
 {
-- 
cgit v1.2.3