From 1b2b234b1318afb3775d4c6624fd5a96558f19df Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 10 Dec 2018 15:43:00 -0800 Subject: bpf: pass struct btf pointer to the map_check_btf() callback If key_type or value_type are of non-trivial data types (e.g. structure or typedef), it's not possible to check them without the additional information, which can't be obtained without a pointer to the btf structure. So, let's pass btf pointer to the map_check_btf() callbacks. Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0c992b86eb2c..e734f163bd0b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -23,6 +23,7 @@ struct bpf_prog; struct bpf_map; struct sock; struct seq_file; +struct btf; struct btf_type; /* map is generic key/value storage optionally accesible by eBPF programs */ @@ -52,6 +53,7 @@ struct bpf_map_ops { void (*map_seq_show_elem)(struct bpf_map *map, void *key, struct seq_file *m); int (*map_check_btf)(const struct bpf_map *map, + const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type); }; @@ -126,6 +128,7 @@ static inline bool bpf_map_support_seq_show(const struct bpf_map *map) } int map_check_no_btf(const struct bpf_map *map, + const struct btf *btf, const struct btf_type *key_type, const struct btf_type *value_type); -- cgit v1.2.3 From 9a1126b63190e2541dd5d643f4bfeb5a7f493729 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 10 Dec 2018 15:43:01 -0800 Subject: bpf: add bpffs pretty print for cgroup local storage maps Implement bpffs pretty printing for cgroup local storage maps (both shared and per-cpu). Output example (captured for tools/testing/selftests/bpf/netcnt_prog.c): Shared: $ cat /sys/fs/bpf/map_2 # WARNING!! The output is for debug purpose only # WARNING!! The output format will change {4294968594,1}: {9999,1039896} Per-cpu: $ cat /sys/fs/bpf/map_1 # WARNING!! The output is for debug purpose only # WARNING!! The output format will change {4294968594,1}: { cpu0: {0,0,0,0,0} cpu1: {0,0,0,0,0} cpu2: {1,104,0,0,0} cpu3: {0,0,0,0,0} } Signed-off-by: Roman Gushchin Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/btf.h b/include/linux/btf.h index b98405a56383..a4cf075b89eb 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -47,6 +47,7 @@ void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, int btf_get_fd_by_id(u32 id); u32 btf_id(const struct btf *btf); bool btf_name_offset_valid(const struct btf *btf, u32 offset); +bool btf_type_is_reg_int(const struct btf_type *t, u32 expected_size); #ifdef CONFIG_BPF_SYSCALL const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); -- cgit v1.2.3 From c872bdb38febb4c31ece3599c52cf1f833b89f4e Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 12 Dec 2018 09:37:46 -0800 Subject: bpf: include sub program tags in bpf_prog_info Changes v2 -> v3: 1. remove check for bpf_dump_raw_ok(). Changes v1 -> v2: 1. Fix error path as Martin suggested. This patch adds nr_prog_tags and prog_tags to bpf_prog_info. This is a reliable way for user space to get tags of all sub programs. Before this patch, user space need to find sub program tags via kallsyms. This feature will be used in BPF introspection, where user space queries information about BPF programs via sys_bpf. Signed-off-by: Song Liu Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index aa582cd5bfcf..e7d57e89f25f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2717,6 +2717,8 @@ struct bpf_prog_info { __u32 nr_jited_line_info; __u32 line_info_rec_size; __u32 jited_line_info_rec_size; + __u32 nr_prog_tags; + __aligned_u64 prog_tags; } __attribute__((aligned(8))); struct bpf_map_info { -- cgit v1.2.3 From 23127b33ec80e656921362d7dc82a0064bac20a2 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 13 Dec 2018 10:41:46 -0800 Subject: bpf: Create a new btf_name_by_offset() for non type name use case The current btf_name_by_offset() is returning "(anon)" type name for the offset == 0 case and "(invalid-name-offset)" for the out-of-bound offset case. It fits well for the internal BTF verbose log purpose which is focusing on type. For example, offset == 0 => "(anon)" => anonymous type/name. Returning non-NULL for the bad offset case is needed during the BTF verification process because the BTF verifier may complain about another field first before discovering the name_off is invalid. However, it may not be ideal for the newer use case which does not necessary mean type name. For example, when logging line_info in the BPF verifier in the next patch, it is better to log an empty src line instead of logging "(anon)". The existing bpf_name_by_offset() is renamed to __bpf_name_by_offset() and static to btf.c. A new bpf_name_by_offset() is added for generic context usage. It returns "\0" for name_off == 0 (note that btf->strings[0] is "\0") and NULL for invalid offset. It allows the caller to decide what is the best output in its context. The new btf_name_by_offset() is overlapped with btf_name_offset_valid(). Hence, btf_name_offset_valid() is removed from btf.h to keep the btf.h API minimal. The existing btf_name_offset_valid() usage in btf.c could also be replaced later. Signed-off-by: Martin KaFai Lau Acked-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/btf.h b/include/linux/btf.h index a4cf075b89eb..58000d7e06e3 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -46,7 +46,6 @@ void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, struct seq_file *m); int btf_get_fd_by_id(u32 id); u32 btf_id(const struct btf *btf); -bool btf_name_offset_valid(const struct btf *btf, u32 offset); bool btf_type_is_reg_int(const struct btf_type *t, u32 expected_size); #ifdef CONFIG_BPF_SYSCALL -- cgit v1.2.3 From d9762e84ede3eae9636f5dbbe0c8f0390d37e114 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 13 Dec 2018 10:41:48 -0800 Subject: bpf: verbose log bpf_line_info in verifier This patch adds bpf_line_info during the verifier's verbose. It can give error context for debug purpose. ~~~~~~~~~~ Here is the verbose log for backedge: while (a) { a += bpf_get_smp_processor_id(); bpf_trace_printk(fmt, sizeof(fmt), a); } ~> bpftool prog load ./test_loop.o /sys/fs/bpf/test_loop type tracepoint 13: while (a) { 3: a += bpf_get_smp_processor_id(); back-edge from insn 13 to 3 ~~~~~~~~~~ Here is the verbose log for invalid pkt access: Modification to test_xdp_noinline.c: data = (void *)(long)xdp->data; data_end = (void *)(long)xdp->data_end; /* if (data + 4 > data_end) return XDP_DROP; */ *(u32 *)data = dst->dst; ~> bpftool prog load ./test_xdp_noinline.o /sys/fs/bpf/test_xdp_noinline type xdp ; data = (void *)(long)xdp->data; 224: (79) r2 = *(u64 *)(r10 -112) 225: (61) r2 = *(u32 *)(r2 +0) ; *(u32 *)data = dst->dst; 226: (63) *(u32 *)(r2 +0) = r1 invalid access to packet, off=0 size=4, R2(id=0,off=0,r=0) R2 offset is outside of the packet Signed-off-by: Martin KaFai Lau Acked-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c736945be7c5..548dcbdb7111 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -224,6 +224,7 @@ struct bpf_verifier_env { bool allow_ptr_leaks; bool seen_direct_write; struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ + const struct bpf_line_info *prev_linfo; struct bpf_verifier_log log; struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS + 1]; u32 subprog_cnt; -- cgit v1.2.3 From 9242b5f5615c823bfc1e9aea284617ff25a55f10 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 13 Dec 2018 11:42:34 -0800 Subject: bpf: add self-check logic to liveness analysis Introduce REG_LIVE_DONE to check the liveness propagation and prepare the states for merging. See algorithm description in clean_live_states(). Signed-off-by: Alexei Starovoitov Acked-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 548dcbdb7111..c233efc106c6 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -38,6 +38,7 @@ enum bpf_reg_liveness { REG_LIVE_NONE = 0, /* reg hasn't been read or written this branch */ REG_LIVE_READ, /* reg was read, so we're sensitive to initial value */ REG_LIVE_WRITTEN, /* reg was written first, screening off later reads */ + REG_LIVE_DONE = 4, /* liveness won't be updating this register anymore */ }; struct bpf_reg_state { -- cgit v1.2.3 From 6c4fc209fcf9d27efbaa48368773e4d2bfbd59aa Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 16 Dec 2018 00:49:47 +0100 Subject: bpf: remove useless version check for prog load Existing libraries and tracing frameworks work around this kernel version check by automatically deriving the kernel version from uname(3) or similar such that the user does not need to do it manually; these workarounds also make the version check useless at the same time. Moreover, most other BPF tracing types enabling bpf_probe_read()-like functionality have /not/ adapted this check, and in general these days it is well understood anyway that all the tracing programs are not stable with regards to future kernels as kernel internal data structures are subject to change from release to release. Back at last netconf we discussed [0] and agreed to remove this check from bpf_prog_load() and instead document it here in the uapi header that there is no such guarantee for stable API for these programs. [0] http://vger.kernel.org/netconf2018_files/DanielBorkmann_netconf2018.pdf Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: Quentin Monnet Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e7d57e89f25f..1d324c2cbca2 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -133,6 +133,14 @@ enum bpf_map_type { BPF_MAP_TYPE_STACK, }; +/* Note that tracing related programs such as + * BPF_PROG_TYPE_{KPROBE,TRACEPOINT,PERF_EVENT,RAW_TRACEPOINT} + * are not subject to a stable API since kernel internal data + * structures can change from release to release and may + * therefore break existing tracing BPF programs. Tracing BPF + * programs correspond to /a/ specific kernel which is to be + * analyzed, and not /a/ specific kernel /and/ all future ones. + */ enum bpf_prog_type { BPF_PROG_TYPE_UNSPEC, BPF_PROG_TYPE_SOCKET_FILTER, @@ -343,7 +351,7 @@ union bpf_attr { __u32 log_level; /* verbosity level of verifier */ __u32 log_size; /* size of user buffer */ __aligned_u64 log_buf; /* user supplied buffer */ - __u32 kern_version; /* checked when prog_type=kprobe */ + __u32 kern_version; /* not used */ __u32 prog_flags; char prog_name[BPF_OBJ_NAME_LEN]; __u32 prog_ifindex; /* ifindex of netdev to prep for */ -- cgit v1.2.3 From 9d5f9f701b1891466fb3dbb1806ad97716f95cc3 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 15 Dec 2018 22:13:51 -0800 Subject: bpf: btf: fix struct/union/fwd types with kind_flag This patch fixed two issues with BTF. One is related to struct/union bitfield encoding and the other is related to forward type. Issue #1 and solution: ====================== Current btf encoding of bitfield follows what pahole generates. For each bitfield, pahole will duplicate the type chain and put the bitfield size at the final int or enum type. Since the BTF enum type cannot encode bit size, pahole workarounds the issue by generating an int type whenever the enum bit size is not 32. For example, -bash-4.4$ cat t.c typedef int ___int; enum A { A1, A2, A3 }; struct t { int a[5]; ___int b:4; volatile enum A c:4; } g; -bash-4.4$ gcc -c -O2 -g t.c The current kernel supports the following BTF encoding: $ pahole -JV t.o [1] TYPEDEF ___int type_id=2 [2] INT int size=4 bit_offset=0 nr_bits=32 encoding=SIGNED [3] ENUM A size=4 vlen=3 A1 val=0 A2 val=1 A3 val=2 [4] STRUCT t size=24 vlen=3 a type_id=5 bits_offset=0 b type_id=9 bits_offset=160 c type_id=11 bits_offset=164 [5] ARRAY (anon) type_id=2 index_type_id=2 nr_elems=5 [6] INT sizetype size=8 bit_offset=0 nr_bits=64 encoding=(none) [7] VOLATILE (anon) type_id=3 [8] INT int size=1 bit_offset=0 nr_bits=4 encoding=(none) [9] TYPEDEF ___int type_id=8 [10] INT (anon) size=1 bit_offset=0 nr_bits=4 encoding=SIGNED [11] VOLATILE (anon) type_id=10 Two issues are in the above: . by changing enum type to int, we lost the original type information and this will not be ideal later when we try to convert BTF to a header file. . the type duplication for bitfields will cause BTF bloat. Duplicated types cannot be deduplicated later if the bitfield size is different. To fix this issue, this patch implemented a compatible change for BTF struct type encoding: . the bit 31 of struct_type->info, previously reserved, now is used to indicate whether bitfield_size is encoded in btf_member or not. . if bit 31 of struct_type->info is set, btf_member->offset will encode like: bit 0 - 23: bit offset bit 24 - 31: bitfield size if bit 31 is not set, the old behavior is preserved: bit 0 - 31: bit offset So if the struct contains a bit field, the maximum bit offset will be reduced to (2^24 - 1) instead of MAX_UINT. The maximum bitfield size will be 256 which is enough for today as maximum bitfield in compiler can be 128 where int128 type is supported. This kernel patch intends to support the new BTF encoding: $ pahole -JV t.o [1] TYPEDEF ___int type_id=2 [2] INT int size=4 bit_offset=0 nr_bits=32 encoding=SIGNED [3] ENUM A size=4 vlen=3 A1 val=0 A2 val=1 A3 val=2 [4] STRUCT t kind_flag=1 size=24 vlen=3 a type_id=5 bitfield_size=0 bits_offset=0 b type_id=1 bitfield_size=4 bits_offset=160 c type_id=7 bitfield_size=4 bits_offset=164 [5] ARRAY (anon) type_id=2 index_type_id=2 nr_elems=5 [6] INT sizetype size=8 bit_offset=0 nr_bits=64 encoding=(none) [7] VOLATILE (anon) type_id=3 Issue #2 and solution: ====================== Current forward type in BTF does not specify whether the original type is struct or union. This will not work for type pretty print and BTF-to-header-file conversion as struct/union must be specified. $ cat tt.c struct t; union u; int foo(struct t *t, union u *u) { return 0; } $ gcc -c -g -O2 tt.c $ pahole -JV tt.o [1] INT int size=4 bit_offset=0 nr_bits=32 encoding=SIGNED [2] FWD t type_id=0 [3] PTR (anon) type_id=2 [4] FWD u type_id=0 [5] PTR (anon) type_id=4 To fix this issue, similar to issue #1, type->info bit 31 is used. If the bit is set, it is union type. Otherwise, it is a struct type. $ pahole -JV tt.o [1] INT int size=4 bit_offset=0 nr_bits=32 encoding=SIGNED [2] FWD t kind_flag=0 type_id=0 [3] PTR (anon) kind_flag=0 type_id=2 [4] FWD u kind_flag=1 type_id=0 [5] PTR (anon) kind_flag=0 type_id=4 Pahole/LLVM change: =================== The new kind_flag functionality has been implemented in pahole and llvm: https://github.com/yonghong-song/pahole/tree/bitfield https://github.com/yonghong-song/llvm/tree/bitfield Note that pahole hasn't implemented func/func_proto kind and .BTF.ext. So to print function signature with bpftool, the llvm compiler should be used. Fixes: 69b693f0aefa ("bpf: btf: Introduce BPF Type Format (BTF)") Acked-by: Martin KaFai Lau Signed-off-by: Martin KaFai Lau Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- include/uapi/linux/btf.h | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index 14f66948fc95..7b7475ef2f17 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -34,7 +34,9 @@ struct btf_type { * bits 0-15: vlen (e.g. # of struct's members) * bits 16-23: unused * bits 24-27: kind (e.g. int, ptr, array...etc) - * bits 28-31: unused + * bits 28-30: unused + * bit 31: kind_flag, currently used by + * struct, union and fwd */ __u32 info; /* "size" is used by INT, ENUM, STRUCT and UNION. @@ -52,6 +54,7 @@ struct btf_type { #define BTF_INFO_KIND(info) (((info) >> 24) & 0x0f) #define BTF_INFO_VLEN(info) ((info) & 0xffff) +#define BTF_INFO_KFLAG(info) ((info) >> 31) #define BTF_KIND_UNKN 0 /* Unknown */ #define BTF_KIND_INT 1 /* Integer */ @@ -110,9 +113,22 @@ struct btf_array { struct btf_member { __u32 name_off; __u32 type; - __u32 offset; /* offset in bits */ + /* If the type info kind_flag is set, the btf_member offset + * contains both member bitfield size and bit offset. The + * bitfield size is set for bitfield members. If the type + * info kind_flag is not set, the offset contains only bit + * offset. + */ + __u32 offset; }; +/* If the struct/union type info kind_flag is set, the + * following two macros are used to access bitfield_size + * and bit_offset from btf_member.offset. + */ +#define BTF_MEMBER_BITFIELD_SIZE(val) ((val) >> 24) +#define BTF_MEMBER_BIT_OFFSET(val) ((val) & 0xffffff) + /* BTF_KIND_FUNC_PROTO is followed by multiple "struct btf_param". * The exact number of btf_param is stored in the vlen (of the * info in "struct btf_type"). -- cgit v1.2.3 From ffa0c1cf59596fba54546ea828305acfcc2cf55e Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 15 Dec 2018 22:13:52 -0800 Subject: bpf: enable cgroup local storage map pretty print with kind_flag Commit 970289fc0a83 ("bpf: add bpffs pretty print for cgroup local storage maps") added bpffs pretty print for cgroup local storage maps. The commit worked for struct without kind_flag set. This patch refactored and made pretty print also work with kind_flag set for the struct. Acked-by: Martin KaFai Lau Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- include/linux/btf.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/btf.h b/include/linux/btf.h index 58000d7e06e3..12502e25e767 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -7,6 +7,7 @@ #include struct btf; +struct btf_member; struct btf_type; union bpf_attr; @@ -46,7 +47,9 @@ void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, struct seq_file *m); int btf_get_fd_by_id(u32 id); u32 btf_id(const struct btf *btf); -bool btf_type_is_reg_int(const struct btf_type *t, u32 expected_size); +bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, + const struct btf_member *m, + u32 expected_offset, u32 expected_size); #ifdef CONFIG_BPF_SYSCALL const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); -- cgit v1.2.3 From a38d1107f937ca95dcf820161ef44ea683d6a0b1 Mon Sep 17 00:00:00 2001 From: Matt Mullins Date: Wed, 12 Dec 2018 16:42:37 -0800 Subject: bpf: support raw tracepoints in modules Distributions build drivers as modules, including network and filesystem drivers which export numerous tracepoints. This enables bpf(BPF_RAW_TRACEPOINT_OPEN) to attach to those tracepoints. Signed-off-by: Matt Mullins Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/linux/module.h | 4 ++++ include/linux/trace_events.h | 8 ++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/module.h b/include/linux/module.h index fce6b4335e36..5f147dd5e709 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -432,6 +432,10 @@ struct module { unsigned int num_tracepoints; tracepoint_ptr_t *tracepoints_ptrs; #endif +#ifdef CONFIG_BPF_EVENTS + unsigned int num_bpf_raw_events; + struct bpf_raw_event_map *bpf_raw_events; +#endif #ifdef HAVE_JUMP_LABEL struct jump_entry *jump_entries; unsigned int num_jump_entries; diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 4130a5497d40..8a62731673f7 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -471,7 +471,8 @@ void perf_event_detach_bpf_prog(struct perf_event *event); int perf_event_query_prog_array(struct perf_event *event, void __user *info); int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog); int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog); -struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name); +struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name); +void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp); int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, u32 *fd_type, const char **buf, u64 *probe_offset, u64 *probe_addr); @@ -502,10 +503,13 @@ static inline int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf { return -EOPNOTSUPP; } -static inline struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name) +static inline struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name) { return NULL; } +static inline void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp) +{ +} static inline int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, u32 *fd_type, const char **buf, u64 *probe_offset, -- cgit v1.2.3 From 3bdbd0228e7555ec745e08469b98e5a0966409d6 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 16 Dec 2018 15:47:04 -0800 Subject: bpf: sockmap, metadata support for reporting size of msg This adds metadata to sk_msg_md for BPF programs to read the sk_msg size. When the SK_MSG program is running under an application that is using sendfile the data is not copied into sk_msg buffers by default. Rather the BPF program uses sk_msg_pull_data to read the bytes in. This avoids doing the costly memcopy instructions when they are not in fact needed. However, if we don't know the size of the sk_msg we have to guess if needed bytes are available by doing a pull request which may fail. By including the size of the sk_msg BPF programs can check the size before issuing sk_msg_pull_data requests. Additionally, the same applies for sendmsg calls when the application provides multiple iovs. Here the BPF program needs to pull in data to update data pointers but its not clear where the data ends without a size parameter. In many cases "guessing" is not easy to do and results in multiple calls to pull and without bounded loops everything gets fairly tricky. Clean this up by including a u32 size field. Note, all writes into sk_msg_md are rejected already from sk_msg_is_valid_access so nothing additional is needed there. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/linux/skmsg.h | 3 +++ include/uapi/linux/bpf.h | 1 + 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 2a11e9d91dfa..eb8f6cb84c10 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -36,6 +36,9 @@ struct sk_msg_sg { struct scatterlist data[MAX_MSG_FRAGS + 1]; }; +/* UAPI in filter.c depends on struct sk_msg_sg being first element. If + * this is moved filter.c also must be updated. + */ struct sk_msg { struct sk_msg_sg sg; void *data; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1d324c2cbca2..91c43884f295 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2665,6 +2665,7 @@ struct sk_msg_md { __u32 local_ip6[4]; /* Stored in network byte order */ __u32 remote_port; /* Stored in network byte order */ __u32 local_port; /* stored in host byte order */ + __u32 size; /* Total size of sk_msg */ }; struct sk_reuseport_md { -- cgit v1.2.3 From 7a69c0f250568e6ab72f401b2c69aa0e666c94f2 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 20 Dec 2018 11:35:31 -0800 Subject: bpf: skmsg, replace comments with BUILD bug Enforce comment on structure layout dependency with a BUILD_BUG_ON to ensure the condition is maintained. Suggested-by: Daniel Borkmann Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/linux/skmsg.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index eb8f6cb84c10..dd57e6f408b1 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -36,9 +36,7 @@ struct sk_msg_sg { struct scatterlist data[MAX_MSG_FRAGS + 1]; }; -/* UAPI in filter.c depends on struct sk_msg_sg being first element. If - * this is moved filter.c also must be updated. - */ +/* UAPI in filter.c depends on struct sk_msg_sg being first element. */ struct sk_msg { struct sk_msg_sg sg; void *data; -- cgit v1.2.3 From 552de91068828daef50a227a665068cf8dde835e Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 20 Dec 2018 11:35:33 -0800 Subject: bpf: sk_msg, fix socket data_ready events When a skb verdict program is in-use and either another BPF program redirects to that socket or the new SK_PASS support is used the data_ready callback does not wake up application. Instead because the stream parser/verdict is using the sk data_ready callback we wake up the stream parser/verdict block. Fix this by adding a helper to check if the stream parser block is enabled on the sk and if so call the saved pointer which is the upper layers wake up function. This fixes application stalls observed when an application is waiting for data in a blocking read(). Fixes: d829e9c4112b ("tls: convert to generic sk_msg interface") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/linux/skmsg.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index dd57e6f408b1..178a3933a71b 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -417,6 +417,14 @@ static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock) sk_psock_drop(sk, psock); } +static inline void sk_psock_data_ready(struct sock *sk, struct sk_psock *psock) +{ + if (psock->parser.enabled) + psock->parser.saved_data_ready(sk); + else + sk->sk_data_ready(sk); +} + static inline void psock_set_prog(struct bpf_prog **pprog, struct bpf_prog *prog) { -- cgit v1.2.3 From 0608c69c9a805c6264689d7eab4203eab88cf1da Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 20 Dec 2018 11:35:35 -0800 Subject: bpf: sk_msg, sock{map|hash} redirect through ULP A sockmap program that redirects through a kTLS ULP enabled socket will not work correctly because the ULP layer is skipped. This fixes the behavior to call through the ULP layer on redirect to ensure any operations required on the data stream at the ULP layer continue to be applied. To do this we add an internal flag MSG_SENDPAGE_NOPOLICY to avoid calling the BPF layer on a redirected message. This is required to avoid calling the BPF layer multiple times (possibly recursively) which is not the current/expected behavior without ULPs. In the future we may add a redirect flag if users _do_ want the policy applied again but this would need to work for both ULP and non-ULP sockets and be opt-in to avoid breaking existing programs. Also to avoid polluting the flag space with an internal flag we reuse the flag space overlapping MSG_SENDPAGE_NOPOLICY with MSG_WAITFORONE. Here WAITFORONE is specific to recv path and SENDPAGE_NOPOLICY is only used for sendpage hooks. The last thing to verify is user space API is masked correctly to ensure the flag can not be set by user. (Note this needs to be true regardless because we have internal flags already in-use that user space should not be able to set). But for completeness we have two UAPI paths into sendpage, sendfile and splice. In the sendfile case the function do_sendfile() zero's flags, ./fs/read_write.c: static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count, loff_t max) { ... fl = 0; #if 0 /* * We need to debate whether we can enable this or not. The * man page documents EAGAIN return for the output at least, * and the application is arguably buggy if it doesn't expect * EAGAIN on a non-blocking file descriptor. */ if (in.file->f_flags & O_NONBLOCK) fl = SPLICE_F_NONBLOCK; #endif file_start_write(out.file); retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl); } In the splice case the pipe_to_sendpage "actor" is used which masks flags with SPLICE_F_MORE. ./fs/splice.c: static int pipe_to_sendpage(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct splice_desc *sd) { ... more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0; ... } Confirming what we expect that internal flags are in fact internal to socket side. Fixes: d3b18ad31f93 ("tls: add bpf support to sk_msg handling") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/linux/socket.h | 1 + include/net/tls.h | 9 +++++++++ 2 files changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/socket.h b/include/linux/socket.h index 8b571e9b9f76..84c48a3c0227 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -286,6 +286,7 @@ struct ucred { #define MSG_NOSIGNAL 0x4000 /* Do not generate SIGPIPE */ #define MSG_MORE 0x8000 /* Sender will send more */ #define MSG_WAITFORONE 0x10000 /* recvmmsg(): block until 1+ packets avail */ +#define MSG_SENDPAGE_NOPOLICY 0x10000 /* sendpage() internal : do no apply policy */ #define MSG_SENDPAGE_NOTLAST 0x20000 /* sendpage() internal : not the last page */ #define MSG_BATCH 0x40000 /* sendmmsg(): more messages coming */ #define MSG_EOF MSG_FIN diff --git a/include/net/tls.h b/include/net/tls.h index bab5627ff5e3..23601f3b02ee 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -454,6 +454,15 @@ tls_offload_ctx_tx(const struct tls_context *tls_ctx) return (struct tls_offload_context_tx *)tls_ctx->priv_ctx_tx; } +static inline bool tls_sw_has_ctx_tx(const struct sock *sk) +{ + struct tls_context *ctx = tls_get_ctx(sk); + + if (!ctx) + return false; + return !!tls_sw_ctx_tx(ctx); +} + static inline struct tls_offload_context_rx * tls_offload_ctx_rx(const struct tls_context *tls_ctx) { -- cgit v1.2.3