From 303def35f64e37bcd5401d202889f5fbc0241179 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 17 May 2018 14:16:58 -0700 Subject: bpf: allow sk_msg programs to read sock fields Currently sk_msg programs only have access to the raw data. However, it is often useful when building policies to have the policies specific to the socket endpoint. This allows using the socket tuple as input into filters, etc. This patch adds ctx access to the sock fields. Signed-off-by: John Fastabend Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/linux/filter.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 9dbcb9d55921..d358d1815c16 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -517,6 +517,7 @@ struct sk_msg_buff { bool sg_copy[MAX_SKB_FRAGS]; __u32 flags; struct sock *sk_redir; + struct sock *sk; struct sk_buff *skb; struct list_head list; }; -- cgit v1.2.3 From dcab51f19b291d5ee23724c51b0a3a597c16451a Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 22 May 2018 15:03:31 -0700 Subject: bpf: Expose check_uarg_tail_zero() This patch exposes check_uarg_tail_zero() which will be reused by a later BTF patch. Its name is changed to bpf_check_uarg_tail_zero(). Signed-off-by: Martin KaFai Lau Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ed0122b45b63..f6fe3c719ca8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -463,6 +463,8 @@ int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value); int bpf_get_file_flag(int flags); +int bpf_check_uarg_tail_zero(void __user *uaddr, size_t expected_size, + size_t actual_size); /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and * forced to use 'long' read/writes to try to atomically copy long counters. -- cgit v1.2.3 From 9b2cf328b2eccf761537a06bef914d2a0700fba7 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 22 May 2018 14:57:21 -0700 Subject: bpf: btf: Rename btf_key_id and btf_value_id in bpf_map_info In "struct bpf_map_info", the name "btf_id", "btf_key_id" and "btf_value_id" could cause confusion because the "id" of "btf_id" means the BPF obj id given to the BTF object while "btf_key_id" and "btf_value_id" means the BTF type id within that BTF object. To make it clear, btf_key_id and btf_value_id are renamed to btf_key_type_id and btf_value_type_id. Suggested-by: Daniel Borkmann Signed-off-by: Martin KaFai Lau Acked-by: Yonghong Song Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f6fe3c719ca8..1795eeee846c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -69,8 +69,8 @@ struct bpf_map { u32 pages; u32 id; int numa_node; - u32 btf_key_id; - u32 btf_value_id; + u32 btf_key_type_id; + u32 btf_value_type_id; struct btf *btf; bool unpriv_array; /* 55 bytes hole */ -- cgit v1.2.3 From cd3092c7f8db8c320ea7f1aa7c1adeac2450f43a Mon Sep 17 00:00:00 2001 From: Mathieu Xhonneux Date: Sun, 20 May 2018 14:58:15 +0100 Subject: bpf: Split lwt inout verifier structures The new bpf_lwt_push_encap helper should only be accessible within the LWT BPF IN hook, and not the OUT one, as this may lead to a skb under panic. At the moment, both LWT BPF IN and OUT share the same list of helpers, whose calls are authorized by the verifier. This patch separates the verifier ops for the IN and OUT hooks, and allows the IN hook to call the bpf_lwt_push_encap helper. This patch is also the occasion to put all lwt_*_func_proto functions together for clarity. At the moment, socks_op_func_proto is in the middle of lwt_inout_func_proto and lwt_xmit_func_proto. Signed-off-by: Mathieu Xhonneux Acked-by: David Lebrun Signed-off-by: Daniel Borkmann --- include/linux/bpf_types.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index b67f8793de0d..aa5c8b878474 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -9,8 +9,8 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp) BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SKB, cg_skb) BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock) BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, cg_sock_addr) -BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_inout) -BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout) +BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_in) +BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_out) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit) BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops) BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb) -- cgit v1.2.3 From 004d4b274e2a1a895a0e5dc66158b90a7d463d44 Mon Sep 17 00:00:00 2001 From: Mathieu Xhonneux Date: Sun, 20 May 2018 14:58:16 +0100 Subject: ipv6: sr: Add seg6local action End.BPF This patch adds the End.BPF action to the LWT seg6local infrastructure. This action works like any other seg6local End action, meaning that an IPv6 header with SRH is needed, whose DA has to be equal to the SID of the action. It will also advance the SRH to the next segment, the BPF program does not have to take care of this. Since the BPF program may not be a source of instability in the kernel, it is important to ensure that the integrity of the packet is maintained before yielding it back to the IPv6 layer. The hook hence keeps track if the SRH has been altered through the helpers, and re-validates its content if needed with seg6_validate_srh. The state kept for validation is stored in a per-CPU buffer. The BPF program is not allowed to directly write into the packet, and only some fields of the SRH can be altered through the helper bpf_lwt_seg6_store_bytes. Performances profiling has shown that the SRH re-validation does not induce a significant overhead. If the altered SRH is deemed as invalid, the packet is dropped. This validation is also done before executing any action through bpf_lwt_seg6_action, and will not be performed again if the SRH is not modified after calling the action. The BPF program may return 3 types of return codes: - BPF_OK: the End.BPF action will look up the next destination through seg6_lookup_nexthop. - BPF_REDIRECT: if an action has been executed through the bpf_lwt_seg6_action helper, the BPF program should return this value, as the skb's destination is already set and the default lookup should not be performed. - BPF_DROP : the packet will be dropped. Signed-off-by: Mathieu Xhonneux Acked-by: David Lebrun Signed-off-by: Daniel Borkmann --- include/linux/bpf_types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index aa5c8b878474..b161e506dcfc 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -12,6 +12,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, cg_sock_addr) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_in) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_out) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit) +BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_SEG6LOCAL, lwt_seg6local) BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops) BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb) BPF_PROG_TYPE(BPF_PROG_TYPE_SK_MSG, sk_msg) -- cgit v1.2.3 From f8d959a5b188dc81e57a6bac34a1b2986f61e2fd Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 24 May 2018 11:21:08 -0700 Subject: perf/core: add perf_get_event() to return perf_event given a struct file A new extern function, perf_get_event(), is added to return a perf event given a struct file. This function will be used in later patches. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- include/linux/perf_event.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index e71e99eb9a4e..eec302b61cfe 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -868,6 +868,7 @@ extern void perf_event_exit_task(struct task_struct *child); extern void perf_event_free_task(struct task_struct *task); extern void perf_event_delayed_put(struct task_struct *task); extern struct file *perf_event_get(unsigned int fd); +extern const struct perf_event *perf_get_event(struct file *file); extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event); extern void perf_event_print_debug(void); extern void perf_pmu_disable(struct pmu *pmu); @@ -1289,6 +1290,10 @@ static inline void perf_event_exit_task(struct task_struct *child) { } static inline void perf_event_free_task(struct task_struct *task) { } static inline void perf_event_delayed_put(struct task_struct *task) { } static inline struct file *perf_event_get(unsigned int fd) { return ERR_PTR(-EINVAL); } +static inline const struct perf_event *perf_get_event(struct file *file) +{ + return ERR_PTR(-EINVAL); +} static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *event) { return ERR_PTR(-EINVAL); -- cgit v1.2.3 From 41bdc4b40ed6fb26c6acc655ed9a243a348709c9 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 24 May 2018 11:21:09 -0700 Subject: bpf: introduce bpf subcommand BPF_TASK_FD_QUERY Currently, suppose a userspace application has loaded a bpf program and attached it to a tracepoint/kprobe/uprobe, and a bpf introspection tool, e.g., bpftool, wants to show which bpf program is attached to which tracepoint/kprobe/uprobe. Such attachment information will be really useful to understand the overall bpf deployment in the system. There is a name field (16 bytes) for each program, which could be used to encode the attachment point. There are some drawbacks for this approaches. First, bpftool user (e.g., an admin) may not really understand the association between the name and the attachment point. Second, if one program is attached to multiple places, encoding a proper name which can imply all these attachments becomes difficult. This patch introduces a new bpf subcommand BPF_TASK_FD_QUERY. Given a pid and fd, if the is associated with a tracepoint/kprobe/uprobe perf event, BPF_TASK_FD_QUERY will return . prog_id . tracepoint name, or . k[ret]probe funcname + offset or kernel addr, or . u[ret]probe filename + offset to the userspace. The user can use "bpftool prog" to find more information about bpf program itself with prog_id. Acked-by: Martin KaFai Lau Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- include/linux/trace_events.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 2bde3eff564c..d34144a3c5b5 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -473,6 +473,9 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info); int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog); int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog); struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name); +int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, + u32 *fd_type, const char **buf, + u64 *probe_offset, u64 *probe_addr); #else static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) { @@ -504,6 +507,13 @@ static inline struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name { return NULL; } +static inline int bpf_get_perf_event_info(const struct perf_event *event, + u32 *prog_id, u32 *fd_type, + const char **buf, u64 *probe_offset, + u64 *probe_addr) +{ + return -EOPNOTSUPP; +} #endif enum { @@ -560,10 +570,17 @@ extern void perf_trace_del(struct perf_event *event, int flags); #ifdef CONFIG_KPROBE_EVENTS extern int perf_kprobe_init(struct perf_event *event, bool is_retprobe); extern void perf_kprobe_destroy(struct perf_event *event); +extern int bpf_get_kprobe_info(const struct perf_event *event, + u32 *fd_type, const char **symbol, + u64 *probe_offset, u64 *probe_addr, + bool perf_type_tracepoint); #endif #ifdef CONFIG_UPROBE_EVENTS extern int perf_uprobe_init(struct perf_event *event, bool is_retprobe); extern void perf_uprobe_destroy(struct perf_event *event); +extern int bpf_get_uprobe_info(const struct perf_event *event, + u32 *fd_type, const char **filename, + u64 *probe_offset, bool perf_type_tracepoint); #endif extern int ftrace_profile_set_filter(struct perf_event *event, int event_id, char *filter_str); -- cgit v1.2.3 From 67f29e07e131ffa13ea158c259a513f474c7df27 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 24 May 2018 16:45:46 +0200 Subject: bpf: devmap introduce dev_map_enqueue Functionality is the same, but the ndo_xdp_xmit call is now simply invoked from inside the devmap.c code. V2: Fix compile issue reported by kbuild test robot V5: Cleanups requested by Daniel - Newlines before func definition - Use BUILD_BUG_ON checks - Remove unnecessary use return value store in dev_map_enqueue Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1795eeee846c..23a809da452d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -487,14 +487,16 @@ int bpf_check(struct bpf_prog **fp, union bpf_attr *attr); void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); /* Map specifics */ -struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key); +struct xdp_buff; + +struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key); void __dev_map_insert_ctx(struct bpf_map *map, u32 index); void __dev_map_flush(struct bpf_map *map); +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp); struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key); void __cpu_map_insert_ctx(struct bpf_map *map, u32 index); void __cpu_map_flush(struct bpf_map *map); -struct xdp_buff; int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, struct net_device *dev_rx); @@ -573,6 +575,15 @@ static inline void __dev_map_flush(struct bpf_map *map) { } +struct xdp_buff; +struct bpf_dtab_netdev; + +static inline +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp) +{ + return 0; +} + static inline struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key) { @@ -587,7 +598,6 @@ static inline void __cpu_map_flush(struct bpf_map *map) { } -struct xdp_buff; static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, struct net_device *dev_rx) -- cgit v1.2.3 From 38edddb81172e8b8decb057c0cd23271583a5fa0 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 24 May 2018 16:45:57 +0200 Subject: xdp: add tracepoint for devmap like cpumap have Notice how this allow us get XDP statistic without affecting the XDP performance, as tracepoint is no-longer activated on a per packet basis. V5: Spotted by John Fastabend. Fix 'sent' also counted 'drops' in this patch, a later patch corrected this, but it was a mistake in this intermediate step. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 23a809da452d..bbe297436e5d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -492,7 +492,8 @@ struct xdp_buff; struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key); void __dev_map_insert_ctx(struct bpf_map *map, u32 index); void __dev_map_flush(struct bpf_map *map); -int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp); +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, + struct net_device *dev_rx); struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key); void __cpu_map_insert_ctx(struct bpf_map *map, u32 index); @@ -579,7 +580,8 @@ struct xdp_buff; struct bpf_dtab_netdev; static inline -int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp) +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, + struct net_device *dev_rx) { return 0; } -- cgit v1.2.3 From 735fc4054b3a25034445c6713d259da0f96f8131 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 24 May 2018 16:46:12 +0200 Subject: xdp: change ndo_xdp_xmit API to support bulking This patch change the API for ndo_xdp_xmit to support bulking xdp_frames. When kernel is compiled with CONFIG_RETPOLINE, XDP sees a huge slowdown. Most of the slowdown is caused by DMA API indirect function calls, but also the net_device->ndo_xdp_xmit() call. Benchmarked patch with CONFIG_RETPOLINE, using xdp_redirect_map with single flow/core test (CPU E5-1650 v4 @ 3.60GHz), showed performance improved: for driver ixgbe: 6,042,682 pps -> 6,853,768 pps = +811,086 pps for driver i40e : 6,187,169 pps -> 6,724,519 pps = +537,350 pps With frames avail as a bulk inside the driver ndo_xdp_xmit call, further optimizations are possible, like bulk DMA-mapping for TX. Testing without CONFIG_RETPOLINE show the same performance for physical NIC drivers. The virtual NIC driver tun sees a huge performance boost, as it can avoid doing per frame producer locking, but instead amortize the locking cost over the bulk. V2: Fix compile errors reported by kbuild test robot V4: Isolated ndo, driver changes and callers. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov --- include/linux/netdevice.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 03ed492c4e14..debdb6286170 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1185,9 +1185,13 @@ struct dev_ifalias { * This function is used to set or query state related to XDP on the * netdevice and manage BPF offload. See definition of * enum bpf_netdev_command for details. - * int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_frame *xdp); - * This function is used to submit a XDP packet for transmit on a - * netdevice. + * int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp); + * This function is used to submit @n XDP packets for transmit on a + * netdevice. Returns number of frames successfully transmitted, frames + * that got dropped are freed/returned via xdp_return_frame(). + * Returns negative number, means general error invoking ndo, meaning + * no frames were xmit'ed and core-caller will free all frames. + * TODO: Consider add flag to allow sending flush operation. * void (*ndo_xdp_flush)(struct net_device *dev); * This function is used to inform the driver to flush a particular * xdp tx queue. Must be called on same CPU as xdp_xmit. @@ -1375,8 +1379,8 @@ struct net_device_ops { int needed_headroom); int (*ndo_bpf)(struct net_device *dev, struct netdev_bpf *bpf); - int (*ndo_xdp_xmit)(struct net_device *dev, - struct xdp_frame *xdp); + int (*ndo_xdp_xmit)(struct net_device *dev, int n, + struct xdp_frame **xdp); void (*ndo_xdp_flush)(struct net_device *dev); }; -- cgit v1.2.3