From 13193b0f392f5a65d0d54185cb95ed5e99c0a5bf Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 25 May 2018 08:55:22 -0700 Subject: bpf: Define cgroup_bpf_enabled for CONFIG_CGROUP_BPF=n Static key is used to enable/disable cgroup-bpf related code paths at run time. Though it's not defined when cgroup-bpf is disabled at compile time, i.e. CONFIG_CGROUP_BPF=n, and if some code wants to use it, it has to do this: #ifdef CONFIG_CGROUP_BPF if (cgroup_bpf_enabled) { /* ... some work ... */ } #endif This code can be simplified by setting cgroup_bpf_enabled to 0 for CONFIG_CGROUP_BPF=n case: if (cgroup_bpf_enabled) { /* ... some work ... */ } And it aligns well with existing BPF_CGROUP_RUN_PROG_* macros that defined for both states of CONFIG_CGROUP_BPF. Signed-off-by: Andrey Ignatov Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 30d15e64b993..de8e89a3758b 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -185,6 +185,7 @@ struct cgroup_bpf {}; static inline void cgroup_bpf_put(struct cgroup *cgrp) {} static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } +#define cgroup_bpf_enabled (0) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0) #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) -- cgit v1.2.3 From 1cedee13d25ab118d325f95588c1a084e9317229 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 25 May 2018 08:55:23 -0700 Subject: bpf: Hooks for sys_sendmsg In addition to already existing BPF hooks for sys_bind and sys_connect, the patch provides new hooks for sys_sendmsg. It leverages existing BPF program type `BPF_PROG_TYPE_CGROUP_SOCK_ADDR` that provides access to socket itlself (properties like family, type, protocol) and user-passed `struct sockaddr *` so that BPF program can override destination IP and port for system calls such as sendto(2) or sendmsg(2) and/or assign source IP to the socket. The hooks are implemented as two new attach types: `BPF_CGROUP_UDP4_SENDMSG` and `BPF_CGROUP_UDP6_SENDMSG` for UDPv4 and UDPv6 correspondingly. UDPv4 and UDPv6 separate attach types for same reason as sys_bind and sys_connect hooks, i.e. to prevent reading from / writing to e.g. user_ip6 fields when user passes sockaddr_in since it'd be out-of-bound. The difference with already existing hooks is sys_sendmsg are implemented only for unconnected UDP. For TCP it doesn't make sense to change user-provided `struct sockaddr *` at sendto(2)/sendmsg(2) time since socket either was already connected and has source/destination set or wasn't connected and call to sendto(2)/sendmsg(2) would lead to ENOTCONN anyway. Connected UDP is already handled by sys_connect hooks that can override source/destination at connect time and use fast-path later, i.e. these hooks don't affect UDP fast-path. Rewriting source IP is implemented differently than that in sys_connect hooks. When sys_sendmsg is used with unconnected UDP it doesn't work to just bind socket to desired local IP address since source IP can be set on per-packet basis by using ancillary data (cmsg(3)). So no matter if socket is bound or not, source IP has to be rewritten on every call to sys_sendmsg. To do so two new fields are added to UAPI `struct bpf_sock_addr`; * `msg_src_ip4` to set source IPv4 for UDPv4; * `msg_src_ip6` to set source IPv6 for UDPv6. Signed-off-by: Andrey Ignatov Acked-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 23 +++++++++++++++++------ include/linux/filter.h | 1 + include/uapi/linux/bpf.h | 8 ++++++++ 3 files changed, 26 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index de8e89a3758b..975fb4cf1bb7 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -66,7 +66,8 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, struct sockaddr *uaddr, - enum bpf_attach_type type); + enum bpf_attach_type type, + void *t_ctx); int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, struct bpf_sock_ops_kern *sock_ops, @@ -120,16 +121,18 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, ({ \ int __ret = 0; \ if (cgroup_bpf_enabled) \ - __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type); \ + __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \ + NULL); \ __ret; \ }) -#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type) \ +#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled) { \ lock_sock(sk); \ - __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type); \ + __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \ + t_ctx); \ release_sock(sk); \ } \ __ret; \ @@ -151,10 +154,16 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_CONNECT) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_CONNECT) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_CONNECT, NULL) #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_CONNECT) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_CONNECT, NULL) + +#define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx) \ + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP4_SENDMSG, t_ctx) + +#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) \ + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_SENDMSG, t_ctx) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \ ({ \ @@ -198,6 +207,8 @@ static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) diff --git a/include/linux/filter.h b/include/linux/filter.h index d358d1815c16..d90abdaea94b 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1010,6 +1010,7 @@ struct bpf_sock_addr_kern { * only two (src and dst) are available at convert_ctx_access time */ u64 tmp_reg; + void *t_ctx; /* Attach type specific context. */ }; struct bpf_sock_ops_kern { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 9b8c6e310e9a..cc68787f2d97 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -160,6 +160,8 @@ enum bpf_attach_type { BPF_CGROUP_INET6_CONNECT, BPF_CGROUP_INET4_POST_BIND, BPF_CGROUP_INET6_POST_BIND, + BPF_CGROUP_UDP4_SENDMSG, + BPF_CGROUP_UDP6_SENDMSG, __MAX_BPF_ATTACH_TYPE }; @@ -2363,6 +2365,12 @@ struct bpf_sock_addr { __u32 family; /* Allows 4-byte read, but no write */ __u32 type; /* Allows 4-byte read, but no write */ __u32 protocol; /* Allows 4-byte read, but no write */ + __u32 msg_src_ip4; /* Allows 1,2,4-byte read an 4-byte write. + * Stored in network byte order. + */ + __u32 msg_src_ip6[4]; /* Allows 1,2,4-byte read an 4-byte write. + * Stored in network byte order. + */ }; /* User bpf_sock_ops struct to access socket values and specify request ops -- cgit v1.2.3 From 7a279e93330bd4f0074973293f1cd1aacf7c5fa6 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Tue, 29 May 2018 12:27:44 +0100 Subject: bpf: clean up eBPF helpers documentation These are minor edits for the eBPF helpers documentation in include/uapi/linux/bpf.h. The main fix consists in removing "BPF_FIB_LOOKUP_", because it ends with a non-escaped underscore that gets interpreted by rst2man and produces the following message in the resulting manual page: DOCUTILS SYSTEM MESSAGES System Message: ERROR/3 (/tmp/bpf-helpers.rst:, line 1514) Unknown target name: "bpf_fib_lookup". Other edits consist in: - Improving formatting for flag values for "bpf_fib_lookup()" helper. - Emphasising a parameter name in description of the return value for "bpf_get_stack()" helper. - Removing unnecessary blank lines between "Description" and "Return" sections for the few helpers that would use it, for consistency. Signed-off-by: Quentin Monnet Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index cc68787f2d97..3f556b35ac8d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1010,7 +1010,6 @@ union bpf_attr { * :: * * # sysctl kernel.perf_event_max_stack= - * * Return * The positive or null stack id on success, or a negative error * in case of failure. @@ -1821,10 +1820,9 @@ union bpf_attr { * :: * * # sysctl kernel.perf_event_max_stack= - * * Return - * a non-negative value equal to or less than size on success, or - * a negative error in case of failure. + * A non-negative value equal to or less than *size* on success, + * or a negative error in case of failure. * * int skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header) * Description @@ -1845,7 +1843,6 @@ union bpf_attr { * in socket filters where *skb*\ **->data** does not always point * to the start of the mac header and where "direct packet access" * is not available. - * * Return * 0 on success, or a negative error in case of failure. * @@ -1861,16 +1858,18 @@ union bpf_attr { * rt_metric is set to metric from route. * * *plen* argument is the size of the passed in struct. - * *flags* argument can be one or more BPF_FIB_LOOKUP_ flags: + * *flags* argument can be a combination of one or more of the + * following values: * - * **BPF_FIB_LOOKUP_DIRECT** means do a direct table lookup vs - * full lookup using FIB rules - * **BPF_FIB_LOOKUP_OUTPUT** means do lookup from an egress - * perspective (default is ingress) + * **BPF_FIB_LOOKUP_DIRECT** + * Do a direct table lookup vs full lookup using FIB + * rules. + * **BPF_FIB_LOOKUP_OUTPUT** + * Perform lookup from an egress perspective (default is + * ingress). * * *ctx* is either **struct xdp_md** for XDP programs or * **struct sk_buff** tc cls_act programs. - * * Return * Egress device index on success, 0 if packet needs to continue * up the stack for further processing or a negative error in case -- cgit v1.2.3 From fa898d769b264ede3c10cc30a537316c6a946956 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 29 May 2018 10:58:07 -0700 Subject: bpf: Drop mpls from bpf_fib_lookup MPLS support will not be submitted this dev cycle, but in working on it I do see a few changes are needed to the API. For now, drop mpls from the API. Since the fields in question are unions, the mpls fields can be added back later without affecting the uapi. Signed-off-by: David Ahern Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3f556b35ac8d..33f37eb0b6bf 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1852,10 +1852,10 @@ union bpf_attr { * If lookup is successful and result shows packet is to be * forwarded, the neighbor tables are searched for the nexthop. * If successful (ie., FIB lookup shows forwarding and nexthop - * is resolved), the nexthop address is returned in ipv4_dst, - * ipv6_dst or mpls_out based on family, smac is set to mac - * address of egress device, dmac is set to nexthop mac address, - * rt_metric is set to metric from route. + * is resolved), the nexthop address is returned in ipv4_dst + * or ipv6_dst based on family, smac is set to mac address of + * egress device, dmac is set to nexthop mac address, rt_metric + * is set to metric from route (IPv4/IPv6 only). * * *plen* argument is the size of the passed in struct. * *flags* argument can be a combination of one or more of the @@ -2537,8 +2537,10 @@ struct bpf_raw_tracepoint_args { #define BPF_FIB_LOOKUP_OUTPUT BIT(1) struct bpf_fib_lookup { - /* input */ - __u8 family; /* network family, AF_INET, AF_INET6, AF_MPLS */ + /* input: network family for lookup (AF_INET, AF_INET6) + * output: network family of egress nexthop + */ + __u8 family; /* set if lookup is to consider L4 data - e.g., FIB rules */ __u8 l4_protocol; @@ -2554,22 +2556,20 @@ struct bpf_fib_lookup { __u8 tos; /* AF_INET */ __be32 flowlabel; /* AF_INET6 */ - /* output: metric of fib result */ - __u32 rt_metric; + /* output: metric of fib result (IPv4/IPv6 only) */ + __u32 rt_metric; }; union { - __be32 mpls_in; __be32 ipv4_src; __u32 ipv6_src[4]; /* in6_addr; network order */ }; - /* input to bpf_fib_lookup, *dst is destination address. - * output: bpf_fib_lookup sets to gateway address + /* input to bpf_fib_lookup, ipv{4,6}_dst is destination address in + * network header. output: bpf_fib_lookup sets to gateway address + * if FIB lookup returns gateway route */ union { - /* return for MPLS lookups */ - __be32 mpls_out[4]; /* support up to 4 labels */ __be32 ipv4_dst; __u32 ipv6_dst[4]; /* in6_addr; network order */ }; -- cgit v1.2.3 From f4364dcfc86df7c1ca47b256eaf6b6d0cdd0d936 Mon Sep 17 00:00:00 2001 From: Sean Young Date: Sun, 27 May 2018 12:24:09 +0100 Subject: media: rc: introduce BPF_PROG_LIRC_MODE2 Add support for BPF_PROG_LIRC_MODE2. This type of BPF program can call rc_keydown() to reported decoded IR scancodes, or rc_repeat() to report that the last key should be repeated. The bpf program can be attached to using the bpf(BPF_PROG_ATTACH) syscall; the target_fd must be the /dev/lircN device. Acked-by: Yonghong Song Signed-off-by: Sean Young Signed-off-by: Daniel Borkmann --- include/linux/bpf_lirc.h | 29 ++++++++++++++++++++++++++ include/linux/bpf_types.h | 3 +++ include/uapi/linux/bpf.h | 53 ++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 84 insertions(+), 1 deletion(-) create mode 100644 include/linux/bpf_lirc.h (limited to 'include') diff --git a/include/linux/bpf_lirc.h b/include/linux/bpf_lirc.h new file mode 100644 index 000000000000..5f8a4283092d --- /dev/null +++ b/include/linux/bpf_lirc.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BPF_LIRC_H +#define _BPF_LIRC_H + +#include + +#ifdef CONFIG_BPF_LIRC_MODE2 +int lirc_prog_attach(const union bpf_attr *attr); +int lirc_prog_detach(const union bpf_attr *attr); +int lirc_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); +#else +static inline int lirc_prog_attach(const union bpf_attr *attr) +{ + return -EINVAL; +} + +static inline int lirc_prog_detach(const union bpf_attr *attr) +{ + return -EINVAL; +} + +static inline int lirc_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return -EINVAL; +} +#endif + +#endif /* _BPF_LIRC_H */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index b161e506dcfc..c5700c2d5549 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -26,6 +26,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT, raw_tracepoint) #ifdef CONFIG_CGROUP_BPF BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev) #endif +#ifdef CONFIG_BPF_LIRC_MODE2 +BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2) +#endif BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 33f37eb0b6bf..64ac0f7a689e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -143,6 +143,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_RAW_TRACEPOINT, BPF_PROG_TYPE_CGROUP_SOCK_ADDR, BPF_PROG_TYPE_LWT_SEG6LOCAL, + BPF_PROG_TYPE_LIRC_MODE2, }; enum bpf_attach_type { @@ -162,6 +163,7 @@ enum bpf_attach_type { BPF_CGROUP_INET6_POST_BIND, BPF_CGROUP_UDP4_SENDMSG, BPF_CGROUP_UDP6_SENDMSG, + BPF_LIRC_MODE2, __MAX_BPF_ATTACH_TYPE }; @@ -2005,6 +2007,53 @@ union bpf_attr { * direct packet access. * Return * 0 on success, or a negative error in case of failure. + * + * int bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle) + * Description + * This helper is used in programs implementing IR decoding, to + * report a successfully decoded key press with *scancode*, + * *toggle* value in the given *protocol*. The scancode will be + * translated to a keycode using the rc keymap, and reported as + * an input key down event. After a period a key up event is + * generated. This period can be extended by calling either + * **bpf_rc_keydown** () again with the same values, or calling + * **bpf_rc_repeat** (). + * + * Some protocols include a toggle bit, in case the button was + * released and pressed again between consecutive scancodes. + * + * The *ctx* should point to the lirc sample as passed into + * the program. + * + * The *protocol* is the decoded protocol number (see + * **enum rc_proto** for some predefined values). + * + * This helper is only available is the kernel was compiled with + * the **CONFIG_BPF_LIRC_MODE2** configuration option set to + * "**y**". + * + * Return + * 0 + * + * int bpf_rc_repeat(void *ctx) + * Description + * This helper is used in programs implementing IR decoding, to + * report a successfully decoded repeat key message. This delays + * the generation of a key up event for previously generated + * key down event. + * + * Some IR protocols like NEC have a special IR message for + * repeating last button, for when a button is held down. + * + * The *ctx* should point to the lirc sample as passed into + * the program. + * + * This helper is only available is the kernel was compiled with + * the **CONFIG_BPF_LIRC_MODE2** configuration option set to + * "**y**". + * + * Return + * 0 */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2083,7 +2132,9 @@ union bpf_attr { FN(lwt_push_encap), \ FN(lwt_seg6_store_bytes), \ FN(lwt_seg6_adjust_srh), \ - FN(lwt_seg6_action), + FN(lwt_seg6_action), \ + FN(rc_repeat), \ + FN(rc_keydown), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From 06be0864c77ae6861632a678a6378511a4828d6e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 2 Jun 2018 23:06:31 +0200 Subject: bpf: test case for map pointer poison with calls/branches Add several test cases where the same or different map pointers originate from different paths in the program and execute a map lookup or tail call at a common location. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index d90abdaea94b..6fd375fe7079 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -289,6 +289,16 @@ struct xdp_buff; .off = OFF, \ .imm = 0 }) +/* Relative call */ + +#define BPF_CALL_REL(TGT) \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_CALL, \ + .dst_reg = 0, \ + .src_reg = BPF_PSEUDO_CALL, \ + .off = 0, \ + .imm = TGT }) + /* Function call */ #define BPF_EMIT_CALL(FUNC) \ -- cgit v1.2.3 From 09772d92cd5ad998b0d5f6f46cd1658f8cb698cf Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 2 Jun 2018 23:06:35 +0200 Subject: bpf: avoid retpoline for lookup/update/delete calls on maps While some of the BPF map lookup helpers provide a ->map_gen_lookup() callback for inlining the map lookup altogether it is not available for every map, so the remaining ones have to call bpf_map_lookup_elem() helper which does a dispatch to map->ops->map_lookup_elem(). In times of retpolines, this will control and trap speculative execution rather than letting it do its work for the indirect call and will therefore cause a slowdown. Likewise, bpf_map_update_elem() and bpf_map_delete_elem() do not have an inlined version and need to call into their map->ops->map_update_elem() resp. map->ops->map_delete_elem() handlers. Before: # bpftool prog dump xlated id 1 0: (bf) r2 = r10 1: (07) r2 += -8 2: (7a) *(u64 *)(r2 +0) = 0 3: (18) r1 = map[id:1] 5: (85) call __htab_map_lookup_elem#232656 6: (15) if r0 == 0x0 goto pc+4 7: (71) r1 = *(u8 *)(r0 +35) 8: (55) if r1 != 0x0 goto pc+1 9: (72) *(u8 *)(r0 +35) = 1 10: (07) r0 += 56 11: (15) if r0 == 0x0 goto pc+4 12: (bf) r2 = r0 13: (18) r1 = map[id:1] 15: (85) call bpf_map_delete_elem#215008 <-- indirect call via 16: (95) exit helper After: # bpftool prog dump xlated id 1 0: (bf) r2 = r10 1: (07) r2 += -8 2: (7a) *(u64 *)(r2 +0) = 0 3: (18) r1 = map[id:1] 5: (85) call __htab_map_lookup_elem#233328 6: (15) if r0 == 0x0 goto pc+4 7: (71) r1 = *(u8 *)(r0 +35) 8: (55) if r1 != 0x0 goto pc+1 9: (72) *(u8 *)(r0 +35) = 1 10: (07) r0 += 56 11: (15) if r0 == 0x0 goto pc+4 12: (bf) r2 = r0 13: (18) r1 = map[id:1] 15: (85) call htab_lru_map_delete_elem#238240 <-- direct call 16: (95) exit In all three lookup/update/delete cases however we can use the actual address of the map callback directly if we find that there's only a single path with a map pointer leading to the helper call, meaning when the map pointer has not been poisoned from verifier side. Example code can be seen above for the delete case. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 6fd375fe7079..8e60f1e9a702 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -301,6 +301,9 @@ struct xdp_buff; /* Function call */ +#define BPF_CAST_CALL(x) \ + ((u64 (*)(u64, u64, u64, u64, u64))(x)) + #define BPF_EMIT_CALL(FUNC) \ ((struct bpf_insn) { \ .code = BPF_JMP | BPF_CALL, \ -- cgit v1.2.3 From cb20b08ead401fd17627a36f035c0bf5bfee5567 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 2 Jun 2018 23:06:36 +0200 Subject: bpf: add bpf_skb_cgroup_id helper Add a new bpf_skb_cgroup_id() helper that allows to retrieve the cgroup id from the skb's socket. This is useful in particular to enable bpf_get_cgroup_classid()-like behavior for cgroup v1 in cgroup v2 by allowing ID based matching on egress. This can in particular be used in combination with applying policy e.g. from map lookups, and also complements the older bpf_skb_under_cgroup() interface. In user space the cgroup id for a given path can be retrieved through the f_handle as demonstrated in [0] recently. [0] https://lkml.org/lkml/2018/5/22/1190 Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 64ac0f7a689e..661318141f72 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2054,6 +2054,22 @@ union bpf_attr { * * Return * 0 + * + * uint64_t bpf_skb_cgroup_id(struct sk_buff *skb) + * Description + * Return the cgroup v2 id of the socket associated with the *skb*. + * This is roughly similar to the **bpf_get_cgroup_classid**\ () + * helper for cgroup v1 by providing a tag resp. identifier that + * can be matched on or used for map lookups e.g. to implement + * policy. The cgroup v2 id of a given path in the hierarchy is + * exposed in user space through the f_handle API in order to get + * to the same 64-bit id. + * + * This helper can be used on TC egress path, but not on ingress, + * and is available only if the kernel was compiled with the + * **CONFIG_SOCK_CGROUP_DATA** configuration option. + * Return + * The id is returned or 0 in case the id could not be retrieved. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2134,7 +2150,8 @@ union bpf_attr { FN(lwt_seg6_adjust_srh), \ FN(lwt_seg6_action), \ FN(rc_repeat), \ - FN(rc_keydown), + FN(rc_keydown), \ + FN(skb_cgroup_id), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From 1fbc2e0cfcf9677683aea2b1f9ea76fbdc6fb6d1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 2 Jun 2018 23:06:37 +0200 Subject: bpf: make sure to clear unused fields in tunnel/xfrm state fetch Since the remaining bits are not filled in struct bpf_tunnel_key resp. struct bpf_xfrm_state and originate from uninitialized stack space, we should make sure to clear them before handing control back to the program. Also add a padding element to struct bpf_xfrm_state for future use similar as we have in struct bpf_tunnel_key and clear it as well. struct bpf_xfrm_state { __u32 reqid; /* 0 4 */ __u32 spi; /* 4 4 */ __u16 family; /* 8 2 */ /* XXX 2 bytes hole, try to pack */ union { __u32 remote_ipv4; /* 4 */ __u32 remote_ipv6[4]; /* 16 */ }; /* 12 16 */ /* size: 28, cachelines: 1, members: 4 */ /* sum members: 26, holes: 1, sum holes: 2 */ /* last cacheline: 28 bytes */ }; Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 661318141f72..f0b6608b1f1c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2268,7 +2268,7 @@ struct bpf_tunnel_key { }; __u8 tunnel_tos; __u8 tunnel_ttl; - __u16 tunnel_ext; + __u16 tunnel_ext; /* Padding, future use. */ __u32 tunnel_label; }; @@ -2279,6 +2279,7 @@ struct bpf_xfrm_state { __u32 reqid; __u32 spi; /* Stored in network byte order */ __u16 family; + __u16 ext; /* Padding, future use. */ union { __u32 remote_ipv4; /* Stored in network byte order */ __u32 remote_ipv6[4]; /* Stored in network byte order */ -- cgit v1.2.3 From bc23105ca0abdeed98366af01c700c2c3aff5cd5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 2 Jun 2018 23:06:39 +0200 Subject: bpf: fix context access in tracing progs on 32 bit archs Wang reported that all the testcases for BPF_PROG_TYPE_PERF_EVENT program type in test_verifier report the following errors on x86_32: 172/p unpriv: spill/fill of different pointers ldx FAIL Unexpected error message! 0: (bf) r6 = r10 1: (07) r6 += -8 2: (15) if r1 == 0x0 goto pc+3 R1=ctx(id=0,off=0,imm=0) R6=fp-8,call_-1 R10=fp0,call_-1 3: (bf) r2 = r10 4: (07) r2 += -76 5: (7b) *(u64 *)(r6 +0) = r2 6: (55) if r1 != 0x0 goto pc+1 R1=ctx(id=0,off=0,imm=0) R2=fp-76,call_-1 R6=fp-8,call_-1 R10=fp0,call_-1 fp-8=fp 7: (7b) *(u64 *)(r6 +0) = r1 8: (79) r1 = *(u64 *)(r6 +0) 9: (79) r1 = *(u64 *)(r1 +68) invalid bpf_context access off=68 size=8 378/p check bpf_perf_event_data->sample_period byte load permitted FAIL Failed to load prog 'Permission denied'! 0: (b7) r0 = 0 1: (71) r0 = *(u8 *)(r1 +68) invalid bpf_context access off=68 size=1 379/p check bpf_perf_event_data->sample_period half load permitted FAIL Failed to load prog 'Permission denied'! 0: (b7) r0 = 0 1: (69) r0 = *(u16 *)(r1 +68) invalid bpf_context access off=68 size=2 380/p check bpf_perf_event_data->sample_period word load permitted FAIL Failed to load prog 'Permission denied'! 0: (b7) r0 = 0 1: (61) r0 = *(u32 *)(r1 +68) invalid bpf_context access off=68 size=4 381/p check bpf_perf_event_data->sample_period dword load permitted FAIL Failed to load prog 'Permission denied'! 0: (b7) r0 = 0 1: (79) r0 = *(u64 *)(r1 +68) invalid bpf_context access off=68 size=8 Reason is that struct pt_regs on x86_32 doesn't fully align to 8 byte boundary due to its size of 68 bytes. Therefore, bpf_ctx_narrow_access_ok() will then bail out saying that off & (size_default - 1) which is 68 & 7 doesn't cleanly align in the case of sample_period access from struct bpf_perf_event_data, hence verifier wrongly thinks we might be doing an unaligned access here though underlying arch can handle it just fine. Therefore adjust this down to machine size and check and rewrite the offset for narrow access on that basis. We also need to fix corresponding pe_prog_is_valid_access(), since we hit the check for off % size != 0 (e.g. 68 % 8 -> 4) in the first and last test. With that in place, progs for tracing work on x86_32. Reported-by: Wang YanQing Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Tested-by: Wang YanQing Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 8e60f1e9a702..45fc0f5000d8 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -639,16 +639,34 @@ static inline bool bpf_prog_was_classic(const struct bpf_prog *prog) return prog->type == BPF_PROG_TYPE_UNSPEC; } -static inline bool -bpf_ctx_narrow_access_ok(u32 off, u32 size, const u32 size_default) +static inline u32 bpf_ctx_off_adjust_machine(u32 size) +{ + const u32 size_machine = sizeof(unsigned long); + + if (size > size_machine && size % size_machine == 0) + size = size_machine; + + return size; +} + +static inline bool bpf_ctx_narrow_align_ok(u32 off, u32 size_access, + u32 size_default) { - bool off_ok; + size_default = bpf_ctx_off_adjust_machine(size_default); + size_access = bpf_ctx_off_adjust_machine(size_access); + #ifdef __LITTLE_ENDIAN - off_ok = (off & (size_default - 1)) == 0; + return (off & (size_default - 1)) == 0; #else - off_ok = (off & (size_default - 1)) + size == size_default; + return (off & (size_default - 1)) + size_access == size_default; #endif - return off_ok && size <= size_default && (size & (size - 1)) == 0; +} + +static inline bool +bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default) +{ + return bpf_ctx_narrow_align_ok(off, size, size_default) && + size <= size_default && (size & (size - 1)) == 0; } #define bpf_classic_proglen(fprog) (fprog->len * sizeof(fprog->filter[0])) -- cgit v1.2.3 From 42b33468987bac0dd95c30f14820c7abac04a153 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 31 May 2018 10:59:47 +0200 Subject: xdp: add flags argument to ndo_xdp_xmit API This patch only change the API and reject any use of flags. This is an intermediate step that allows us to implement the flush flag operation later, for each individual driver in a separate patch. The plan is to implement flush operation via XDP_XMIT_FLUSH flag and then remove XDP_XMIT_FLAGS_NONE when done. Signed-off-by: Jesper Dangaard Brouer Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- include/linux/netdevice.h | 7 ++++--- include/net/xdp.h | 5 +++++ 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8452f72087ef..7f17785a59d7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1185,13 +1185,13 @@ struct dev_ifalias { * This function is used to set or query state related to XDP on the * netdevice and manage BPF offload. See definition of * enum bpf_netdev_command for details. - * int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp); + * int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp, + * u32 flags); * This function is used to submit @n XDP packets for transmit on a * netdevice. Returns number of frames successfully transmitted, frames * that got dropped are freed/returned via xdp_return_frame(). * Returns negative number, means general error invoking ndo, meaning * no frames were xmit'ed and core-caller will free all frames. - * TODO: Consider add flag to allow sending flush operation. * void (*ndo_xdp_flush)(struct net_device *dev); * This function is used to inform the driver to flush a particular * xdp tx queue. Must be called on same CPU as xdp_xmit. @@ -1380,7 +1380,8 @@ struct net_device_ops { int (*ndo_bpf)(struct net_device *dev, struct netdev_bpf *bpf); int (*ndo_xdp_xmit)(struct net_device *dev, int n, - struct xdp_frame **xdp); + struct xdp_frame **xdp, + u32 flags); void (*ndo_xdp_flush)(struct net_device *dev); }; diff --git a/include/net/xdp.h b/include/net/xdp.h index 7ad779237ae8..0c45f0f943ed 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -40,6 +40,11 @@ enum xdp_mem_type { MEM_TYPE_MAX, }; +/* XDP flags for ndo_xdp_xmit */ +#define XDP_XMIT_FLAGS_NONE 0U +#define XDP_XMIT_FLUSH (1U << 0) /* doorbell signal consumer */ +#define XDP_XMIT_FLAGS_MASK XDP_XMIT_FLUSH + struct xdp_mem_info { u32 type; /* enum xdp_mem_type, but known size type */ u32 id; -- cgit v1.2.3 From 73de5717eb30ceedef6abf9da4bc2194f25d92ca Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 31 May 2018 11:00:13 +0200 Subject: xdp: done implementing ndo_xdp_xmit flush flag for all drivers Removing XDP_XMIT_FLAGS_NONE as all driver now implement a flush operation in their ndo_xdp_xmit call. The compiler will catch if any users of XDP_XMIT_FLAGS_NONE remains. Signed-off-by: Jesper Dangaard Brouer Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- include/net/xdp.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/xdp.h b/include/net/xdp.h index 0c45f0f943ed..a3b71a4dd71d 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -41,7 +41,6 @@ enum xdp_mem_type { }; /* XDP flags for ndo_xdp_xmit */ -#define XDP_XMIT_FLAGS_NONE 0U #define XDP_XMIT_FLUSH (1U << 0) /* doorbell signal consumer */ #define XDP_XMIT_FLAGS_MASK XDP_XMIT_FLUSH -- cgit v1.2.3 From bf6fa2c893c5237b48569a13fa3c673041430b6c Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sun, 3 Jun 2018 15:59:41 -0700 Subject: bpf: implement bpf_get_current_cgroup_id() helper bpf has been used extensively for tracing. For example, bcc contains an almost full set of bpf-based tools to trace kernel and user functions/events. Most tracing tools are currently either filtered based on pid or system-wide. Containers have been used quite extensively in industry and cgroup is often used together to provide resource isolation and protection. Several processes may run inside the same container. It is often desirable to get container-level tracing results as well, e.g. syscall count, function count, I/O activity, etc. This patch implements a new helper, bpf_get_current_cgroup_id(), which will return cgroup id based on the cgroup within which the current task is running. The later patch will provide an example to show that userspace can get the same cgroup id so it could configure a filter or policy in the bpf program based on task cgroup id. The helper is currently implemented for tracing. It can be added to other program types as well when needed. Acked-by: Alexei Starovoitov Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 8 +++++++- 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index bbe297436e5d..995c3b1e59bf 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -746,6 +746,7 @@ extern const struct bpf_func_proto bpf_get_stackid_proto; extern const struct bpf_func_proto bpf_get_stack_proto; extern const struct bpf_func_proto bpf_sock_map_update_proto; extern const struct bpf_func_proto bpf_sock_hash_update_proto; +extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto; /* Shared helpers among cBPF and eBPF. */ void bpf_user_rnd_init_once(void); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f0b6608b1f1c..18712b0dbfe7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2070,6 +2070,11 @@ union bpf_attr { * **CONFIG_SOCK_CGROUP_DATA** configuration option. * Return * The id is returned or 0 in case the id could not be retrieved. + * + * u64 bpf_get_current_cgroup_id(void) + * Return + * A 64-bit integer containing the current cgroup id based + * on the cgroup within which the current task is running. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2151,7 +2156,8 @@ union bpf_attr { FN(lwt_seg6_action), \ FN(rc_repeat), \ FN(rc_keydown), \ - FN(skb_cgroup_id), + FN(skb_cgroup_id), \ + FN(get_current_cgroup_id), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From bd3a08aaa9a383ffbbd5b788b797ae6e64eaa7a1 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sun, 3 Jun 2018 08:15:19 -0700 Subject: bpf: flowlabel in bpf_fib_lookup should be flowinfo As Michal noted the flow struct takes both the flow label and priority. Update the bpf_fib_lookup API to note that it is flowinfo and not just the flow label. Cc: Michal Kubecek Signed-off-by: David Ahern Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 18712b0dbfe7..eeb6237be5c2 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2629,7 +2629,7 @@ struct bpf_fib_lookup { union { /* inputs to lookup */ __u8 tos; /* AF_INET */ - __be32 flowlabel; /* AF_INET6 */ + __be32 flowinfo; /* AF_INET6, flow_label + priority */ /* output: metric of fib result (IPv4/IPv6 only) */ __u32 rt_metric; -- cgit v1.2.3 From bbff2f321a864ee07c9d3d1245af498023146951 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 4 Jun 2018 13:57:13 +0200 Subject: xsk: new descriptor addressing scheme MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, AF_XDP only supports a fixed frame-size memory scheme where each frame is referenced via an index (idx). A user passes the frame index to the kernel, and the kernel acts upon the data. Some NICs, however, do not have a fixed frame-size model, instead they have a model where a memory window is passed to the hardware and multiple frames are filled into that window (referred to as the "type-writer" model). By changing the descriptor format from the current frame index addressing scheme, AF_XDP can in the future be extended to support these kinds of NICs. In the index-based model, an idx refers to a frame of size frame_size. Addressing a frame in the UMEM is done by offseting the UMEM starting address by a global offset, idx * frame_size + offset. Communicating via the fill- and completion-rings are done by means of idx. In this commit, the idx is removed in favor of an address (addr), which is a relative address ranging over the UMEM. To convert an idx-based address to the new addr is simply: addr = idx * frame_size + offset. We also stop referring to the UMEM "frame" as a frame. Instead it is simply called a chunk. To transfer ownership of a chunk to the kernel, the addr of the chunk is passed in the fill-ring. Note, that the kernel will mask addr to make it chunk aligned, so there is no need for userspace to do that. E.g., for a chunk size of 2k, passing an addr of 2048, 2050 or 3000 to the fill-ring will refer to the same chunk. On the completion-ring, the addr will match that of the Tx descriptor, passed to the kernel. Changing the descriptor format to use chunks/addr will allow for future changes to move to a type-writer based model, where multiple frames can reside in one chunk. In this model passing one single chunk into the fill-ring, would potentially result in multiple Rx descriptors. This commit changes the uapi of AF_XDP sockets, and updates the documentation. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- include/uapi/linux/if_xdp.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 4737cfe222f5..e411d6f9ac65 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -48,8 +48,8 @@ struct xdp_mmap_offsets { struct xdp_umem_reg { __u64 addr; /* Start of packet data area */ __u64 len; /* Length of packet data area */ - __u32 frame_size; /* Frame size */ - __u32 frame_headroom; /* Frame head room */ + __u32 chunk_size; + __u32 headroom; }; struct xdp_statistics { @@ -66,13 +66,11 @@ struct xdp_statistics { /* Rx/Tx descriptor */ struct xdp_desc { - __u32 idx; + __u64 addr; __u32 len; - __u16 offset; - __u8 flags; - __u8 padding[5]; + __u32 options; }; -/* UMEM descriptor is __u32 */ +/* UMEM descriptor is __u64 */ #endif /* _LINUX_IF_XDP_H */ -- cgit v1.2.3 From 189454e868c45ce3476bfac03ace921dec34208a Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 5 Jun 2018 13:55:50 +0200 Subject: net: remove net_device operation ndo_xdp_flush All drivers are cleaned up and no references to ndo_xdp_flush are left in drivers, it is time to remove the net_device_ops operation ndo_xdp_flush. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- include/linux/netdevice.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7f17785a59d7..42c6ea35a6f2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1192,9 +1192,6 @@ struct dev_ifalias { * that got dropped are freed/returned via xdp_return_frame(). * Returns negative number, means general error invoking ndo, meaning * no frames were xmit'ed and core-caller will free all frames. - * void (*ndo_xdp_flush)(struct net_device *dev); - * This function is used to inform the driver to flush a particular - * xdp tx queue. Must be called on same CPU as xdp_xmit. */ struct net_device_ops { int (*ndo_init)(struct net_device *dev); @@ -1382,7 +1379,6 @@ struct net_device_ops { int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp, u32 flags); - void (*ndo_xdp_flush)(struct net_device *dev); }; /** -- cgit v1.2.3 From e61e62b9e2cc14b336f330f37f517f9d373ff31e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 4 Jun 2018 14:05:51 +0200 Subject: xsk: moved struct xdp_umem definition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Moved struct xdp_umem to xdp_sock.h, in order to prepare for zero-copy support. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- include/net/xdp_sock.h | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 7a647c56ec15..3a6cd88f179d 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -6,12 +6,34 @@ #ifndef _LINUX_XDP_SOCK_H #define _LINUX_XDP_SOCK_H +#include +#include #include +#include #include struct net_device; struct xsk_queue; -struct xdp_umem; + +struct xdp_umem_props { + u64 chunk_mask; + u64 size; +}; + +struct xdp_umem { + struct xsk_queue *fq; + struct xsk_queue *cq; + struct page **pgs; + struct xdp_umem_props props; + u32 headroom; + u32 chunk_size_nohr; + struct user_struct *user; + struct pid *pid; + unsigned long address; + refcount_t users; + struct work_struct work; + u32 npgs; +}; struct xdp_sock { /* struct sock must be the first member of struct xdp_sock */ -- cgit v1.2.3 From 8aef7340ae9695912a411886452ae9773206e845 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 4 Jun 2018 14:05:52 +0200 Subject: xsk: introduce xdp_umem_page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The xdp_umem_page holds the address for a page. Trade memory for faster lookup. Later, we'll add DMA address here as well. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- include/net/xdp_sock.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 3a6cd88f179d..caf343a7e224 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -20,10 +20,14 @@ struct xdp_umem_props { u64 size; }; +struct xdp_umem_page { + void *addr; +}; + struct xdp_umem { struct xsk_queue *fq; struct xsk_queue *cq; - struct page **pgs; + struct xdp_umem_page *pages; struct xdp_umem_props props; u32 headroom; u32 chunk_size_nohr; @@ -32,6 +36,7 @@ struct xdp_umem { unsigned long address; refcount_t users; struct work_struct work; + struct page **pgs; u32 npgs; }; -- cgit v1.2.3 From 74515c5750f30244a901c3c0c82a2fe534b3c9c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 4 Jun 2018 14:05:53 +0200 Subject: net: xdp: added bpf_netdev_command XDP_{QUERY, SETUP}_XSK_UMEM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend ndo_bpf with two new commands used for query zero-copy support and register an UMEM to a queue_id of a netdev. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- include/linux/netdevice.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 42c6ea35a6f2..cc4ea7ab6d24 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -817,10 +817,13 @@ enum bpf_netdev_command { BPF_OFFLOAD_DESTROY, BPF_OFFLOAD_MAP_ALLOC, BPF_OFFLOAD_MAP_FREE, + XDP_QUERY_XSK_UMEM, + XDP_SETUP_XSK_UMEM, }; struct bpf_prog_offload_ops; struct netlink_ext_ack; +struct xdp_umem; struct netdev_bpf { enum bpf_netdev_command command; @@ -851,6 +854,11 @@ struct netdev_bpf { struct { struct bpf_offloaded_map *offmap; }; + /* XDP_SETUP_XSK_UMEM */ + struct { + struct xdp_umem *umem; + u16 queue_id; + } xsk; }; }; -- cgit v1.2.3 From 02b55e5657c3a569fc681ba851e464cfa6b90d4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 4 Jun 2018 14:05:54 +0200 Subject: xdp: add MEM_TYPE_ZERO_COPY MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Here, a new type of allocator support is added to the XDP return API. A zero-copy allocated xdp_buff cannot be converted to an xdp_frame. Instead is the buff has to be copied. This is not supported at all in this commit. Also, an opaque "handle" is added to xdp_buff. This can be used as a context for the zero-copy allocator implementation. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- include/net/xdp.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/net/xdp.h b/include/net/xdp.h index a3b71a4dd71d..2deea7166a34 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -37,6 +37,7 @@ enum xdp_mem_type { MEM_TYPE_PAGE_SHARED = 0, /* Split-page refcnt based model */ MEM_TYPE_PAGE_ORDER0, /* Orig XDP full page model */ MEM_TYPE_PAGE_POOL, + MEM_TYPE_ZERO_COPY, MEM_TYPE_MAX, }; @@ -51,6 +52,10 @@ struct xdp_mem_info { struct page_pool; +struct zero_copy_allocator { + void (*free)(struct zero_copy_allocator *zca, unsigned long handle); +}; + struct xdp_rxq_info { struct net_device *dev; u32 queue_index; @@ -63,6 +68,7 @@ struct xdp_buff { void *data_end; void *data_meta; void *data_hard_start; + unsigned long handle; struct xdp_rxq_info *rxq; }; @@ -86,6 +92,10 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) int metasize; int headroom; + /* TODO: implement clone, copy, use "native" MEM_TYPE */ + if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) + return NULL; + /* Assure headroom is available for storing info */ headroom = xdp->data - xdp->data_hard_start; metasize = xdp->data - xdp->data_meta; -- cgit v1.2.3 From 173d3adb6f437037f216270955886ca9878187a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Mon, 4 Jun 2018 14:05:55 +0200 Subject: xsk: add zero-copy support for Rx MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend the xsk_rcv to support the new MEM_TYPE_ZERO_COPY memory, and wireup ndo_bpf call in bind. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- include/net/xdp_sock.h | 6 ++++++ include/uapi/linux/if_xdp.h | 4 +++- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index caf343a7e224..d93d3aac3fc9 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -22,6 +22,7 @@ struct xdp_umem_props { struct xdp_umem_page { void *addr; + dma_addr_t dma; }; struct xdp_umem { @@ -38,6 +39,9 @@ struct xdp_umem { struct work_struct work; struct page **pgs; u32 npgs; + struct net_device *dev; + u16 queue_id; + bool zc; }; struct xdp_sock { @@ -60,6 +64,8 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); void xsk_flush(struct xdp_sock *xs); bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs); +u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr); +void xsk_umem_discard_addr(struct xdp_umem *umem); #else static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index e411d6f9ac65..1fa0e977ea8d 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -13,7 +13,9 @@ #include /* Options for the sxdp_flags field */ -#define XDP_SHARED_UMEM 1 +#define XDP_SHARED_UMEM (1 << 0) +#define XDP_COPY (1 << 1) /* Force copy-mode */ +#define XDP_ZEROCOPY (1 << 2) /* Force zero-copy mode */ struct sockaddr_xdp { __u16 sxdp_family; -- cgit v1.2.3 From e3760c7e50ac6cdf1188fec44938dd7e6e6eef61 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Mon, 4 Jun 2018 14:05:56 +0200 Subject: net: added netdevice operation for Tx Added ndo_xsk_async_xmit. This ndo "kicks" the netdev to start to pull userland AF_XDP Tx frames from a NAPI context. Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cc4ea7ab6d24..03ffeadf8a41 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1387,6 +1387,8 @@ struct net_device_ops { int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp, u32 flags); + int (*ndo_xsk_async_xmit)(struct net_device *dev, + u32 queue_id); }; /** -- cgit v1.2.3 From ac98d8aab61baf785eb8f099b36daf34fc76a70e Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Mon, 4 Jun 2018 14:05:57 +0200 Subject: xsk: wire upp Tx zero-copy functions Here we add the functionality required to support zero-copy Tx, and also exposes various zero-copy related functions for the netdevs. Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann --- include/net/xdp_sock.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index d93d3aac3fc9..9fe472f2ac95 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -42,6 +43,8 @@ struct xdp_umem { struct net_device *dev; u16 queue_id; bool zc; + spinlock_t xsk_list_lock; + struct list_head xsk_list; }; struct xdp_sock { @@ -53,6 +56,8 @@ struct xdp_sock { struct list_head flush_node; u16 queue_id; struct xsk_queue *tx ____cacheline_aligned_in_smp; + struct list_head list; + bool zc; /* Protects multiple processes in the control path */ struct mutex mutex; u64 rx_dropped; @@ -64,8 +69,12 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); void xsk_flush(struct xdp_sock *xs); bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs); +/* Used from netdev driver */ u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr); void xsk_umem_discard_addr(struct xdp_umem *umem); +void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries); +bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len); +void xsk_umem_consume_tx_done(struct xdp_umem *umem); #else static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { -- cgit v1.2.3