From e7d4798960b3ebcd243ae6a59e04d4fe6518c96c Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Thu, 13 Jun 2019 18:39:58 +0900 Subject: xdp: Add tracepoint for bulk XDP_TX This is introduced for admins to check what is happening on XDP_TX when bulk XDP_TX is in use, which will be first introduced in veth in next commit. v3: - Add act field to be in line with other XDP tracepoints. Signed-off-by: Toshiaki Makita Acked-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann --- include/trace/events/xdp.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include') diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index bb5e380e2ef3..81e708c4b513 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -50,6 +50,35 @@ TRACE_EVENT(xdp_exception, __entry->ifindex) ); +TRACE_EVENT(xdp_bulk_tx, + + TP_PROTO(const struct net_device *dev, + int sent, int drops, int err), + + TP_ARGS(dev, sent, drops, err), + + TP_STRUCT__entry( + __field(int, ifindex) + __field(u32, act) + __field(int, drops) + __field(int, sent) + __field(int, err) + ), + + TP_fast_assign( + __entry->ifindex = dev->ifindex; + __entry->act = XDP_TX; + __entry->drops = drops; + __entry->sent = sent; + __entry->err = err; + ), + + TP_printk("ifindex=%d action=%s sent=%d drops=%d err=%d", + __entry->ifindex, + __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), + __entry->sent, __entry->drops, __entry->err) +); + DECLARE_EVENT_CLASS(xdp_redirect_template, TP_PROTO(const struct net_device *dev, -- cgit v1.2.3 From d57d76428ae9abca51fb89f9326da9d4b1cf8270 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Wed, 26 Jun 2019 17:35:24 +0300 Subject: xsk: Add API to check for available entries in FQ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a function that checks whether the Fill Ring has the specified amount of descriptors available. It will be useful for mlx5e that wants to check in advance, whether it can allocate a bulk of RX descriptors, to get the best performance. Signed-off-by: Maxim Mikityanskiy Signed-off-by: Tariq Toukan Acked-by: Saeed Mahameed Acked-by: Björn Töpel Signed-off-by: Daniel Borkmann --- include/net/xdp_sock.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index ae0f368a62bb..b6f5ebae43a1 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -77,6 +77,7 @@ int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); void xsk_flush(struct xdp_sock *xs); bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs); /* Used from netdev driver */ +bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt); u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr); void xsk_umem_discard_addr(struct xdp_umem *umem); void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries); @@ -99,6 +100,16 @@ static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr) } /* Reuse-queue aware version of FILL queue helpers */ +static inline bool xsk_umem_has_addrs_rq(struct xdp_umem *umem, u32 cnt) +{ + struct xdp_umem_fq_reuse *rq = umem->fq_reuse; + + if (rq->length >= cnt) + return true; + + return xsk_umem_has_addrs(umem, cnt - rq->length); +} + static inline u64 *xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr) { struct xdp_umem_fq_reuse *rq = umem->fq_reuse; @@ -146,6 +157,11 @@ static inline bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) return false; } +static inline bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt) +{ + return false; +} + static inline u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) { return NULL; @@ -200,6 +216,11 @@ static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr) return 0; } +static inline bool xsk_umem_has_addrs_rq(struct xdp_umem *umem, u32 cnt) +{ + return false; +} + static inline u64 *xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr) { return NULL; -- cgit v1.2.3 From 2640d3c8123223e0a205b2a25a446df6f072b3ea Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Wed, 26 Jun 2019 17:35:25 +0300 Subject: xsk: Add getsockopt XDP_OPTIONS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make it possible for the application to determine whether the AF_XDP socket is running in zero-copy mode. To achieve this, add a new getsockopt option XDP_OPTIONS that returns flags. The only flag supported for now is the zero-copy mode indicator. Signed-off-by: Maxim Mikityanskiy Signed-off-by: Tariq Toukan Acked-by: Saeed Mahameed Acked-by: Björn Töpel Signed-off-by: Daniel Borkmann --- include/uapi/linux/if_xdp.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index caed8b1614ff..faaa5ca2a117 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -46,6 +46,7 @@ struct xdp_mmap_offsets { #define XDP_UMEM_FILL_RING 5 #define XDP_UMEM_COMPLETION_RING 6 #define XDP_STATISTICS 7 +#define XDP_OPTIONS 8 struct xdp_umem_reg { __u64 addr; /* Start of packet data area */ @@ -60,6 +61,13 @@ struct xdp_statistics { __u64 tx_invalid_descs; /* Dropped due to invalid descriptor */ }; +struct xdp_options { + __u32 flags; +}; + +/* Flags for the flags field of struct xdp_options */ +#define XDP_OPTIONS_ZEROCOPY (1 << 0) + /* Pgoff for mmaping the rings */ #define XDP_PGOFF_RX_RING 0 #define XDP_PGOFF_TX_RING 0x80000000 -- cgit v1.2.3 From 4bce4e5cb65587f805655ec6808a20af2036627a Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Wed, 26 Jun 2019 17:35:28 +0300 Subject: xsk: Return the whole xdp_desc from xsk_umem_consume_tx MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some drivers want to access the data transmitted in order to implement acceleration features of the NICs. It is also useful in AF_XDP TX flow. Change the xsk_umem_consume_tx API to return the whole xdp_desc, that contains the data pointer, length and DMA address, instead of only the latter two. Adapt the implementation of i40e and ixgbe to this change. Signed-off-by: Maxim Mikityanskiy Signed-off-by: Tariq Toukan Acked-by: Saeed Mahameed Cc: Björn Töpel Cc: Magnus Karlsson Acked-by: Björn Töpel Signed-off-by: Daniel Borkmann --- include/net/xdp_sock.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index b6f5ebae43a1..057b159ff8b9 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -81,7 +81,7 @@ bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt); u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr); void xsk_umem_discard_addr(struct xdp_umem *umem); void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries); -bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len); +bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc); void xsk_umem_consume_tx_done(struct xdp_umem *umem); struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries); struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem, @@ -175,8 +175,8 @@ static inline void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries) { } -static inline bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, - u32 *len) +static inline bool xsk_umem_consume_tx(struct xdp_umem *umem, + struct xdp_desc *desc) { return false; } -- cgit v1.2.3 From 0d01da6afc5402f60325c5da31b22f7d56689b49 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 27 Jun 2019 13:38:47 -0700 Subject: bpf: implement getsockopt and setsockopt hooks Implement new BPF_PROG_TYPE_CGROUP_SOCKOPT program type and BPF_CGROUP_{G,S}ETSOCKOPT cgroup hooks. BPF_CGROUP_SETSOCKOPT can modify user setsockopt arguments before passing them down to the kernel or bypass kernel completely. BPF_CGROUP_GETSOCKOPT can can inspect/modify getsockopt arguments that kernel returns. Both hooks reuse existing PTR_TO_PACKET{,_END} infrastructure. The buffer memory is pre-allocated (because I don't think there is a precedent for working with __user memory from bpf). This might be slow to do for each {s,g}etsockopt call, that's why I've added __cgroup_bpf_prog_array_is_empty that exits early if there is nothing attached to a cgroup. Note, however, that there is a race between __cgroup_bpf_prog_array_is_empty and BPF_PROG_RUN_ARRAY where cgroup program layout might have changed; this should not be a problem because in general there is a race between multiple calls to {s,g}etsocktop and user adding/removing bpf progs from a cgroup. The return code of the BPF program is handled as follows: * 0: EPERM * 1: success, continue with next BPF program in the cgroup chain v9: * allow overwriting setsockopt arguments (Alexei Starovoitov): * use set_fs (same as kernel_setsockopt) * buffer is always kzalloc'd (no small on-stack buffer) v8: * use s32 for optlen (Andrii Nakryiko) v7: * return only 0 or 1 (Alexei Starovoitov) * always run all progs (Alexei Starovoitov) * use optval=0 as kernel bypass in setsockopt (Alexei Starovoitov) (decided to use optval=-1 instead, optval=0 might be a valid input) * call getsockopt hook after kernel handlers (Alexei Starovoitov) v6: * rework cgroup chaining; stop as soon as bpf program returns 0 or 2; see patch with the documentation for the details * drop Andrii's and Martin's Acked-by (not sure they are comfortable with the new state of things) v5: * skip copy_to_user() and put_user() when ret == 0 (Martin Lau) v4: * don't export bpf_sk_fullsock helper (Martin Lau) * size != sizeof(__u64) for uapi pointers (Martin Lau) * offsetof instead of bpf_ctx_range when checking ctx access (Martin Lau) v3: * typos in BPF_PROG_CGROUP_SOCKOPT_RUN_ARRAY comments (Andrii Nakryiko) * reverse christmas tree in BPF_PROG_CGROUP_SOCKOPT_RUN_ARRAY (Andrii Nakryiko) * use __bpf_md_ptr instead of __u32 for optval{,_end} (Martin Lau) * use BPF_FIELD_SIZEOF() for consistency (Martin Lau) * new CG_SOCKOPT_ACCESS macro to wrap repeated parts v2: * moved bpf_sockopt_kern fields around to remove a hole (Martin Lau) * aligned bpf_sockopt_kern->buf to 8 bytes (Martin Lau) * bpf_prog_array_is_empty instead of bpf_prog_array_length (Martin Lau) * added [0,2] return code check to verifier (Martin Lau) * dropped unused buf[64] from the stack (Martin Lau) * use PTR_TO_SOCKET for bpf_sockopt->sk (Martin Lau) * dropped bpf_target_off from ctx rewrites (Martin Lau) * use return code for kernel bypass (Martin Lau & Andrii Nakryiko) Cc: Andrii Nakryiko Cc: Martin Lau Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov --- include/linux/bpf-cgroup.h | 45 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/bpf.h | 2 ++ include/linux/bpf_types.h | 1 + include/linux/filter.h | 10 ++++++++++ include/uapi/linux/bpf.h | 14 ++++++++++++++ 5 files changed, 72 insertions(+) (limited to 'include') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index bd79ae32909a..169fd25f6bc2 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -124,6 +124,14 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, loff_t *ppos, void **new_buf, enum bpf_attach_type type); +int __cgroup_bpf_run_filter_setsockopt(struct sock *sock, int *level, + int *optname, char __user *optval, + int *optlen, char **kernel_optval); +int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, + int optname, char __user *optval, + int __user *optlen, int max_optlen, + int retval); + static inline enum bpf_cgroup_storage_type cgroup_storage_type( struct bpf_map *map) { @@ -286,6 +294,38 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, __ret; \ }) +#define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \ + kernel_optval) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled) \ + __ret = __cgroup_bpf_run_filter_setsockopt(sock, level, \ + optname, optval, \ + optlen, \ + kernel_optval); \ + __ret; \ +}) + +#define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled) \ + get_user(__ret, optlen); \ + __ret; \ +}) + +#define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, optlen, \ + max_optlen, retval) \ +({ \ + int __ret = retval; \ + if (cgroup_bpf_enabled) \ + __ret = __cgroup_bpf_run_filter_getsockopt(sock, level, \ + optname, optval, \ + optlen, max_optlen, \ + retval); \ + __ret; \ +}) + int cgroup_bpf_prog_attach(const union bpf_attr *attr, enum bpf_prog_type ptype, struct bpf_prog *prog); int cgroup_bpf_prog_detach(const union bpf_attr *attr, @@ -357,6 +397,11 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos,nbuf) ({ 0; }) +#define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \ + optlen, max_optlen, retval) ({ retval; }) +#define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \ + kernel_optval) ({ 0; }) #define for_each_cgroup_storage_type(stype) for (; false; ) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a62e7889b0b6..18f4cc2c6acd 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -518,6 +518,7 @@ struct bpf_prog_array { struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags); void bpf_prog_array_free(struct bpf_prog_array *progs); int bpf_prog_array_length(struct bpf_prog_array *progs); +bool bpf_prog_array_is_empty(struct bpf_prog_array *array); int bpf_prog_array_copy_to_user(struct bpf_prog_array *progs, __u32 __user *prog_ids, u32 cnt); @@ -1051,6 +1052,7 @@ extern const struct bpf_func_proto bpf_spin_unlock_proto; extern const struct bpf_func_proto bpf_get_local_storage_proto; extern const struct bpf_func_proto bpf_strtol_proto; extern const struct bpf_func_proto bpf_strtoul_proto; +extern const struct bpf_func_proto bpf_tcp_sock_proto; /* Shared helpers among cBPF and eBPF. */ void bpf_user_rnd_init_once(void); diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 5a9975678d6f..eec5aeeeaf92 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -30,6 +30,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, raw_tracepoint_writable) #ifdef CONFIG_CGROUP_BPF BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev) BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SYSCTL, cg_sysctl) +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCKOPT, cg_sockopt) #endif #ifdef CONFIG_BPF_LIRC_MODE2 BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2) diff --git a/include/linux/filter.h b/include/linux/filter.h index 43b45d6db36d..340f7d648974 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1199,4 +1199,14 @@ struct bpf_sysctl_kern { u64 tmp_reg; }; +struct bpf_sockopt_kern { + struct sock *sk; + u8 *optval; + u8 *optval_end; + s32 level; + s32 optname; + s32 optlen; + s32 retval; +}; + #endif /* __LINUX_FILTER_H__ */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b077507efa3f..a396b516a2b2 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -170,6 +170,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_FLOW_DISSECTOR, BPF_PROG_TYPE_CGROUP_SYSCTL, BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, + BPF_PROG_TYPE_CGROUP_SOCKOPT, }; enum bpf_attach_type { @@ -194,6 +195,8 @@ enum bpf_attach_type { BPF_CGROUP_SYSCTL, BPF_CGROUP_UDP4_RECVMSG, BPF_CGROUP_UDP6_RECVMSG, + BPF_CGROUP_GETSOCKOPT, + BPF_CGROUP_SETSOCKOPT, __MAX_BPF_ATTACH_TYPE }; @@ -3541,4 +3544,15 @@ struct bpf_sysctl { */ }; +struct bpf_sockopt { + __bpf_md_ptr(struct bpf_sock *, sk); + __bpf_md_ptr(void *, optval); + __bpf_md_ptr(void *, optval_end); + + __s32 level; + __s32 optname; + __s32 optlen; + __s32 retval; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ -- cgit v1.2.3 From c8af5cd75e2411d5a5aacf115f59a5ff6b87f3fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Fri, 28 Jun 2019 11:12:34 +0200 Subject: xskmap: Move non-standard list manipulation to helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a helper in list.h for the non-standard way of clearing a list that is used in xskmap. This makes it easier to reuse it in the other map types, and also makes sure this usage is not forgotten in any list refactorings in the future. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann --- include/linux/list.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/list.h b/include/linux/list.h index e951228db4b2..85c92555e31f 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -106,6 +106,20 @@ static inline void __list_del(struct list_head * prev, struct list_head * next) WRITE_ONCE(prev->next, next); } +/* + * Delete a list entry and clear the 'prev' pointer. + * + * This is a special-purpose list clearing method used in the networking code + * for lists allocated as per-cpu, where we don't want to incur the extra + * WRITE_ONCE() overhead of a regular list_del_init(). The code that uses this + * needs to check the node 'prev' pointer instead of calling list_empty(). + */ +static inline void __list_del_clearprev(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + entry->prev = NULL; +} + /** * list_del - deletes entry from list. * @entry: the element to delete from the list. -- cgit v1.2.3 From 4b55cf290dc6bd3a9e5da26d1ad60e77aa88c8cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Fri, 28 Jun 2019 11:12:34 +0200 Subject: devmap: Rename ifindex member in bpf_redirect_info MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bpf_redirect_info struct has an 'ifindex' member which was named back when the redirects could only target egress interfaces. Now that we can also redirect to sockets and CPUs, this is a bit misleading, so rename the member to tgt_index. Reorder the struct members so we can have 'tgt_index' and 'tgt_value' next to each other in a subsequent patch. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann --- include/linux/filter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 340f7d648974..92bd192f7786 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -578,8 +578,8 @@ struct bpf_skb_data_end { }; struct bpf_redirect_info { - u32 ifindex; u32 flags; + u32 tgt_index; struct bpf_map *map; struct bpf_map *map_to_flush; u32 kern_flags; -- cgit v1.2.3 From 43e74c0267a35d6f5127218054b2d80c7fe801f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Fri, 28 Jun 2019 11:12:34 +0200 Subject: bpf_xdp_redirect_map: Perform map lookup in eBPF helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bpf_redirect_map() helper used by XDP programs doesn't return any indication of whether it can successfully redirect to the map index it was given. Instead, BPF programs have to track this themselves, leading to programs using duplicate maps to track which entries are populated in the devmap. This patch fixes this by moving the map lookup into the bpf_redirect_map() helper, which makes it possible to return failure to the eBPF program. The lower bits of the flags argument is used as the return code, which means that existing users who pass a '0' flag argument will get XDP_ABORTED. With this, a BPF program can check the return code from the helper call and react by, for instance, substituting a different redirect. This works for any type of map used for redirect. Signed-off-by: Toke Høiland-Jørgensen Acked-by: Jonathan Lemon Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- include/linux/filter.h | 1 + include/trace/events/xdp.h | 5 ++--- include/uapi/linux/bpf.h | 7 +++++-- 3 files changed, 8 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 92bd192f7786..1fe53e78c7e3 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -580,6 +580,7 @@ struct bpf_skb_data_end { struct bpf_redirect_info { u32 flags; u32 tgt_index; + void *tgt_value; struct bpf_map *map; struct bpf_map *map_to_flush; u32 kern_flags; diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 81e708c4b513..68899fdc985b 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -175,9 +175,8 @@ struct _bpf_dtab_netdev { #endif /* __DEVMAP_OBJ_TYPE */ #define devmap_ifindex(fwd, map) \ - (!fwd ? 0 : \ - ((map->map_type == BPF_MAP_TYPE_DEVMAP) ? \ - ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0)) + ((map->map_type == BPF_MAP_TYPE_DEVMAP) ? \ + ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0) #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx) \ trace_xdp_redirect_map(dev, xdp, devmap_ifindex(fwd, map), \ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a396b516a2b2..cffea1826a1f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1571,8 +1571,11 @@ union bpf_attr { * but this is only implemented for native XDP (with driver * support) as of this writing). * - * All values for *flags* are reserved for future usage, and must - * be left at zero. + * The lower two bits of *flags* are used as the return code if + * the map lookup fails. This is so that the return value can be + * one of the XDP program return codes up to XDP_TX, as chosen by + * the caller. Any higher bits in the *flags* argument must be + * unset. * * When used to redirect packets to net devices, this helper * provides a high performance increase over **bpf_redirect**\ (). -- cgit v1.2.3 From 23729ff23186424e54b4d6678fcd526cdacef4d3 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 2 Jul 2019 09:13:56 -0700 Subject: bpf: add BPF_CGROUP_SOCK_OPS callback that is executed on every RTT Performance impact should be minimal because it's under a new BPF_SOCK_OPS_RTT_CB_FLAG flag that has to be explicitly enabled. Suggested-by: Eric Dumazet Cc: Eric Dumazet Cc: Priyaranjan Jha Cc: Yuchung Cheng Cc: Soheil Hassas Yeganeh Acked-by: Soheil Hassas Yeganeh Acked-by: Yuchung Cheng Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- include/net/tcp.h | 8 ++++++++ include/uapi/linux/bpf.h | 6 +++++- 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 9d36cc88d043..e16d8a3fd3b4 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2221,6 +2221,14 @@ static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk) return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN, 0, NULL) == 1); } +static inline void tcp_bpf_rtt(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTT_CB_FLAG)) + tcp_call_bpf(sk, BPF_SOCK_OPS_RTT_CB, 0, NULL); +} + #if IS_ENABLED(CONFIG_SMC) extern struct static_key_false tcp_have_smc; #endif diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index cffea1826a1f..9cdd0aaeba06 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1770,6 +1770,7 @@ union bpf_attr { * * **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out) * * **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission) * * **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change) + * * **BPF_SOCK_OPS_RTT_CB_FLAG** (every RTT) * * Therefore, this function can be used to clear a callback flag by * setting the appropriate bit to zero. e.g. to disable the RTO @@ -3314,7 +3315,8 @@ struct bpf_sock_ops { #define BPF_SOCK_OPS_RTO_CB_FLAG (1<<0) #define BPF_SOCK_OPS_RETRANS_CB_FLAG (1<<1) #define BPF_SOCK_OPS_STATE_CB_FLAG (1<<2) -#define BPF_SOCK_OPS_ALL_CB_FLAGS 0x7 /* Mask of all currently +#define BPF_SOCK_OPS_RTT_CB_FLAG (1<<3) +#define BPF_SOCK_OPS_ALL_CB_FLAGS 0xF /* Mask of all currently * supported cb flags */ @@ -3369,6 +3371,8 @@ enum { BPF_SOCK_OPS_TCP_LISTEN_CB, /* Called on listen(2), right after * socket transition to LISTEN state. */ + BPF_SOCK_OPS_RTT_CB, /* Called on every RTT. + */ }; /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect -- cgit v1.2.3 From 0357746d1e40a8226f68a42c8d7222a12d7c451f Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 2 Jul 2019 09:13:58 -0700 Subject: bpf: add dsack_dups/delivered{, _ce} to bpf_tcp_sock Add more fields to bpf_tcp_sock that might be useful for debugging congestion control issues. Cc: Eric Dumazet Cc: Priyaranjan Jha Cc: Yuchung Cheng Cc: Soheil Hassas Yeganeh Acked-by: Soheil Hassas Yeganeh Acked-by: Yuchung Cheng Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 9cdd0aaeba06..bfb0b1a76684 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3073,6 +3073,11 @@ struct bpf_tcp_sock { * sum(delta(snd_una)), or how many bytes * were acked. */ + __u32 dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups + * total number of DSACK blocks received + */ + __u32 delivered; /* Total data packets delivered incl. rexmits */ + __u32 delivered_ce; /* Like the above but only ECE marked packets */ }; struct bpf_sock_tuple { -- cgit v1.2.3 From c2cb5e82a720c05b707701c75dfeb356fe184787 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 2 Jul 2019 09:13:59 -0700 Subject: bpf: add icsk_retransmits to bpf_tcp_sock Add some inet_connection_sock fields to bpf_tcp_sock that might be useful for debugging congestion control issues. Cc: Eric Dumazet Cc: Priyaranjan Jha Cc: Yuchung Cheng Cc: Soheil Hassas Yeganeh Acked-by: Soheil Hassas Yeganeh Acked-by: Yuchung Cheng Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bfb0b1a76684..ead27aebf491 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3078,6 +3078,7 @@ struct bpf_tcp_sock { */ __u32 delivered; /* Total data packets delivered incl. rexmits */ __u32 delivered_ce; /* Like the above but only ECE marked packets */ + __u32 icsk_retransmits; /* Number of unrecovered [RTO] timeouts */ }; struct bpf_sock_tuple { -- cgit v1.2.3