From 6b0a249a301e2af9adda84adbced3a2988248b95 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 21 Aug 2020 11:44:18 -0700 Subject: bpf: Implement link_query for bpf iterators This patch implements the bpf_link callback functions show_fdinfo and fill_link_info to support the link_query interface. The general interface for show_fdinfo and fill_link_info will print/fill the target_name. Each target can register show_fdinfo and fill_link_info callbacks to print/fill more target-specific information. For example, the below is an fdinfo result for a bpf task iterator. $ cat /proc/1749/fdinfo/7 pos: 0 flags: 02000000 mnt_id: 14 link_type: iter link_id: 11 prog_tag: 990e1f8152f7e54f prog_id: 59 target_name: task Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200821184418.574122-1-yhs@fb.com --- tools/include/uapi/linux/bpf.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 0480f893facd..a1bbaff7a0af 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4071,6 +4071,13 @@ struct bpf_link_info { __u64 cgroup_id; __u32 attach_type; } cgroup; + struct { + __aligned_u64 target_name; /* in/out: target_name buffer ptr */ + __u32 target_name_len; /* in/out: target_name buffer len */ + union { + __u32 map_id; + } map; + } iter; struct { __u32 netns_ino; __u32 attach_type; -- cgit v1.2.3 From 2b8ee4f05d4f6a6c427ad30dd6c1bb49eb2efd3b Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 20 Aug 2020 12:00:21 -0700 Subject: tcp: bpf: Add TCP_BPF_DELACK_MAX setsockopt This change is mostly from an internal patch and adapts it from a sysctl config to the bpf_setsockopt setup. The bpf_prog can set the max delay ack by using bpf_setsockopt(TCP_BPF_DELACK_MAX). This max delay ack can be communicated to its peer through a bpf header option. The receiving peer can then use this max delay ack and set a potentially lower rto by using bpf_setsockopt(TCP_BPF_RTO_MIN), which will be introduced in the next patch. Another later selftest patch will also use it like the above to show how to write and parse a bpf tcp header option. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Reviewed-by: Eric Dumazet Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200820190021.2884000-1-kafai@fb.com --- tools/include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index a1bbaff7a0af..7b905cb0213e 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4257,6 +4257,7 @@ enum { TCP_BPF_IW = 1001, /* Set TCP initial congestion window */ TCP_BPF_SNDCWND_CLAMP = 1002, /* Set sndcwnd_clamp */ + TCP_BPF_DELACK_MAX = 1003, /* Max delay ack in usecs */ }; struct bpf_perf_event_value { -- cgit v1.2.3 From ca584ba070864c606f3a54faaafe774726d5b4a1 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 20 Aug 2020 12:00:27 -0700 Subject: tcp: bpf: Add TCP_BPF_RTO_MIN for bpf_setsockopt This patch adds bpf_setsockopt(TCP_BPF_RTO_MIN) to allow a bpf prog to set the min rto of a connection. It could be used together with the earlier patch which added bpf_setsockopt(TCP_BPF_DELACK_MAX). A later selftest patch will communicate the max delay ack in a bpf tcp header option and then the receiving side can use bpf_setsockopt(TCP_BPF_RTO_MIN) to set a shorter rto.
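As a rough illustration of how a sock_ops prog might combine the two options (a sketch only, not from this series' selftests: the callback choices and usec values are made up, and it assumes <linux/bpf.h>, <bpf/bpf_helpers.h>, plus a hand definition of SOL_TCP (6) for the bpf prog):

	SEC("sockops")
	int set_delack_and_rto(struct bpf_sock_ops *skops)
	{
		int delack_max = 30000;	/* usecs, made-up value */
		int rto_min = 20000;	/* usecs, made-up value */

		switch (skops->op) {
		case BPF_SOCK_OPS_TCP_CONNECT_CB:
			/* Advertise our max delayed ack early; a bpf
			 * header option could carry it to the peer.
			 */
			bpf_setsockopt(skops, SOL_TCP, TCP_BPF_DELACK_MAX,
				       &delack_max, sizeof(delack_max));
			break;
		case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
			/* Use a shorter min rto, e.g. derived from the
			 * max delay ack the peer advertised.
			 */
			bpf_setsockopt(skops, SOL_TCP, TCP_BPF_RTO_MIN,
				       &rto_min, sizeof(rto_min));
			break;
		}
		return 1;
	}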
Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Reviewed-by: Eric Dumazet Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200820190027.2884170-1-kafai@fb.com --- tools/include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 7b905cb0213e..1ae20058b574 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4258,6 +4258,7 @@ enum { TCP_BPF_IW = 1001, /* Set TCP initial congestion window */ TCP_BPF_SNDCWND_CLAMP = 1002, /* Set sndcwnd_clamp */ TCP_BPF_DELACK_MAX = 1003, /* Max delay ack in usecs */ + TCP_BPF_RTO_MIN = 1004, /* Min delay ack in usecs */ }; struct bpf_perf_event_value { -- cgit v1.2.3 From 00d211a4ea6f48e8e3b758813fe23ad28193d3bf Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 20 Aug 2020 12:00:46 -0700 Subject: bpf: tcp: Add bpf_skops_parse_hdr() The patch adds a function bpf_skops_parse_hdr(). It will call the bpf prog to parse the TCP header received at a tcp_sock that has at least reached the ESTABLISHED state. For the packets received during the 3WHS (SYN, SYNACK and ACK), the received skb will be available to the bpf prog during the callback in bpf_skops_established() introduced in the previous patch and in the bpf_skops_write_hdr_opt() that will be added in the next patch. Calling the bpf prog to parse the header is controlled by two new flags in tp->bpf_sock_ops_cb_flags: BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG and BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG. When BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG is set, the bpf prog will only be called when there is an unknown option in the TCP header. When BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG is set, the bpf prog will be called on all received TCP headers. This function is half implemented to highlight the changes in the TCP stack. The actual code preparing the bpf running context and invoking the bpf prog will be added in a later patch with other necessary bpf pieces. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/bpf/20200820190046.2885054-1-kafai@fb.com --- tools/include/uapi/linux/bpf.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 1ae20058b574..010ed2abcb66 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4173,8 +4173,10 @@ enum { BPF_SOCK_OPS_RETRANS_CB_FLAG = (1<<1), BPF_SOCK_OPS_STATE_CB_FLAG = (1<<2), BPF_SOCK_OPS_RTT_CB_FLAG = (1<<3), + BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG = (1<<4), + BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG = (1<<5), /* Mask of all currently supported cb flags */ - BPF_SOCK_OPS_ALL_CB_FLAGS = 0xF, + BPF_SOCK_OPS_ALL_CB_FLAGS = 0x3F, }; /* List of known BPF sock_ops operators. -- cgit v1.2.3 From 331fca4315efa3bbd258fbdf8209d59d253c0480 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 20 Aug 2020 12:00:52 -0700 Subject: bpf: tcp: Add bpf_skops_hdr_opt_len() and bpf_skops_write_hdr_opt() The bpf prog needs to parse the SYN header to learn what options have been sent by the peer's bpf-prog before writing its options into the SYNACK. This patch adds a "syn_skb" arg to tcp_make_synack() and send_synack(). This syn_skb will eventually be made available (as read-only) to the bpf prog. This will be the only SYN packet available to the bpf prog during syncookie.
For other regular cases, the bpf prog can also use the saved_syn. When writing options, the bpf prog will first be called to tell the kernel its required number of bytes. This is done by the new bpf_skops_hdr_opt_len(). The bpf prog will only be called when the new BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG is set in tp->bpf_sock_ops_cb_flags. When the bpf prog returns, the kernel will know how many bytes are needed and then update the "*remaining" arg accordingly. 4-byte alignment will be included in "*remaining" before this function returns. The 4-byte-aligned number of bytes will also be stored in opts->bpf_opt_len. "bpf_opt_len" is a newly added member of the struct tcp_out_options. Then the new bpf_skops_write_hdr_opt() will call the bpf prog to write the header options. The bpf prog is only called if it has reserved space before (opts->bpf_opt_len > 0). The bpf prog is the last one getting a chance to reserve header space and write the header option. These two functions are half implemented to highlight the changes in the TCP stack. The actual code preparing the bpf running context and invoking the bpf prog will be added in a later patch with other necessary bpf pieces. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/bpf/20200820190052.2885316-1-kafai@fb.com --- tools/include/uapi/linux/bpf.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 010ed2abcb66..18d0e128bc3c 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4175,8 +4175,9 @@ enum { BPF_SOCK_OPS_RTT_CB_FLAG = (1<<3), BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG = (1<<4), BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG = (1<<5), + BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG = (1<<6), /* Mask of all currently supported cb flags */ - BPF_SOCK_OPS_ALL_CB_FLAGS = 0x3F, + BPF_SOCK_OPS_ALL_CB_FLAGS = 0x7F, }; /* List of known BPF sock_ops operators. -- cgit v1.2.3 From 0813a841566f0962a5551be7749b43c45f0022a0 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 20 Aug 2020 12:01:04 -0700 Subject: bpf: tcp: Allow bpf prog to write and parse TCP header option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Note: The TCP changes here are mainly to implement the bpf pieces into the bpf_skops_*() functions introduced in the earlier patches. ] The earlier effort in BPF-TCP-CC allows the TCP Congestion Control algorithm to be written in BPF. It opens up opportunities to allow a faster turnaround time in testing/releasing new congestion control ideas to a production environment. The same flexibility can be extended to writing TCP header options. It is not uncommon that people want to test a new TCP header option to improve TCP performance. Another use case is for data-centers that have a more controlled environment and more flexibility in putting header options for internal-only use. For example, we want to test the idea of putting a maximum delay ACK in a TCP header option, which is similar to a draft RFC proposal [1]. This patch introduces the necessary BPF API and uses it in the TCP stack to allow BPF_PROG_TYPE_SOCK_OPS programs to parse and write TCP header options. It currently supports most TCP packets except RST. Supported TCP header option: ─────────────────────────── This patch allows the bpf-prog to write any option kind.
Different bpf-progs can write their own options by calling the new helper bpf_store_hdr_opt(). The helper will ensure there is no duplicated option in the header. By allowing the bpf-prog to write any option kind, this gives a lot of flexibility to the bpf-prog. Different bpf-progs can write their own option kinds. It could also allow the bpf-prog to support a recently standardized option on an older kernel. Sockops Callback Flags: ────────────────────── The bpf program will only be called to parse/write tcp header options if the following newly added callback flags are enabled in tp->bpf_sock_ops_cb_flags: BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG A few words on the PARSE CB flags. When the above PARSE CB flags are turned on, the bpf-prog will be called on packets received at a sk that has at least reached the ESTABLISHED state. The parsing of the SYN-SYNACK-ACK will be discussed in the "3 Way HandShake" section. The default is off for all of the above new CB flags, i.e. the bpf prog will not be called to parse or write bpf hdr options. There are detailed comments on these new cb flags in the UAPI bpf.h. sock_ops->skb_data and bpf_load_hdr_opt() ───────────────────────────────────────── sock_ops->skb_data and sock_ops->skb_data_end cover the whole TCP header and its options. They are read-only. The new bpf_load_hdr_opt() helps to read a particular option "kind" from the skb_data. Please refer to the comment in UAPI bpf.h. It has details on what skb_data contains under different sock_ops->op. 3 Way HandShake ─────────────── The bpf-prog can learn if it is sending a SYN or SYNACK by reading the sock_ops->skb_tcp_flags. * Passive side When writing a SYNACK (i.e. sock_ops->op == BPF_SOCK_OPS_WRITE_HDR_OPT_CB), the received SYN skb will be available to the bpf prog. The bpf prog can use the SYN skb (which may carry the header option sent from the remote bpf prog) to decide what bpf header option should be written to the outgoing SYNACK skb. The SYN packet can be obtained by getsockopt(TCP_BPF_SYN*). More on this later. Also, the bpf prog can learn if it is in syncookie mode (by checking sock_ops->args[0] == BPF_WRITE_HDR_TCP_SYNACK_COOKIE). The bpf prog can store the received SYN pkt by using the existing bpf_setsockopt(TCP_SAVE_SYN). The example in a later patch does it. [ Note that the fullsock here is a listen sk, bpf_sk_storage is not very useful here since the listen sk will be shared by many concurrent connection requests. Extending bpf_sk_storage support to request_sock will add weight to the minisock and it is not necessarily better than storing the whole ~100 bytes SYN pkt. ] When the connection is established, the bpf prog will be called in the existing PASSIVE_ESTABLISHED_CB callback. At that time, the bpf prog can get the header option from the saved syn and then apply the needed operation to the newly established socket. As an example, a later patch will use the max delay ack specified in the SYN header and set the RTO of this newly established connection. The received ACK (that concludes the 3WHS) will also be available to the bpf prog during PASSIVE_ESTABLISHED_CB through the sock_ops->skb_data. It could be useful in the syncookie scenario. More on this later. There is an existing getsockopt "TCP_SAVED_SYN" to return the whole saved syn pkt, which includes the IP[46] header and the TCP header. A few "TCP_BPF_SYN*" getsockopt options have been added to allow specifying where to start getting from, e.g.
starting from the TCP header, or from the IP[46] header. The new getsockopt(TCP_BPF_SYN*) will also know where it can get the SYN packet from: - (a) the just-received syn (available when the bpf prog is writing the SYNACK) and it is the only way to get the SYN during syncookie mode. or - (b) the saved syn (available in PASSIVE_ESTABLISHED_CB and also other existing CBs). The bpf prog does not need to know where the SYN pkt is coming from. The getsockopt(TCP_BPF_SYN*) will hide these details. Similarly, a flag "BPF_LOAD_HDR_OPT_TCP_SYN" is also added to bpf_load_hdr_opt() to read a particular header option from the SYN packet. * Fastopen Fastopen should work the same as the regular non-fastopen case. This is tested in a later patch. * Syncookie For syncookie, the later example patch asks the active side's bpf prog to resend the header options in the ACK. The server can use bpf_load_hdr_opt() to look at the options in this received ACK during PASSIVE_ESTABLISHED_CB. * Active side The bpf prog will get a chance to write the bpf header option in the SYN packet during WRITE_HDR_OPT_CB. The received SYNACK pkt will also be available to the bpf prog during the existing ACTIVE_ESTABLISHED_CB callback through the sock_ops->skb_data and bpf_load_hdr_opt(). * Turn off header CB flags after 3WHS If the bpf prog does not need to write/parse header options beyond the 3WHS, the bpf prog can clear the bpf_sock_ops_cb_flags to avoid being called for header options. Or the bpf-prog can choose to leave the UNKNOWN_HDR_OPT_CB_FLAG on so that the kernel will only call it when there is an option that the kernel cannot handle. [1]: draft-wang-tcpm-low-latency-opt-00 https://tools.ietf.org/html/draft-wang-tcpm-low-latency-opt-00 Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200820190104.2885895-1-kafai@fb.com --- tools/include/uapi/linux/bpf.h | 300 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 299 insertions(+), 1 deletion(-) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 18d0e128bc3c..f67ec5d9e57d 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3395,6 +3395,120 @@ union bpf_attr { * A non-negative value equal to or less than *size* on success, * or a negative error in case of failure. * + * long bpf_load_hdr_opt(struct bpf_sock_ops *skops, void *searchby_res, u32 len, u64 flags) + * Description + * Load header option. Support reading a particular TCP header + * option for bpf program (BPF_PROG_TYPE_SOCK_OPS). + * + * If *flags* is 0, it will search the option from the + * sock_ops->skb_data. The comment in "struct bpf_sock_ops" + * has details on what skb_data contains under different + * sock_ops->op. + * + * The first byte of the *searchby_res* specifies the + * kind that it wants to search. + * + * If the searching kind is an experimental kind + * (i.e. 253 or 254 according to RFC6994). It also + * needs to specify the "magic" which is either + * 2 bytes or 4 bytes. It then also needs to + * specify the size of the magic by using + * the 2nd byte which is "kind-length" of a TCP + * header option and the "kind-length" also + * includes the first 2 bytes "kind" and "kind-length" + * itself as a normal TCP header option also does. + * + * For example, to search experimental kind 254 with + * 2 byte magic 0xeB9F, the searchby_res should be + * [ 254, 4, 0xeB, 0x9F, 0, 0, .... 0 ].
+ * + * To search for the standard window scale option (3), + * the searchby_res should be [ 3, 0, 0, .... 0 ]. + * Note, kind-length must be 0 for regular option. + * + * Searching for No-Op (0) and End-of-Option-List (1) are + * not supported. + * + * *len* must be at least 2 bytes which is the minimal size + * of a header option. + * + * Supported flags: + * * **BPF_LOAD_HDR_OPT_TCP_SYN** to search from the + * saved_syn packet or the just-received syn packet. + * + * Return + * >0 when found, the header option is copied to *searchby_res*. + * The return value is the total length copied. + * + * **-EINVAL** If param is invalid + * + * **-ENOMSG** The option is not found + * + * **-ENOENT** No syn packet available when + * **BPF_LOAD_HDR_OPT_TCP_SYN** is used + * + * **-ENOSPC** Not enough space. Only *len* number of + * bytes are copied. + * + * **-EFAULT** Cannot parse the header options in the packet + * + * **-EPERM** This helper cannot be used under the + * current sock_ops->op. + * + * long bpf_store_hdr_opt(struct bpf_sock_ops *skops, const void *from, u32 len, u64 flags) + * Description + * Store header option. The data will be copied + * from buffer *from* with length *len* to the TCP header. + * + * The buffer *from* should have the whole option that + * includes the kind, kind-length, and the actual + * option data. The *len* must be at least kind-length + * long. The kind-length does not have to be 4 byte + * aligned. The kernel will take care of the padding + * and setting the 4 bytes aligned value to th->doff. + * + * This helper will check for duplicated option + * by searching the same option in the outgoing skb. + * + * This helper can only be called during + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * + * Return + * 0 on success, or negative error in case of failure: + * + * **-EINVAL** If param is invalid + * + * **-ENOSPC** Not enough space in the header. + * Nothing has been written + * + * **-EEXIST** The option has already existed + * + * **-EFAULT** Cannot parse the existing header options + * + * **-EPERM** This helper cannot be used under the + * current sock_ops->op. + * + * long bpf_reserve_hdr_opt(struct bpf_sock_ops *skops, u32 len, u64 flags) + * Description + * Reserve *len* bytes for the bpf header option. The + * space will be used by bpf_store_hdr_opt() later in + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * + * If bpf_reserve_hdr_opt() is called multiple times, + * the total number of bytes will be reserved. + * + * This helper can only be called during + * BPF_SOCK_OPS_HDR_OPT_LEN_CB. + * + * Return + * 0 on success, or negative error in case of failure: + * + * **-EINVAL** if param is invalid + * + * **-ENOSPC** Not enough space in the header. + * + * **-EPERM** This helper cannot be used under the + * current sock_ops->op. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3539,6 +3653,9 @@ union bpf_attr { FN(skc_to_tcp_request_sock), \ FN(skc_to_udp6_sock), \ FN(get_task_stack), \ + FN(load_hdr_opt), \ + FN(store_hdr_opt), \ + FN(reserve_hdr_opt), /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -4165,6 +4282,36 @@ struct bpf_sock_ops { __u64 bytes_received; __u64 bytes_acked; __bpf_md_ptr(struct bpf_sock *, sk); + /* [skb_data, skb_data_end) covers the whole TCP header. + * + * BPF_SOCK_OPS_PARSE_HDR_OPT_CB: The packet received + * BPF_SOCK_OPS_HDR_OPT_LEN_CB: Not useful because the + * header has not been written. + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB: The header and options have + * been written so far. 
+ * BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: The SYNACK that concludes + * the 3WHS. + * BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: The ACK that concludes + * the 3WHS. + * + * bpf_load_hdr_opt() can also be used to read a particular option. + */ + __bpf_md_ptr(void *, skb_data); + __bpf_md_ptr(void *, skb_data_end); + __u32 skb_len; /* The total length of a packet. + * It includes the header, options, + * and payload. + */ + __u32 skb_tcp_flags; /* tcp_flags of the header. It provides + * an easy way to check for tcp_flags + * without parsing skb_data. + * + * In particular, the skb_tcp_flags + * will still be available in + * BPF_SOCK_OPS_HDR_OPT_LEN even though + * the outgoing header has not + * been written yet. + */ }; /* Definitions for bpf_sock_ops_cb_flags */ @@ -4173,8 +4320,48 @@ enum { BPF_SOCK_OPS_RETRANS_CB_FLAG = (1<<1), BPF_SOCK_OPS_STATE_CB_FLAG = (1<<2), BPF_SOCK_OPS_RTT_CB_FLAG = (1<<3), - BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG = (1<<4), + /* Call bpf for all received TCP headers. The bpf prog will be + * called under sock_ops->op == BPF_SOCK_OPS_PARSE_HDR_OPT_CB + * + * Please refer to the comment in BPF_SOCK_OPS_PARSE_HDR_OPT_CB + * for the header option related helpers that will be useful + * to the bpf programs. + * + * It could be used at the client/active side (i.e. connect() side) + * when the server told it that the server was in syncookie + * mode and required the active side to resend the bpf-written + * options. The active side can keep writing the bpf-options until + * it received a valid packet from the server side to confirm + * the earlier packet (and options) has been received. The later + * example patch is using it like this at the active side when the + * server is in syncookie mode. + * + * The bpf prog will usually turn this off in the common cases. + */ + BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG = (1<<4), + /* Call bpf when kernel has received a header option that + * the kernel cannot handle. The bpf prog will be called under + * sock_ops->op == BPF_SOCK_OPS_PARSE_HDR_OPT_CB. + * + * Please refer to the comment in BPF_SOCK_OPS_PARSE_HDR_OPT_CB + * for the header option related helpers that will be useful + * to the bpf programs. + */ BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG = (1<<5), + /* Call bpf when the kernel is writing header options for the + * outgoing packet. The bpf prog will first be called + * to reserve space in a skb under + * sock_ops->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB. Then + * the bpf prog will be called to write the header option(s) + * under sock_ops->op == BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * + * Please refer to the comment in BPF_SOCK_OPS_HDR_OPT_LEN_CB + * and BPF_SOCK_OPS_WRITE_HDR_OPT_CB for the header option + * related helpers that will be useful to the bpf programs. + * + * The kernel gets its chance to reserve space and write + * options first before the BPF program does. + */ BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG = (1<<6), /* Mask of all currently supported cb flags */ BPF_SOCK_OPS_ALL_CB_FLAGS = 0x7F, @@ -4233,6 +4420,63 @@ enum { */ BPF_SOCK_OPS_RTT_CB, /* Called on every RTT. */ + BPF_SOCK_OPS_PARSE_HDR_OPT_CB, /* Parse the header option. + * It will be called to handle + * the packets received at + * an already established + * connection. + * + * sock_ops->skb_data: + * Referring to the received skb. + * It covers the TCP header only. + * + * bpf_load_hdr_opt() can also + * be used to search for a + * particular option. 
+ */ + BPF_SOCK_OPS_HDR_OPT_LEN_CB, /* Reserve space for writing the + * header option later in + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * Arg1: bool want_cookie. (in + * writing SYNACK only) + * + * sock_ops->skb_data: + * Not available because no header has + * been written yet. + * + * sock_ops->skb_tcp_flags: + * The tcp_flags of the + * outgoing skb. (e.g. SYN, ACK, FIN). + * + * bpf_reserve_hdr_opt() should + * be used to reserve space. + */ + BPF_SOCK_OPS_WRITE_HDR_OPT_CB, /* Write the header options + * Arg1: bool want_cookie. (in + * writing SYNACK only) + * + * sock_ops->skb_data: + * Referring to the outgoing skb. + * It covers the TCP header + * that has already been written + * by the kernel and the + * earlier bpf-progs. + * + * sock_ops->skb_tcp_flags: + * The tcp_flags of the outgoing + * skb. (e.g. SYN, ACK, FIN). + * + * bpf_store_hdr_opt() should + * be used to write the + * option. + * + * bpf_load_hdr_opt() can also + * be used to search for a + * particular option that + * has already been written + * by the kernel or the + * earlier bpf-progs. + */ }; /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect @@ -4262,6 +4506,60 @@ enum { TCP_BPF_SNDCWND_CLAMP = 1002, /* Set sndcwnd_clamp */ TCP_BPF_DELACK_MAX = 1003, /* Max delay ack in usecs */ TCP_BPF_RTO_MIN = 1004, /* Min delay ack in usecs */ + /* Copy the SYN pkt to optval + * + * BPF_PROG_TYPE_SOCK_OPS only. It is similar to the + * bpf_getsockopt(TCP_SAVED_SYN) but it does not limit + * to only getting from the saved_syn. It can either get the + * syn packet from: + * + * 1. the just-received SYN packet (only available when writing the + * SYNACK). It will be useful when it is not necessary to + * save the SYN packet for latter use. It is also the only way + * to get the SYN during syncookie mode because the syn + * packet cannot be saved during syncookie. + * + * OR + * + * 2. the earlier saved syn which was done by + * bpf_setsockopt(TCP_SAVE_SYN). + * + * The bpf_getsockopt(TCP_BPF_SYN*) option will hide where the + * SYN packet is obtained. + * + * If the bpf-prog does not need the IP[46] header, the + * bpf-prog can avoid parsing the IP header by using + * TCP_BPF_SYN. Otherwise, the bpf-prog can get both + * IP[46] and TCP header by using TCP_BPF_SYN_IP. + * + * >0: Total number of bytes copied + * -ENOSPC: Not enough space in optval. Only optlen number of + * bytes is copied. + * -ENOENT: The SYN skb is not available now and the earlier SYN pkt + * is not saved by setsockopt(TCP_SAVE_SYN). + */ + TCP_BPF_SYN = 1005, /* Copy the TCP header */ + TCP_BPF_SYN_IP = 1006, /* Copy the IP[46] and TCP header */ +}; + +enum { + BPF_LOAD_HDR_OPT_TCP_SYN = (1ULL << 0), +}; + +/* args[0] value during BPF_SOCK_OPS_HDR_OPT_LEN_CB and + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + */ +enum { + BPF_WRITE_HDR_TCP_CURRENT_MSS = 1, /* Kernel is finding the + * total option spaces + * required for an established + * sk in order to calculate the + * MSS. No skb is actually + * sent. + */ + BPF_WRITE_HDR_TCP_SYNACK_COOKIE = 2, /* Kernel is in syncookie mode + * when sending a SYN. + */ }; struct bpf_perf_event_value { -- cgit v1.2.3 From 267cf9fa43d1c9d525d5d818a8651f2900e3aa9e Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 20 Aug 2020 12:01:23 -0700 Subject: tcp: bpf: Optionally store mac header in TCP_SAVE_SYN This patch is adapted from Eric's patch in an earlier discussion [1]. The TCP_SAVE_SYN currently only stores the network header and tcp header. 
This patch allows it to optionally store the mac header also if the setsockopt's optval is 2. It requires one more bit for the "save_syn" bit field in tcp_sock. This patch achieves this by moving the syn_smc bit next to the is_mptcp. The syn_smc is currently used with the TCP experimental option. Since syn_smc is only used when CONFIG_SMC is enabled, this patch also puts the "IS_ENABLED(CONFIG_SMC)" around it like the is_mptcp did with "IS_ENABLED(CONFIG_MPTCP)". The mac_hdrlen is also stored in the "struct saved_syn" to allow a quick offset from the bpf prog if it chooses to start getting from the network header or the tcp header. [1]: https://lore.kernel.org/netdev/CANn89iLJNWh6bkH7DNhy_kmcAexuUCccqERqe7z2QsvPhGrYPQ@mail.gmail.com/ Suggested-by: Eric Dumazet Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/bpf/20200820190123.2886935-1-kafai@fb.com --- tools/include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f67ec5d9e57d..544b89a64918 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4540,6 +4540,7 @@ enum { */ TCP_BPF_SYN = 1005, /* Copy the TCP header */ TCP_BPF_SYN_IP = 1006, /* Copy the IP[46] and TCP header */ + TCP_BPF_SYN_MAC = 1007, /* Copy the MAC, IP[46], and TCP header */ }; enum { -- cgit v1.2.3 From f836a56e84ffc9f1a1cd73f77e10404ca46a4616 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Tue, 25 Aug 2020 20:29:15 +0200 Subject: bpf: Generalize bpf_sk_storage Refactor the functionality in bpf_sk_storage.c so that concept of storage linked to kernel objects can be extended to other objects like inode, task_struct etc. Each new local storage will still be a separate map and provide its own set of helpers. This allows for future object specific extensions and still share a lot of the underlying implementation. This includes the changes suggested by Martin in: https://lore.kernel.org/bpf/20200725013047.4006241-1-kafai@fb.com/ adding new map operations to support bpf_local_storage maps: * storages for different kernel objects to optionally have different memory charging strategy (map_local_storage_charge, map_local_storage_uncharge) * Functionality to extract the storage pointer from a pointer to the owning object (map_owner_storage_ptr) Co-developed-by: Martin KaFai Lau Signed-off-by: Martin KaFai Lau Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200825182919.1118197-4-kpsingh@chromium.org --- tools/include/uapi/linux/bpf.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 544b89a64918..2cbd137eed86 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3765,9 +3765,13 @@ enum { BPF_F_SYSCTL_BASE_NAME = (1ULL << 0), }; -/* BPF_FUNC_sk_storage_get flags */ +/* BPF_FUNC__storage_get flags */ enum { - BPF_SK_STORAGE_GET_F_CREATE = (1ULL << 0), + BPF_LOCAL_STORAGE_GET_F_CREATE = (1ULL << 0), + /* BPF_SK_STORAGE_GET_F_CREATE is only kept for backward compatibility + * and BPF_LOCAL_STORAGE_GET_F_CREATE must be used instead. + */ + BPF_SK_STORAGE_GET_F_CREATE = BPF_LOCAL_STORAGE_GET_F_CREATE, }; /* BPF_FUNC_read_branch_records flags. 
*/ -- cgit v1.2.3 From 8ea636848aca35b9f97c5b5dee30225cf2dd0fe6 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Tue, 25 Aug 2020 20:29:17 +0200 Subject: bpf: Implement bpf_local_storage for inodes Similar to bpf_local_storage for sockets, add local storage for inodes. The life-cycle of storage is managed with the life-cycle of the inode. i.e. the storage is destroyed along with the owning inode. The BPF LSM allocates an __rcu pointer to the bpf_local_storage in the security blob which are now stackable and can co-exist with other LSMs. Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200825182919.1118197-6-kpsingh@chromium.org --- tools/include/uapi/linux/bpf.h | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 2cbd137eed86..b6bfcd085a76 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -155,6 +155,7 @@ enum bpf_map_type { BPF_MAP_TYPE_DEVMAP_HASH, BPF_MAP_TYPE_STRUCT_OPS, BPF_MAP_TYPE_RINGBUF, + BPF_MAP_TYPE_INODE_STORAGE, }; /* Note that tracing related programs such as @@ -3509,6 +3510,41 @@ union bpf_attr { * * **-EPERM** This helper cannot be used under the * current sock_ops->op. + * void *bpf_inode_storage_get(struct bpf_map *map, void *inode, void *value, u64 flags) + * Description + * Get a bpf_local_storage from an *inode*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *inode* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *inode*) except this + * helper enforces the key must be an inode and the map must also + * be a **BPF_MAP_TYPE_INODE_STORAGE**. + * + * Underneath, the value is stored locally at *inode* instead of + * the *map*. The *map* is used as the bpf-local-storage + * "type". The bpf-local-storage "type" (i.e. the *map*) is + * searched against all bpf_local_storage residing at *inode*. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * Return + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + * + * int bpf_inode_storage_delete(struct bpf_map *map, void *inode) + * Description + * Delete a bpf_local_storage from an *inode*. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3655,7 +3691,9 @@ union bpf_attr { FN(get_task_stack), \ FN(load_hdr_opt), \ FN(store_hdr_opt), \ - FN(reserve_hdr_opt), + FN(reserve_hdr_opt), \ + FN(inode_storage_get), \ + FN(inode_storage_delete), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 30897832d8b97e93833fb52c0a02951db3692ed2 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Tue, 25 Aug 2020 20:29:18 +0200 Subject: bpf: Allow local storage to be used from LSM programs Adds support for both bpf_{sk, inode}_storage_{get, delete} to be used in LSM programs. 
These helpers are not used for tracing programs (currently) as their usage is tied to the life-cycle of the object and should only be used where the owning object won't be freed (when the owning object is passed as an argument to the LSM hook). Thus, they are safer to use in LSM hooks than in tracing. Usage of local storage in tracing programs will probably follow a per-function based whitelist approach. Since the UAPI helper signature for bpf_sk_storage expects a bpf_sock, which leads to a compilation warning for LSM programs, it is also updated to accept a void * pointer instead. Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200825182919.1118197-7-kpsingh@chromium.org --- tools/include/uapi/linux/bpf.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b6bfcd085a76..0e1cdf806fe1 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2808,7 +2808,7 @@ union bpf_attr { * * **-ERANGE** if resulting value was out of range. * - * void *bpf_sk_storage_get(struct bpf_map *map, struct bpf_sock *sk, void *value, u64 flags) + * void *bpf_sk_storage_get(struct bpf_map *map, void *sk, void *value, u64 flags) * Description * Get a bpf-local-storage from a *sk*. * @@ -2824,6 +2824,9 @@ union bpf_attr { * "type". The bpf-local-storage "type" (i.e. the *map*) is * searched against all bpf-local-storages residing at *sk*. * + * *sk* is a kernel **struct sock** pointer for LSM program. + * *sk* is a **struct bpf_sock** pointer for other program types. + * * An optional *flags* (**BPF_SK_STORAGE_GET_F_CREATE**) can be * used such that a new bpf-local-storage will be * created if one does not exist. *value* can be used @@ -2836,7 +2839,7 @@ union bpf_attr { * **NULL** if not found or there was an error in adding * a new bpf-local-storage. * - * long bpf_sk_storage_delete(struct bpf_map *map, struct bpf_sock *sk) + * long bpf_sk_storage_delete(struct bpf_map *map, void *sk) * Description * Delete a bpf-local-storage from a *sk*. * Return -- cgit v1.2.3 From eae2e83e62633a2659e3bc690facba1c2fc9c45b Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 25 Aug 2020 21:21:19 +0200 Subject: bpf: Add BTF_SET_START/END macros Adding support to define a sorted set of BTF ID values. The following defines a sorted set of BTF ID values: BTF_SET_START(btf_allowlist_d_path) BTF_ID(func, vfs_truncate) BTF_ID(func, vfs_fallocate) BTF_ID(func, dentry_open) BTF_ID(func, vfs_getattr) BTF_ID(func, filp_close) BTF_SET_END(btf_allowlist_d_path) It defines the following 'struct btf_id_set' variable to access values and count: struct btf_id_set btf_allowlist_d_path; Adding an 'allowed' callback to struct bpf_func_proto, to allow the verifier to check for allowed callers. Adding the btf_id_set_contains function, which will be used by allowed callbacks to verify that the caller's BTF ID value is within the allowed set. Also removing the extra '\' in the __BTF_ID_LIST macro. Added the BTF_SET_START_GLOBAL macro for global sets.
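For reference, the d_path helper added later in this series uses the set roughly like this (kernel-internal sketch; the 'allowed' callback consults the caller's attach BTF ID recorded in prog->aux->attach_btf_id):

	BTF_SET_START(btf_allowlist_d_path)
	BTF_ID(func, vfs_truncate)
	BTF_ID(func, vfs_fallocate)
	BTF_ID(func, dentry_open)
	BTF_ID(func, vfs_getattr)
	BTF_ID(func, filp_close)
	BTF_SET_END(btf_allowlist_d_path)

	/* 'allowed' callback for the helper's bpf_func_proto:
	 * permit the helper only when the prog attaches to one of
	 * the functions in the set.
	 */
	static bool bpf_d_path_allowed(const struct bpf_prog *prog)
	{
		return btf_id_set_contains(&btf_allowlist_d_path,
					   prog->aux->attach_btf_id);
	}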
Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200825192124.710397-10-jolsa@kernel.org --- tools/include/linux/btf_ids.h | 51 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) (limited to 'tools/include') diff --git a/tools/include/linux/btf_ids.h b/tools/include/linux/btf_ids.h index 4867d549e3c1..210b086188a3 100644 --- a/tools/include/linux/btf_ids.h +++ b/tools/include/linux/btf_ids.h @@ -3,6 +3,11 @@ #ifndef _LINUX_BTF_IDS_H #define _LINUX_BTF_IDS_H +struct btf_id_set { + u32 cnt; + u32 ids[]; +}; + #ifdef CONFIG_DEBUG_INFO_BTF #include /* for __PASTE */ @@ -62,7 +67,7 @@ asm( \ ".pushsection " BTF_IDS_SECTION ",\"a\"; \n" \ "." #scope " " #name "; \n" \ #name ":; \n" \ -".popsection; \n"); \ +".popsection; \n"); #define BTF_ID_LIST(name) \ __BTF_ID_LIST(name, local) \ @@ -88,12 +93,56 @@ asm( \ ".zero 4 \n" \ ".popsection; \n"); +/* + * The BTF_SET_START/END macros pair defines sorted list of + * BTF IDs plus its members count, with following layout: + * + * BTF_SET_START(list) + * BTF_ID(type1, name1) + * BTF_ID(type2, name2) + * BTF_SET_END(list) + * + * __BTF_ID__set__list: + * .zero 4 + * list: + * __BTF_ID__type1__name1__3: + * .zero 4 + * __BTF_ID__type2__name2__4: + * .zero 4 + * + */ +#define __BTF_SET_START(name, scope) \ +asm( \ +".pushsection " BTF_IDS_SECTION ",\"a\"; \n" \ +"." #scope " __BTF_ID__set__" #name "; \n" \ +"__BTF_ID__set__" #name ":; \n" \ +".zero 4 \n" \ +".popsection; \n"); + +#define BTF_SET_START(name) \ +__BTF_ID_LIST(name, local) \ +__BTF_SET_START(name, local) + +#define BTF_SET_START_GLOBAL(name) \ +__BTF_ID_LIST(name, globl) \ +__BTF_SET_START(name, globl) + +#define BTF_SET_END(name) \ +asm( \ +".pushsection " BTF_IDS_SECTION ",\"a\"; \n" \ +".size __BTF_ID__set__" #name ", .-" #name " \n" \ +".popsection; \n"); \ +extern struct btf_id_set name; + #else #define BTF_ID_LIST(name) static u32 name[5]; #define BTF_ID(prefix, name) #define BTF_ID_UNUSED #define BTF_ID_LIST_GLOBAL(name) u32 name[1]; +#define BTF_SET_START(name) static struct btf_id_set name = { 0 }; +#define BTF_SET_START_GLOBAL(name) static struct btf_id_set name = { 0 }; +#define BTF_SET_END(name) #endif /* CONFIG_DEBUG_INFO_BTF */ -- cgit v1.2.3 From 6e22ab9da79343532cd3cde39df25e5a5478c692 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 25 Aug 2020 21:21:20 +0200 Subject: bpf: Add d_path helper Adding d_path helper function that returns full path for given 'struct path' object, which needs to be the kernel BTF 'path' object. The path is returned in buffer provided 'buf' of size 'sz' and is zero terminated. bpf_d_path(&file->f_path, buf, size); The helper calls directly d_path function, so there's only limited set of function it can be called from. Adding just very modest set for the start. Updating also bpf.h tools uapi header and adding 'path' to bpf_helpers_doc.py script. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: KP Singh Link: https://lore.kernel.org/bpf/20200825192124.710397-11-jolsa@kernel.org --- tools/include/uapi/linux/bpf.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 0e1cdf806fe1..0388bc0200b0 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3513,6 +3513,7 @@ union bpf_attr { * * **-EPERM** This helper cannot be used under the * current sock_ops->op. 
+ * * void *bpf_inode_storage_get(struct bpf_map *map, void *inode, void *value, u64 flags) * Description * Get a bpf_local_storage from an *inode*. @@ -3548,6 +3549,18 @@ union bpf_attr { * 0 on success. * * **-ENOENT** if the bpf_local_storage cannot be found. + * + * long bpf_d_path(struct path *path, char *buf, u32 sz) + * Description + * Return full path for given 'struct path' object, which + * needs to be the kernel BTF 'path' object. The path is + * returned in the provided buffer 'buf' of size 'sz' and + * is zero terminated. + * + * Return + * On success, the strictly positive length of the string, + * including the trailing NUL character. On error, a negative + * value. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3697,6 +3710,7 @@ union bpf_attr { FN(reserve_hdr_opt), \ FN(inode_storage_get), \ FN(inode_storage_delete), \ + FN(d_path), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From b0c9eb37817943840a1a82dbc998c491609a0afd Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 27 Aug 2020 22:19:22 -0700 Subject: bpf: Make bpf_link_info.iter similar to bpf_iter_link_info bpf_link_info.iter is used by link_query to return bpf_iter_link_info to user space. Fields may be different, e.g., map_fd vs. map_id, so we cannot reuse the exact structure. But make them similar, e.g., struct bpf_link_info { /* common fields */ union { struct { ... } raw_tracepoint; struct { ... } tracing; ... struct { /* common fields for iter */ union { struct { __u32 map_id; } map; /* other structs for other targets */ }; }; }; }; so the structure is extensible the same way as bpf_iter_link_info. Fixes: 6b0a249a301e ("bpf: Implement link_query for bpf iterators") Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200828051922.758950-1-yhs@fb.com --- tools/include/uapi/linux/bpf.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 0388bc0200b0..ef7af384f5ee 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4251,8 +4251,10 @@ struct bpf_link_info { __aligned_u64 target_name; /* in/out: target_name buffer ptr */ __u32 target_name_len; /* in/out: target_name buffer len */ union { - __u32 map_id; - } map; + struct { + __u32 map_id; + } map; + }; } iter; struct { __u32 netns_ino; -- cgit v1.2.3 From 1e6c62a8821557720a9b2ea9617359b264f2f67c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 27 Aug 2020 15:01:11 -0700 Subject: bpf: Introduce sleepable BPF programs Introduce sleepable BPF programs that can request such property for themselves via BPF_F_SLEEPABLE flag at program load time. In such case they will be able to use helpers like bpf_copy_from_user() that might sleep. At present only fentry/fexit/fmod_ret and lsm programs can request to be sleepable and only when they are attached to kernel functions that are known to allow sleeping. The non-sleepable programs are relying on implicit rcu_read_lock() and migrate_disable() to protect life time of programs, maps that they use and per-cpu kernel structures used to pass info between bpf programs and the kernel. The sleepable programs cannot be enclosed into rcu_read_lock(). migrate_disable() maps to preempt_disable() in non-RT kernels, so the progs should not be enclosed in migrate_disable() as well. 
Therefore rcu_read_lock_trace is used to protect the lifetime of sleepable progs. There are many networking and tracing program types. In many cases the 'struct bpf_prog *' pointer itself is rcu protected within some other kernel data structure and the kernel code is using rcu_dereference() to load that program pointer and call BPF_PROG_RUN() on it. All these cases are not touched. Instead, sleepable bpf programs are allowed with bpf trampoline only. The program pointers are hard-coded into the generated assembly of the bpf trampoline and synchronize_rcu_tasks_trace() is used to protect the lifetime of the program. The same trampoline can hold both sleepable and non-sleepable progs. When rcu_read_lock_trace is held it means that some sleepable bpf program is running from a bpf trampoline. Those programs can use bpf arrays and preallocated hash/lru maps. These map types are waiting on programs to complete via synchronize_rcu_tasks_trace(). Updates to the trampoline now have to do synchronize_rcu_tasks_trace() and synchronize_rcu_tasks() to wait for sleepable progs to finish and for the trampoline assembly to finish. This is the first step of introducing sleepable progs. Eventually dynamically allocated hash maps can be allowed and networking program types can become sleepable too. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Reviewed-by: Josef Bacik Acked-by: Andrii Nakryiko Acked-by: KP Singh Link: https://lore.kernel.org/bpf/20200827220114.69225-3-alexei.starovoitov@gmail.com --- tools/include/uapi/linux/bpf.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index ef7af384f5ee..6e8b706aeb05 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -346,6 +346,14 @@ enum bpf_link_type { /* The verifier internal test flag. Behavior is undefined */ #define BPF_F_TEST_STATE_FREQ (1U << 3) +/* If BPF_F_SLEEPABLE is used in BPF_PROG_LOAD command, the verifier will + * restrict map and helper usage for such programs. Sleepable BPF programs can + * only be attached to hooks where kernel execution context allows sleeping. + * Such programs are allowed to use helpers that may sleep like + * bpf_copy_from_user(). + */ +#define BPF_F_SLEEPABLE (1U << 4) + /* When BPF ldimm64's insn[0].src_reg != 0 then this can have * two extensions: * -- cgit v1.2.3 From 07be4c4a3e7a0db148e44b16c5190e753d1c8569 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 27 Aug 2020 15:01:12 -0700 Subject: bpf: Add bpf_copy_from_user() helper. Sleepable BPF programs can now use copy_from_user() to access user memory. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: KP Singh Link: https://lore.kernel.org/bpf/20200827220114.69225-4-alexei.starovoitov@gmail.com --- tools/include/uapi/linux/bpf.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 6e8b706aeb05..a613750d5515 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3569,6 +3569,13 @@ union bpf_attr { * On success, the strictly positive length of the string, * including the trailing NUL character. On error, a negative * value. + * + * long bpf_copy_from_user(void *dst, u32 size, const void *user_ptr) + * Description + * Read *size* bytes from user space address *user_ptr* and store + * the data in *dst*. This is a wrapper of copy_from_user().
+ * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3719,6 +3726,7 @@ union bpf_attr { FN(inode_storage_get), \ FN(inode_storage_delete), \ FN(d_path), \ + FN(copy_from_user), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From bc0b5a03079bd78fb3d5cba1ccabf0a7efb1d99f Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Fri, 4 Sep 2020 17:14:54 +0100 Subject: tools, bpf: Synchronise BPF UAPI header with tools Synchronise the bpf.h header under tools, to report the fixes recently brought to the documentation for the BPF helpers. Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200904161454.31135-4-quentin@isovalent.com --- tools/include/uapi/linux/bpf.h | 87 ++++++++++++++++++++++-------------------- 1 file changed, 45 insertions(+), 42 deletions(-) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 8dda13880957..90359cab501d 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3349,38 +3349,38 @@ union bpf_attr { * Description * Dynamically cast a *sk* pointer to a *tcp6_sock* pointer. * Return - * *sk* if casting is valid, or NULL otherwise. + * *sk* if casting is valid, or **NULL** otherwise. * * struct tcp_sock *bpf_skc_to_tcp_sock(void *sk) * Description * Dynamically cast a *sk* pointer to a *tcp_sock* pointer. * Return - * *sk* if casting is valid, or NULL otherwise. + * *sk* if casting is valid, or **NULL** otherwise. * * struct tcp_timewait_sock *bpf_skc_to_tcp_timewait_sock(void *sk) * Description * Dynamically cast a *sk* pointer to a *tcp_timewait_sock* pointer. * Return - * *sk* if casting is valid, or NULL otherwise. + * *sk* if casting is valid, or **NULL** otherwise. * * struct tcp_request_sock *bpf_skc_to_tcp_request_sock(void *sk) * Description * Dynamically cast a *sk* pointer to a *tcp_request_sock* pointer. * Return - * *sk* if casting is valid, or NULL otherwise. + * *sk* if casting is valid, or **NULL** otherwise. * * struct udp6_sock *bpf_skc_to_udp6_sock(void *sk) * Description * Dynamically cast a *sk* pointer to a *udp6_sock* pointer. * Return - * *sk* if casting is valid, or NULL otherwise. + * *sk* if casting is valid, or **NULL** otherwise. * * long bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags) * Description * Return a user or a kernel stack in bpf program provided buffer. * To achieve this, the helper needs *task*, which is a valid - * pointer to struct task_struct. To store the stacktrace, the - * bpf program provides *buf* with a nonnegative *size*. + * pointer to **struct task_struct**. To store the stacktrace, the + * bpf program provides *buf* with a nonnegative *size*. * * The last argument, *flags*, holds the number of stack frames to * skip (from 0 to 255), masked with @@ -3410,12 +3410,12 @@ union bpf_attr { * long bpf_load_hdr_opt(struct bpf_sock_ops *skops, void *searchby_res, u32 len, u64 flags) * Description * Load header option. Support reading a particular TCP header - * option for bpf program (BPF_PROG_TYPE_SOCK_OPS). + * option for bpf program (**BPF_PROG_TYPE_SOCK_OPS**). * * If *flags* is 0, it will search the option from the - * sock_ops->skb_data. The comment in "struct bpf_sock_ops" + * *skops*\ **->skb_data**. The comment in **struct bpf_sock_ops** * has details on what skb_data contains under different - * sock_ops->op. + * *skops*\ **->op**. 
* * The first byte of the *searchby_res* specifies the * kind that it wants to search. @@ -3435,7 +3435,7 @@ union bpf_attr { * [ 254, 4, 0xeB, 0x9F, 0, 0, .... 0 ]. * * To search for the standard window scale option (3), - * the searchby_res should be [ 3, 0, 0, .... 0 ]. + * the *searchby_res* should be [ 3, 0, 0, .... 0 ]. * Note, kind-length must be 0 for regular option. * * Searching for No-Op (0) and End-of-Option-List (1) are @@ -3445,27 +3445,30 @@ union bpf_attr { * of a header option. * * Supported flags: + * * * **BPF_LOAD_HDR_OPT_TCP_SYN** to search from the * saved_syn packet or the just-received syn packet. * * Return - * >0 when found, the header option is copied to *searchby_res*. - * The return value is the total length copied. + * > 0 when found, the header option is copied to *searchby_res*. + * The return value is the total length copied. On failure, a + * negative error code is returned: * - * **-EINVAL** If param is invalid + * **-EINVAL** if a parameter is invalid. * - * **-ENOMSG** The option is not found + * **-ENOMSG** if the option is not found. * - * **-ENOENT** No syn packet available when - * **BPF_LOAD_HDR_OPT_TCP_SYN** is used + * **-ENOENT** if no syn packet is available when + * **BPF_LOAD_HDR_OPT_TCP_SYN** is used. * - * **-ENOSPC** Not enough space. Only *len* number of - * bytes are copied. + * **-ENOSPC** if there is not enough space. Only *len* number of + * bytes are copied. * - * **-EFAULT** Cannot parse the header options in the packet + * **-EFAULT** on failure to parse the header options in the + * packet. * - * **-EPERM** This helper cannot be used under the - * current sock_ops->op. + * **-EPERM** if the helper cannot be used under the current + * *skops*\ **->op**. * * long bpf_store_hdr_opt(struct bpf_sock_ops *skops, const void *from, u32 len, u64 flags) * Description @@ -3483,44 +3486,44 @@ union bpf_attr { * by searching the same option in the outgoing skb. * * This helper can only be called during - * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * **BPF_SOCK_OPS_WRITE_HDR_OPT_CB**. * * Return * 0 on success, or negative error in case of failure: * - * **-EINVAL** If param is invalid + * **-EINVAL** If param is invalid. * - * **-ENOSPC** Not enough space in the header. - * Nothing has been written + * **-ENOSPC** if there is not enough space in the header. + * Nothing has been written * - * **-EEXIST** The option has already existed + * **-EEXIST** if the option already exists. * - * **-EFAULT** Cannot parse the existing header options + * **-EFAULT** on failrue to parse the existing header options. * - * **-EPERM** This helper cannot be used under the - * current sock_ops->op. + * **-EPERM** if the helper cannot be used under the current + * *skops*\ **->op**. * * long bpf_reserve_hdr_opt(struct bpf_sock_ops *skops, u32 len, u64 flags) * Description * Reserve *len* bytes for the bpf header option. The - * space will be used by bpf_store_hdr_opt() later in - * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * space will be used by **bpf_store_hdr_opt**\ () later in + * **BPF_SOCK_OPS_WRITE_HDR_OPT_CB**. * - * If bpf_reserve_hdr_opt() is called multiple times, + * If **bpf_reserve_hdr_opt**\ () is called multiple times, * the total number of bytes will be reserved. * * This helper can only be called during - * BPF_SOCK_OPS_HDR_OPT_LEN_CB. + * **BPF_SOCK_OPS_HDR_OPT_LEN_CB**. * * Return * 0 on success, or negative error in case of failure: * - * **-EINVAL** if param is invalid + * **-EINVAL** if a parameter is invalid. * - * **-ENOSPC** Not enough space in the header. 
+ * **-ENOSPC** if there is not enough space in the header. * - * **-EPERM** This helper cannot be used under the - * current sock_ops->op. + * **-EPERM** if the helper cannot be used under the current + * *skops*\ **->op**. * * void *bpf_inode_storage_get(struct bpf_map *map, void *inode, void *value, u64 flags) * Description @@ -3560,9 +3563,9 @@ union bpf_attr { * * long bpf_d_path(struct path *path, char *buf, u32 sz) * Description - * Return full path for given 'struct path' object, which - * needs to be the kernel BTF 'path' object. The path is - * returned in the provided buffer 'buf' of size 'sz' and + * Return full path for given **struct path** object, which + * needs to be the kernel BTF *path* object. The path is + * returned in the provided buffer *buf* of size *sz* and * is zero terminated. * * Return @@ -3573,7 +3576,7 @@ union bpf_attr { * long bpf_copy_from_user(void *dst, u32 size, const void *user_ptr) * Description * Read *size* bytes from user space address *user_ptr* and store - * the data in *dst*. This is a wrapper of copy_from_user(). + * the data in *dst*. This is a wrapper of **copy_from_user**\ (). * Return * 0 on success, or a negative error in case of failure. */ -- cgit v1.2.3 From 1aef5b4391f0c75c0a1523706a7b0311846ee12f Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 10 Sep 2020 13:33:14 -0700 Subject: bpf: Fix comment for helper bpf_current_task_under_cgroup() This should be "current" not "skb". Fixes: c6b5fb8690fa ("bpf: add documentation for eBPF helpers (42-50)") Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Cc: Link: https://lore.kernel.org/bpf/20200910203314.70018-1-songliubraving@fb.com --- tools/include/uapi/linux/bpf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 90359cab501d..7dd314176df7 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1447,8 +1447,8 @@ union bpf_attr { * Return * The return value depends on the result of the test, and can be: * - * * 0, if the *skb* task belongs to the cgroup2. - * * 1, if the *skb* task does not belong to the cgroup2. + * * 0, if current task belongs to the cgroup2. + * * 1, if current task does not belong to the cgroup2. * * A negative error code, if an error occurred. * * long bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags) -- cgit v1.2.3 From ef15314aa5de955c6afd87d512e8b00f5ac08d06 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Tue, 15 Sep 2020 16:45:40 -0700 Subject: bpf: Add BPF_PROG_BIND_MAP syscall This syscall binds a map to a program. Returns success if the map is already bound to the program. 
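For reference, a minimal userspace sketch of the new command via the raw bpf(2) syscall (prog_fd and map_fd are assumed to come from earlier BPF_PROG_LOAD and BPF_MAP_CREATE commands; this mirrors what a libbpf wrapper would do):

	#include <string.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/bpf.h>

	static int prog_bind_map(int prog_fd, int map_fd)
	{
		union bpf_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.prog_bind_map.prog_fd = prog_fd;
		attr.prog_bind_map.map_fd = map_fd;
		attr.prog_bind_map.flags = 0;	/* no flags defined yet */

		return syscall(__NR_bpf, BPF_PROG_BIND_MAP,
			       &attr, sizeof(attr));
	}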
Signed-off-by: YiFei Zhu
Signed-off-by: Stanislav Fomichev
Signed-off-by: Alexei Starovoitov
Acked-by: Andrii Nakryiko
Cc: YiFei Zhu
Link: https://lore.kernel.org/bpf/20200915234543.3220146-3-sdf@google.com
---
 tools/include/uapi/linux/bpf.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'tools/include')

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 7dd314176df7..a22812561064 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -124,6 +124,7 @@ enum bpf_cmd {
 BPF_ENABLE_STATS,
 BPF_ITER_CREATE,
 BPF_LINK_DETACH,
+ BPF_PROG_BIND_MAP,
 };

 enum bpf_map_type {
@@ -658,6 +659,12 @@ union bpf_attr {
 __u32 flags;
 } iter_create;

+ struct { /* struct used by BPF_PROG_BIND_MAP command */
+ __u32 prog_fd;
+ __u32 map_fd;
+ __u32 flags; /* extra flags */
+ } prog_bind_map;
+
 } __attribute__((aligned(8)));

 /* The description below is an attempt at providing documentation to eBPF
-- cgit v1.2.3


From 27774b7073b5d520c80f1fcb8e9993fc139f21bd Mon Sep 17 00:00:00 2001
From: Lorenz Bauer
Date: Mon, 21 Sep 2020 13:12:19 +0100
Subject: btf: Add BTF_ID_LIST_SINGLE macro

Add a convenience macro that allows defining a BTF ID list with
a single item. This lets us cut down on repetitive macros.

Suggested-by: Andrii Nakryiko
Signed-off-by: Lorenz Bauer
Signed-off-by: Alexei Starovoitov
Acked-by: Martin KaFai Lau
Link: https://lore.kernel.org/bpf/20200921121227.255763-4-lmb@cloudflare.com
---
 tools/include/linux/btf_ids.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'tools/include')

diff --git a/tools/include/linux/btf_ids.h b/tools/include/linux/btf_ids.h
index 210b086188a3..57890b357f85 100644
--- a/tools/include/linux/btf_ids.h
+++ b/tools/include/linux/btf_ids.h
@@ -76,6 +76,13 @@ extern u32 name[];
 #define BTF_ID_LIST_GLOBAL(name) \
 __BTF_ID_LIST(name, globl)

+/* The BTF_ID_LIST_SINGLE macro defines a BTF_ID_LIST with
+ * a single entry.
+ */
+#define BTF_ID_LIST_SINGLE(name, prefix, typename) \
+ BTF_ID_LIST(name) \
+ BTF_ID(prefix, typename)
+
 /*
 * The BTF_ID_UNUSED macro defines 4 zero bytes.
 * It's used when we want to define 'unused' entry
@@ -140,6 +147,7 @@ extern struct btf_id_set name;
 #define BTF_ID(prefix, name)
 #define BTF_ID_UNUSED
 #define BTF_ID_LIST_GLOBAL(name) u32 name[1];
+#define BTF_ID_LIST_SINGLE(name, prefix, typename) static u32 name[1];
 #define BTF_SET_START(name) static struct btf_id_set name = { 0 };
 #define BTF_SET_START_GLOBAL(name) static struct btf_id_set name = { 0 };
 #define BTF_SET_END(name)
-- cgit v1.2.3


From a5fa25adf03d4b063aece74ba70ccbb3a71af122 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau
Date: Thu, 24 Sep 2020 17:03:56 -0700
Subject: bpf: Change bpf_sk_release and bpf_sk_*cgroup_id to accept ARG_PTR_TO_BTF_ID_SOCK_COMMON

The previous patch allows the networking bpf prog to use the
bpf_skc_to_*() helpers to get a PTR_TO_BTF_ID socket pointer,
e.g. "struct tcp_sock *". It allows the bpf prog to read all the
fields of the tcp_sock.

This patch changes the bpf_sk_release() and bpf_sk_*cgroup_id()
to take ARG_PTR_TO_BTF_ID_SOCK_COMMON such that they will work
with the pointer returned by the bpf_skc_to_*() helpers also.
For example, the following will work:

	sk = bpf_skc_lookup_tcp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);
	if (!sk)
		return;

	tp = bpf_skc_to_tcp_sock(sk);
	if (!tp) {
		bpf_sk_release(sk);
		return;
	}

	lsndtime = tp->lsndtime;
	/* Passing tp to bpf_sk_release() will also work */
	bpf_sk_release(tp);

Since PTR_TO_BTF_ID could be NULL, a helper taking
ARG_PTR_TO_BTF_ID_SOCK_COMMON has to check for NULL at runtime.

A btf_id of "struct sock" may not always mean a fullsock. Regardless
of whether the helper's running context can see a non-fullsock, the
fullsock check/handling is pretty cheap, so it is better to keep the
same verifier expectation: a helper that takes ARG_PTR_TO_BTF_ID*
must be able to handle the minisock situation.

In the bpf_sk_*cgroup_id() case, it will try to get a fullsock by
using sk_to_full_sk(), as its skb variant bpf_sk"b"_*cgroup_id()
already does. bpf_sk_release() can already handle a minisock, so
nothing special has to be done.

Signed-off-by: Martin KaFai Lau
Signed-off-by: Alexei Starovoitov
Link: https://lore.kernel.org/bpf/20200925000356.3856047-1-kafai@fb.com
---
 tools/include/uapi/linux/bpf.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'tools/include')

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index a22812561064..c96a56d9c3be 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2512,7 +2512,7 @@ union bpf_attr {
 * result is from *reuse*\ **->socks**\ [] using the hash of the
 * tuple.
 *
- * long bpf_sk_release(struct bpf_sock *sock)
+ * long bpf_sk_release(void *sock)
 * Description
 * Release the reference held by *sock*. *sock* must be a
 * non-**NULL** pointer that was returned from
@@ -3234,11 +3234,11 @@ union bpf_attr {
 *
 * **-EOVERFLOW** if an overflow happened: The same object will be tried again.
 *
- * u64 bpf_sk_cgroup_id(struct bpf_sock *sk)
+ * u64 bpf_sk_cgroup_id(void *sk)
 * Description
 * Return the cgroup v2 id of the socket *sk*.
 *
- * *sk* must be a non-**NULL** pointer to a full socket, e.g. one
+ * *sk* must be a non-**NULL** pointer to a socket, e.g. one
 * returned from **bpf_sk_lookup_xxx**\ (),
 * **bpf_sk_fullsock**\ (), etc. The format of returned id is
 * same as in **bpf_skb_cgroup_id**\ ().
@@ -3248,7 +3248,7 @@ union bpf_attr {
 * Return
 * The id is returned or 0 in case the id could not be retrieved.
 *
- * u64 bpf_sk_ancestor_cgroup_id(struct bpf_sock *sk, int ancestor_level)
+ * u64 bpf_sk_ancestor_cgroup_id(void *sk, int ancestor_level)
 * Description
 * Return id of cgroup v2 that is ancestor of cgroup associated
 * with the *sk* at the *ancestor_level*. The root cgroup is at
-- cgit v1.2.3


From 592a3498648af000e93dff2d36229ab11cd8c7f6 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau
Date: Thu, 24 Sep 2020 17:04:02 -0700
Subject: bpf: Change bpf_sk_storage_*() to accept ARG_PTR_TO_BTF_ID_SOCK_COMMON

This patch changes the bpf_sk_storage_*() helpers to take
ARG_PTR_TO_BTF_ID_SOCK_COMMON such that they will work with the
pointer returned by the bpf_skc_to_*() helpers also.

A micro benchmark has been done on a "cgroup_skb/egress" bpf program
which does a bpf_sk_storage_get(). It was driven by netperf doing a
4096 connected UDP_STREAM test with 64-byte packets. The stats from
"kernel.bpf_stats_enabled" show no meaningful difference.

The sk_storage_get_btf_proto, sk_storage_delete_btf_proto,
btf_sk_storage_get_proto, and btf_sk_storage_delete_proto are no
longer needed, so they are removed.
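
For illustration only (not part of the original commit message), a
sketch of what this change permits; sk_stg_map is an assumed
BPF_MAP_TYPE_SK_STORAGE map defined elsewhere in the program:

	sk = bpf_skc_lookup_tcp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);
	if (!sk)
		return 0;

	tp = bpf_skc_to_tcp_sock(sk);
	if (tp)
		/* tp is a PTR_TO_BTF_ID pointer, which
		 * bpf_sk_storage_get() now accepts as well
		 */
		val = bpf_sk_storage_get(&sk_stg_map, tp, 0,
					 BPF_SK_STORAGE_GET_F_CREATE);
	bpf_sk_release(sk);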
Signed-off-by: Martin KaFai Lau
Signed-off-by: Alexei Starovoitov
Acked-by: Lorenz Bauer
Link: https://lore.kernel.org/bpf/20200925000402.3856307-1-kafai@fb.com
---
 tools/include/uapi/linux/bpf.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools/include')

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index c96a56d9c3be..0ec6dbeb17a5 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2861,6 +2861,7 @@ union bpf_attr {
 * 0 on success.
 *
 * **-ENOENT** if the bpf-local-storage cannot be found.
+ * **-EINVAL** if *sk* is not a fullsock (e.g. a request_sock).
 *
 * long bpf_send_signal(u32 sig)
 * Description
-- cgit v1.2.3


From c0df236e1394970f3503a8fb103de95d000014ca Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau
Date: Thu, 24 Sep 2020 17:04:09 -0700
Subject: bpf: Change bpf_tcp_*_syncookie to accept ARG_PTR_TO_BTF_ID_SOCK_COMMON

This patch changes the bpf_tcp_*_syncookie() helpers to take
ARG_PTR_TO_BTF_ID_SOCK_COMMON such that they will work with the
pointer returned by the bpf_skc_to_*() helpers also.

Signed-off-by: Martin KaFai Lau
Signed-off-by: Alexei Starovoitov
Acked-by: Lorenz Bauer
Link: https://lore.kernel.org/bpf/20200925000409.3856725-1-kafai@fb.com
---
 tools/include/uapi/linux/bpf.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'tools/include')

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 0ec6dbeb17a5..69b9e30375bc 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2692,7 +2692,7 @@ union bpf_attr {
 * result is from *reuse*\ **->socks**\ [] using the hash of the
 * tuple.
 *
- * long bpf_tcp_check_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len)
+ * long bpf_tcp_check_syncookie(void *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len)
 * Description
 * Check whether *iph* and *th* contain a valid SYN cookie ACK for
 * the listening socket in *sk*.
@@ -2878,7 +2878,7 @@ union bpf_attr {
 *
 * **-EAGAIN** if bpf program can try again.
 *
- * s64 bpf_tcp_gen_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len)
+ * s64 bpf_tcp_gen_syncookie(void *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len)
 * Description
 * Try to issue a SYN cookie for the packet with corresponding
 * IP/TCP headers, *iph* and *th*, on the listening socket in *sk*.
-- cgit v1.2.3


From 27e5203bd9c5cc6d54dcac48c3027f3f04522b8b Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau
Date: Thu, 24 Sep 2020 17:04:15 -0700
Subject: bpf: Change bpf_sk_assign to accept ARG_PTR_TO_BTF_ID_SOCK_COMMON

This patch changes bpf_sk_assign() to take
ARG_PTR_TO_BTF_ID_SOCK_COMMON such that it will work with the pointer
returned by the bpf_skc_to_*() helpers also.

The bpf_sk_lookup_assign() takes ARG_PTR_TO_SOCKET_"OR_NULL", meaning
it specifically accepts a literal NULL. ARG_PTR_TO_BTF_ID_SOCK_COMMON
does not allow a literal NULL, so another ARG type would be required
for that purpose; a follow-up patch can add it if such a need arises.
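
For illustration only (not from the original patch), a tc sketch where
the socket pointer handed to bpf_sk_assign() comes from a
bpf_skc_to_*() helper; the tuple setup is omitted:

	long err = -1;

	sk = bpf_skc_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
				BPF_F_CURRENT_NETNS, 0);
	if (!sk)
		return TC_ACT_SHOT;

	tp = bpf_skc_to_tcp_sock(sk);
	if (tp)
		/* a PTR_TO_BTF_ID socket pointer is now accepted, too */
		err = bpf_sk_assign(skb, tp, 0);
	bpf_sk_release(sk);

	return err ? TC_ACT_SHOT : TC_ACT_OK;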
Signed-off-by: Martin KaFai Lau
Signed-off-by: Alexei Starovoitov
Link: https://lore.kernel.org/bpf/20200925000415.3857374-1-kafai@fb.com
---
 tools/include/uapi/linux/bpf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools/include')

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 69b9e30375bc..2d6519a2ed77 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3107,7 +3107,7 @@ union bpf_attr {
 * Return
 * The id is returned or 0 in case the id could not be retrieved.
 *
- * long bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags)
+ * long bpf_sk_assign(struct sk_buff *skb, void *sk, u64 flags)
 * Description
 * Helper is overloaded depending on BPF program type. This
 * description applies to **BPF_PROG_TYPE_SCHED_CLS** and
-- cgit v1.2.3


From 1b4d60ec162f82ea29a2e7a907b5c6cc9f926321 Mon Sep 17 00:00:00 2001
From: Song Liu
Date: Fri, 25 Sep 2020 13:54:29 -0700
Subject: bpf: Enable BPF_PROG_TEST_RUN for raw_tracepoint

Add .test_run for raw_tracepoint. Also, introduce a new feature that
runs the target program on a specific CPU. This is achieved by a new
flag in bpf_attr.test, BPF_F_TEST_RUN_ON_CPU. When this flag is set,
the program is triggered on the cpu with id bpf_attr.test.cpu. This
feature is needed for BPF programs that handle perf_event and other
percpu resources, as the program can access these resources locally.

Signed-off-by: Song Liu
Signed-off-by: Daniel Borkmann
Acked-by: John Fastabend
Acked-by: Andrii Nakryiko
Link: https://lore.kernel.org/bpf/20200925205432.1777-2-songliubraving@fb.com
---
 tools/include/uapi/linux/bpf.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'tools/include')

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 2d6519a2ed77..82522f05c021 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -424,6 +424,11 @@ enum {
 */
 #define BPF_F_QUERY_EFFECTIVE (1U << 0)

+/* Flags for BPF_PROG_TEST_RUN */
+
+/* If set, run the test on the cpu specified by bpf_attr.test.cpu */
+#define BPF_F_TEST_RUN_ON_CPU (1U << 0)
+
 /* type for BPF_ENABLE_STATS */
 enum bpf_stats_type {
 /* enabled run_time_ns and run_cnt */
@@ -566,6 +571,8 @@ union bpf_attr {
 */
 __aligned_u64 ctx_in;
 __aligned_u64 ctx_out;
+ __u32 flags;
+ __u32 cpu;
 } test;

 struct { /* anonymous struct used by BPF_*_GET_*_ID */
-- cgit v1.2.3


From c4d0bfb45068d853a478b9067a95969b1886a30f Mon Sep 17 00:00:00 2001
From: Alan Maguire
Date: Mon, 28 Sep 2020 12:31:05 +0100
Subject: bpf: Add bpf_snprintf_btf helper

A helper is added to support tracing kernel type information in BPF
using the BPF Type Format (BTF). Its signature is

	long bpf_snprintf_btf(char *str, u32 str_size, struct btf_ptr *ptr,
			      u32 btf_ptr_size, u64 flags);

struct btf_ptr * specifies

 - a pointer to the data to be traced;
 - the BTF id of the type of data pointed to; and
 - a flags field, which is provided for future use. These flags are
   not to be confused with the BTF_F_* flags below that control how
   the btf_ptr is displayed; the flags member of struct btf_ptr may
   be used to disambiguate types in kernel versus module BTF, etc.
   The main distinction is that these flags relate to the type and
   the information needed to identify it, not to how it is displayed.
For example a BPF program with a struct sk_buff *skb
could do the following:

	static struct btf_ptr b = { };

	b.ptr = skb;
	b.type_id = __builtin_btf_type_id(struct sk_buff, 1);
	bpf_snprintf_btf(str, sizeof(str), &b, sizeof(b), 0);

Default output looks like this:

	(struct sk_buff){
	 .transport_header = (__u16)65535,
	 .mac_header = (__u16)65535,
	 .end = (sk_buff_data_t)192,
	 .head = (unsigned char *)0x000000007524fd8b,
	 .data = (unsigned char *)0x000000007524fd8b,
	 .truesize = (unsigned int)768,
	 .users = (refcount_t){
	  .refs = (atomic_t){
	   .counter = (int)1,
	  },
	 },
	}

Flags modifying display are as follows:

 - BTF_F_COMPACT: no formatting around type information
 - BTF_F_NONAME: no struct/union member names/types
 - BTF_F_PTR_RAW: show raw (unobfuscated) pointer values;
   equivalent to %px.
 - BTF_F_ZERO: show zero-valued struct/union members; they
   are not displayed by default

Signed-off-by: Alan Maguire
Signed-off-by: Alexei Starovoitov
Link: https://lore.kernel.org/bpf/1601292670-1616-4-git-send-email-alan.maguire@oracle.com
---
 tools/include/uapi/linux/bpf.h | 67 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)

(limited to 'tools/include')

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 82522f05c021..cca9eb1b13e5 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3594,6 +3594,42 @@ union bpf_attr {
 * the data in *dst*. This is a wrapper of **copy_from_user**\ ().
 * Return
 * 0 on success, or a negative error in case of failure.
+ *
+ * long bpf_snprintf_btf(char *str, u32 str_size, struct btf_ptr *ptr, u32 btf_ptr_size, u64 flags)
+ * Description
+ * Use BTF to store a string representation of *ptr*->ptr in *str*,
+ * using *ptr*->type_id. This value should specify the type
+ * that *ptr*->ptr points to. LLVM __builtin_btf_type_id(type, 1)
+ * can be used to look up vmlinux BTF type ids. Traversing the
+ * data structure using BTF, the type information and values are
+ * stored in the first *str_size* - 1 bytes of *str*. Safe copy of
+ * the pointer data is carried out to avoid kernel crashes during
+ * operation. Smaller types can use string space on the stack;
+ * larger types can use map data to store the string
+ * representation.
+ *
+ * The string can be subsequently shared with userspace via
+ * bpf_perf_event_output() or ring buffer interfaces.
+ * bpf_trace_printk() is to be avoided as it places too small
+ * a limit on string size to be useful.
+ *
+ * *flags* is a combination of
+ *
+ * **BTF_F_COMPACT**
+ * no formatting around type information
+ * **BTF_F_NONAME**
+ * no struct/union member names/types
+ * **BTF_F_PTR_RAW**
+ * show raw (unobfuscated) pointer values;
+ * equivalent to printk specifier %px.
+ * **BTF_F_ZERO**
+ * show zero-valued struct/union members; they
+ * are not displayed by default
+ *
+ * Return
+ * The number of bytes that were written (or would have been
+ * written if output had to be truncated due to string size),
+ * or a negative error in cases of failure.
 */
 #define __BPF_FUNC_MAPPER(FN) \
 FN(unspec), \
@@ -3745,6 +3781,7 @@ union bpf_attr {
 FN(inode_storage_delete), \
 FN(d_path), \
 FN(copy_from_user), \
+ FN(snprintf_btf), \
 /* */

 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
@@ -4853,4 +4890,34 @@ struct bpf_sk_lookup {
 __u32 local_port; /* Host byte order */
 };

+/*
+ * struct btf_ptr is used for typed pointer representation; the
+ * type id is used to render the pointer data as the appropriate type
+ * via the bpf_snprintf_btf() helper described above. A flags field -
+ * potentially to specify additional details about the BTF pointer
+ * (rather than its mode of display) - is included for future use.
+ * Display flags - BTF_F_* - are passed to bpf_snprintf_btf separately.
+ */
+struct btf_ptr {
+ void *ptr;
+ __u32 type_id;
+ __u32 flags; /* BTF ptr flags; unused at present. */
+};
+
+/*
+ * Flags to control bpf_snprintf_btf() behaviour.
+ * - BTF_F_COMPACT: no formatting around type information
+ * - BTF_F_NONAME: no struct/union member names/types
+ * - BTF_F_PTR_RAW: show raw (unobfuscated) pointer values;
+ * equivalent to %px.
+ * - BTF_F_ZERO: show zero-valued struct/union members; they
+ * are not displayed by default
+ */
+enum {
+ BTF_F_COMPACT = (1ULL << 0),
+ BTF_F_NONAME = (1ULL << 1),
+ BTF_F_PTR_RAW = (1ULL << 2),
+ BTF_F_ZERO = (1ULL << 3),
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
-- cgit v1.2.3


From eb411377aed9e27835e77ee0710ee8f4649958f3 Mon Sep 17 00:00:00 2001
From: Alan Maguire
Date: Mon, 28 Sep 2020 12:31:09 +0100
Subject: bpf: Add bpf_seq_printf_btf helper

A helper is added to allow seq file writing of kernel data structures
using vmlinux BTF. Its signature is

	long bpf_seq_printf_btf(struct seq_file *m, struct btf_ptr *ptr,
				u32 btf_ptr_size, u64 flags);

Flags and struct btf_ptr definitions/use are identical to the
bpf_snprintf_btf helper, and the helper returns 0 on success or a
negative error value.

Suggested-by: Alexei Starovoitov
Signed-off-by: Alan Maguire
Signed-off-by: Alexei Starovoitov
Link: https://lore.kernel.org/bpf/1601292670-1616-8-git-send-email-alan.maguire@oracle.com
---
 tools/include/uapi/linux/bpf.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'tools/include')

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index cca9eb1b13e5..96ddb00b91dc 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3630,6 +3630,14 @@ union bpf_attr {
 * The number of bytes that were written (or would have been
 * written if output had to be truncated due to string size),
 * or a negative error in cases of failure.
+ *
+ * long bpf_seq_printf_btf(struct seq_file *m, struct btf_ptr *ptr, u32 ptr_size, u64 flags)
+ * Description
+ * Use BTF to write to seq_write a string representation of
+ * *ptr*->ptr, using *ptr*->type_id as per bpf_snprintf_btf().
+ * *flags* are identical to those used for bpf_snprintf_btf.
+ * Return
+ * 0 on success or a negative error in case of failure.
 */
 #define __BPF_FUNC_MAPPER(FN) \
 FN(unspec), \
@@ -3782,6 +3790,7 @@ union bpf_attr {
 FN(d_path), \
 FN(copy_from_user), \
 FN(snprintf_btf), \
+ FN(seq_printf_btf), \
 /* */

 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
-- cgit v1.2.3


From 4a1e7c0c63e02daad751842b7880f9bbcdfb6e89 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?=
Date: Tue, 29 Sep 2020 14:45:51 +0200
Subject: bpf: Support attaching freplace programs to multiple attach points
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This enables support for attaching freplace programs to multiple attach
points. It does this by amending the UAPI for bpf_link_create with a
target btf ID that can be used to supply the new attachment point along
with the target program fd. The target must be compatible with the
target that was supplied at program load time.

The implementation reuses the checks that were factored out of
check_attach_btf_id() to ensure compatibility between the BTF types of
the old and new attachment. If these match, a new bpf_tracing_link will
be created for the new attach target, allowing multiple attachments to
co-exist simultaneously.

The code could theoretically support multiple-attach of other types of
tracing programs as well, but since I don't have a use case for any of
those, there is no API support for doing so.

Signed-off-by: Toke Høiland-Jørgensen
Signed-off-by: Alexei Starovoitov
Acked-by: Andrii Nakryiko
Link: https://lore.kernel.org/bpf/160138355169.48470.17165680973640685368.stgit@toke.dk
---
 tools/include/uapi/linux/bpf.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'tools/include')

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 96ddb00b91dc..2b1d3f16cbd1 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -639,8 +639,13 @@ union bpf_attr {
 };
 __u32 attach_type; /* attach type */
 __u32 flags; /* extra flags */
- __aligned_u64 iter_info; /* extra bpf_iter_link_info */
- __u32 iter_info_len; /* iter_info length */
+ union {
+ __u32 target_btf_id; /* btf_id of target to attach to */
+ struct {
+ __aligned_u64 iter_info; /* extra bpf_iter_link_info */
+ __u32 iter_info_len; /* iter_info length */
+ };
+ };
 } link_create;

 struct { /* struct used by BPF_LINK_UPDATE command */
-- cgit v1.2.3


From b426ce83baa7dff947fb354118d3133f2953aac8 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Wed, 30 Sep 2020 17:18:15 +0200
Subject: bpf: Add classid helper only based on skb->sk

Similarly to 5a52ae4e32a6 ("bpf: Allow to retrieve cgroup v1 classid
from v2 hooks"), add a helper to retrieve cgroup v1 classid solely
based on the skb->sk, so it can be used as key as part of BPF map
lookups out of tc from host ns, in particular given the skb->sk is
retained these days when crossing net ns thanks to 9c4c325252c5
("skbuff: preserve sock reference when scrubbing the skb."). This is
similar to bpf_skb_cgroup_id() which implements the same for v2. The
Kubernetes ecosystem is still operating on v1, however, hence net_cls
needs to be used there until this can be dropped in with the v2 helper
of bpf_skb_cgroup_id().
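
As an illustrative sketch (not part of this patch), a tc BPF program
could use the classid as a key for policy lookups; cls_map is a
hypothetical hash map keyed by the net_cls classid:

	SEC("tc")
	int cls_egress(struct __sk_buff *skb)
	{
		__u64 classid = bpf_skb_cgroup_classid(skb);
		__u32 *verdict;

		if (!classid)
			return TC_ACT_OK;

		/* look up a per-class verdict out of the host ns */
		verdict = bpf_map_lookup_elem(&cls_map, &classid);
		if (verdict && !*verdict)
			return TC_ACT_SHOT;

		return TC_ACT_OK;
	}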
Signed-off-by: Daniel Borkmann
Signed-off-by: Alexei Starovoitov
Acked-by: Martin KaFai Lau
Link: https://lore.kernel.org/bpf/ed633cf27a1c620e901c5aa99ebdefb028dce600.1601477936.git.daniel@iogearbox.net
---
 tools/include/uapi/linux/bpf.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'tools/include')

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 2b1d3f16cbd1..6116a7f54c8f 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3643,6 +3643,15 @@ union bpf_attr {
 * *flags* are identical to those used for bpf_snprintf_btf.
 * Return
 * 0 on success or a negative error in case of failure.
+ *
+ * u64 bpf_skb_cgroup_classid(struct sk_buff *skb)
+ * Description
+ * See **bpf_get_cgroup_classid**\ () for the main description.
+ * This helper differs from **bpf_get_cgroup_classid**\ () in that
+ * the cgroup v1 net_cls class is retrieved only from the *skb*'s
+ * associated socket instead of the current process.
+ * Return
+ * The id is returned or 0 in case the id could not be retrieved.
 */
 #define __BPF_FUNC_MAPPER(FN) \
 FN(unspec), \
@@ -3796,6 +3805,7 @@ union bpf_attr {
 FN(copy_from_user), \
 FN(snprintf_btf), \
 FN(seq_printf_btf), \
+ FN(skb_cgroup_classid), \
 /* */

 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
-- cgit v1.2.3


From b4ab31414970a7a03a5d55d75083f2c101a30592 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Wed, 30 Sep 2020 17:18:17 +0200
Subject: bpf: Add redirect_neigh helper as redirect drop-in

Add a redirect_neigh() helper as a redirect() drop-in replacement for
the xmit side. The main idea for the helper is to be very similar in
semantics to the latter, just that the skb gets injected into the
neighboring subsystem in order to let the stack do the work it knows
best anyway to populate the L2 addresses of the packet and then hand
over to dev_queue_xmit() as redirect() does.

This solves two bigger items: i) skbs don't need to go up to the stack
on the host-facing veth ingress side for traffic egressing the
container in order to populate L2, which also has the huge advantage
that ii) the skb->sk won't get orphaned in ip_rcv_core() when entering
the IP routing layer on the host stack. Given that skb->sk neither
gets orphaned when crossing the netns as per 9c4c325252c5 ("skbuff:
preserve sock reference when scrubbing the skb."), the helper can then
push the skbs directly to the phys device where the FQ scheduler can
do its work and the TCP stack gets proper backpressure, given we hold
on to skb->sk as long as the skb is still residing in queues.

With the helper used in the BPF data path to then push the skb to the
phys device, I observed a stable/consistent TCP_STREAM improvement on
veth devices for traffic going container -> host -> host -> container
from ~10Gbps to ~15Gbps for a single stream in my test environment.

Signed-off-by: Daniel Borkmann
Signed-off-by: Alexei Starovoitov
Reviewed-by: David Ahern
Acked-by: Martin KaFai Lau
Cc: David Ahern
Link: https://lore.kernel.org/bpf/f207de81629e1724899b73b8112e0013be782d35.1601477936.git.daniel@iogearbox.net
---
 tools/include/uapi/linux/bpf.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'tools/include')

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 6116a7f54c8f..1f17c6752deb 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3652,6 +3652,19 @@ union bpf_attr {
 * associated socket instead of the current process.
 * Return
 * The id is returned or 0 in case the id could not be retrieved.
+ *
+ * long bpf_redirect_neigh(u32 ifindex, u64 flags)
+ * Description
+ * Redirect the packet to another net device of index *ifindex*
+ * and fill in L2 addresses from neighboring subsystem. This helper
+ * is somewhat similar to **bpf_redirect**\ (), except that it
+ * fills in e.g. MAC addresses based on the L3 information from
+ * the packet. This helper is supported for IPv4 and IPv6 protocols.
+ * The *flags* argument is reserved and must be 0. The helper is
+ * currently only supported for tc BPF program types.
+ * Return
+ * The helper returns **TC_ACT_REDIRECT** on success or
+ * **TC_ACT_SHOT** on error.
 */
 #define __BPF_FUNC_MAPPER(FN) \
 FN(unspec), \
@@ -3806,6 +3819,7 @@ union bpf_attr {
 FN(snprintf_btf), \
 FN(seq_printf_btf), \
 FN(skb_cgroup_classid), \
+ FN(redirect_neigh), \
 /* */

 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
-- cgit v1.2.3


From 792caccc4526bb489e054f9ab61d7c024b15dea2 Mon Sep 17 00:00:00 2001
From: Song Liu
Date: Wed, 30 Sep 2020 15:49:26 -0700
Subject: bpf: Introduce BPF_F_PRESERVE_ELEMS for perf event array

Currently, a perf event in a perf event array is removed from the
array when the map fd used to add the event is closed. This behavior
makes it difficult to share perf events with a perf event array.

Introduce a perf event map that keeps the perf event open, via a new
flag BPF_F_PRESERVE_ELEMS. With this flag set, perf events in the
array are not removed when the original map fd is closed. Instead, the
perf event will stay in the map until 1) it is explicitly removed from
the array; or 2) the array is freed.

Signed-off-by: Song Liu
Signed-off-by: Alexei Starovoitov
Link: https://lore.kernel.org/bpf/20200930224927.1936644-2-songliubraving@fb.com
---
 tools/include/uapi/linux/bpf.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'tools/include')

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 1f17c6752deb..4f556cfcbfbe 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -414,6 +414,9 @@ enum {
 /* Enable memory-mapping BPF map */
 BPF_F_MMAPABLE = (1U << 10),
+
+/* Share perf_event among processes */
+ BPF_F_PRESERVE_ELEMS = (1U << 11),
 };

 /* Flags for BPF_PROG_QUERY. */
-- cgit v1.2.3


From 4976b718c3551faba2c0616ef55ebeb74db1c5ca Mon Sep 17 00:00:00 2001
From: Hao Luo
Date: Tue, 29 Sep 2020 16:50:44 -0700
Subject: bpf: Introduce pseudo_btf_id

Pseudo_btf_id is a type of ld_imm insn that associates a btf_id to a
ksym so that further dereferences on the ksym can use the BTF info to
validate accesses. Internally, when seeing a pseudo_btf_id ld insn,
the verifier reads the btf_id stored in the insn[0]'s imm field and
marks the dst_reg as PTR_TO_BTF_ID. The btf_id points to a VAR_KIND,
which is encoded in btf_vmlinux by pahole. If the VAR is not of a
struct type, the dst reg will be marked as PTR_TO_MEM instead of
PTR_TO_BTF_ID and the mem_size is resolved to the size of the VAR's
type.

From the VAR btf_id, the verifier can also read the address of the
ksym's corresponding kernel var from kallsyms and use that to fill
dst_reg.

Therefore, the proper functionality of pseudo_btf_id depends on (1)
kallsyms and (2) the encoding of kernel global VARs in pahole, which
should be available since pahole v1.18.
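
For illustration only (not from the original patch), a typed ksym
reference from BPF C, assuming libbpf's __ksym support and vmlinux
BTF; loops_per_jiffy is a non-struct kernel global, so the resulting
pointer is PTR_TO_MEM and a direct read is allowed:

	extern unsigned long loops_per_jiffy __ksym;

	SEC("raw_tp/sys_enter")
	int dump_lpj(const void *ctx)
	{
		/* The compiler emits a ldimm64 insn for this access;
		 * libbpf patches it to BPF_PSEUDO_BTF_ID with the
		 * VAR's btf_id in insn[0].imm.
		 */
		bpf_printk("loops_per_jiffy: %lu\n", loops_per_jiffy);
		return 0;
	}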
Signed-off-by: Hao Luo
Signed-off-by: Alexei Starovoitov
Acked-by: Andrii Nakryiko
Link: https://lore.kernel.org/bpf/20200929235049.2533242-2-haoluo@google.com
---
 tools/include/uapi/linux/bpf.h | 36 +++++++++++++++++++++++++++---------
 1 file changed, 27 insertions(+), 9 deletions(-)

(limited to 'tools/include')

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 4f556cfcbfbe..2aa156af24d6 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -356,18 +356,36 @@ enum bpf_link_type {
 #define BPF_F_SLEEPABLE (1U << 4)

 /* When BPF ldimm64's insn[0].src_reg != 0 then this can have
- * two extensions:
- *
- * insn[0].src_reg: BPF_PSEUDO_MAP_FD BPF_PSEUDO_MAP_VALUE
- * insn[0].imm: map fd map fd
- * insn[1].imm: 0 offset into value
- * insn[0].off: 0 0
- * insn[1].off: 0 0
- * ldimm64 rewrite: address of map address of map[0]+offset
- * verifier type: CONST_PTR_TO_MAP PTR_TO_MAP_VALUE
+ * the following extensions:
+ *
+ * insn[0].src_reg: BPF_PSEUDO_MAP_FD
+ * insn[0].imm: map fd
+ * insn[1].imm: 0
+ * insn[0].off: 0
+ * insn[1].off: 0
+ * ldimm64 rewrite: address of map
+ * verifier type: CONST_PTR_TO_MAP
 */
 #define BPF_PSEUDO_MAP_FD 1
+/* insn[0].src_reg: BPF_PSEUDO_MAP_VALUE
+ * insn[0].imm: map fd
+ * insn[1].imm: offset into value
+ * insn[0].off: 0
+ * insn[1].off: 0
+ * ldimm64 rewrite: address of map[0]+offset
+ * verifier type: PTR_TO_MAP_VALUE
+ */
 #define BPF_PSEUDO_MAP_VALUE 2
+/* insn[0].src_reg: BPF_PSEUDO_BTF_ID
+ * insn[0].imm: kernel btf id of VAR
+ * insn[1].imm: 0
+ * insn[0].off: 0
+ * insn[1].off: 0
+ * ldimm64 rewrite: address of the kernel variable
+ * verifier type: PTR_TO_BTF_ID or PTR_TO_MEM, depending on whether the var
+ * is struct/union.
+ */
+#define BPF_PSEUDO_BTF_ID 3

 /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative
 * offset to another bpf function
-- cgit v1.2.3


From eaa6bcb71ef6ed3dc18fc525ee7e293b06b4882b Mon Sep 17 00:00:00 2001
From: Hao Luo
Date: Tue, 29 Sep 2020 16:50:47 -0700
Subject: bpf: Introduce bpf_per_cpu_ptr()

Add bpf_per_cpu_ptr() to help bpf programs access percpu vars.
bpf_per_cpu_ptr() has the same semantic as per_cpu_ptr() in the kernel
except that it may return NULL. This happens when the cpu parameter is
out of range. So the caller must check the returned value.

Signed-off-by: Hao Luo
Signed-off-by: Alexei Starovoitov
Acked-by: Andrii Nakryiko
Link: https://lore.kernel.org/bpf/20200929235049.2533242-5-haoluo@google.com
---
 tools/include/uapi/linux/bpf.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'tools/include')

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 2aa156af24d6..f3c1b637ab39 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3686,6 +3686,23 @@ union bpf_attr {
 * Return
 * The helper returns **TC_ACT_REDIRECT** on success or
 * **TC_ACT_SHOT** on error.
+ *
+ * void *bpf_per_cpu_ptr(const void *percpu_ptr, u32 cpu)
+ * Description
+ * Take a pointer to a percpu ksym, *percpu_ptr*, and return a
+ * pointer to the percpu kernel variable on *cpu*. A ksym is an
+ * extern variable decorated with '__ksym'. For ksym, there is a
+ * global var (either static or global) defined of the same name
+ * in the kernel. The ksym is percpu if the global var is percpu.
+ * The returned pointer points to the global percpu var on *cpu*.
+ *
+ * bpf_per_cpu_ptr() has the same semantic as per_cpu_ptr() in the
+ * kernel, except that bpf_per_cpu_ptr() may return NULL. This
+ * happens if *cpu* is larger than nr_cpu_ids. The caller of
+ * bpf_per_cpu_ptr() must check the returned value.
+ * Return
+ * A pointer pointing to the kernel percpu variable on *cpu*, or
+ * NULL, if *cpu* is invalid.
 */
 #define __BPF_FUNC_MAPPER(FN) \
 FN(unspec), \
@@ -3841,6 +3858,7 @@ union bpf_attr {
 FN(seq_printf_btf), \
 FN(skb_cgroup_classid), \
 FN(redirect_neigh), \
+ FN(bpf_per_cpu_ptr), \
 /* */

 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
-- cgit v1.2.3


From 63d9b80dcf2c67bc5ade61cbbaa09d7af21f43f1 Mon Sep 17 00:00:00 2001
From: Hao Luo
Date: Tue, 29 Sep 2020 16:50:48 -0700
Subject: bpf: Introduce bpf_this_cpu_ptr()

Add bpf_this_cpu_ptr() to help access percpu vars on this cpu. This
helper always returns a valid pointer, therefore there is no need to
check the returned value for NULL. Also note that all programs run
with preemption disabled, which means that the returned pointer is
stable during all the execution of the program.

Signed-off-by: Hao Luo
Signed-off-by: Alexei Starovoitov
Acked-by: Andrii Nakryiko
Link: https://lore.kernel.org/bpf/20200929235049.2533242-6-haoluo@google.com
---
 tools/include/uapi/linux/bpf.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'tools/include')

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index f3c1b637ab39..c446394135be 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3703,6 +3703,18 @@ union bpf_attr {
 * Return
 * A pointer pointing to the kernel percpu variable on *cpu*, or
 * NULL, if *cpu* is invalid.
+ *
+ * void *bpf_this_cpu_ptr(const void *percpu_ptr)
+ * Description
+ * Take a pointer to a percpu ksym, *percpu_ptr*, and return a
+ * pointer to the percpu kernel variable on this cpu. See the
+ * description of 'ksym' in **bpf_per_cpu_ptr**\ ().
+ *
+ * bpf_this_cpu_ptr() has the same semantic as this_cpu_ptr() in
+ * the kernel. Different from **bpf_per_cpu_ptr**\ (), it would
+ * never return NULL.
+ * Return
+ * A pointer pointing to the kernel percpu variable on this cpu.
 */
 #define __BPF_FUNC_MAPPER(FN) \
 FN(unspec), \
@@ -3859,6 +3871,7 @@ union bpf_attr {
 FN(skb_cgroup_classid), \
 FN(redirect_neigh), \
 FN(bpf_per_cpu_ptr), \
+ FN(bpf_this_cpu_ptr), \
 /* */

 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
-- cgit v1.2.3


From 49f3d12b0f70ea867b891ad2a97f6e51bb564e18 Mon Sep 17 00:00:00 2001
From: Jakub Wilk
Date: Wed, 7 Oct 2020 07:57:17 +0200
Subject: bpf: Fix typo in uapi/linux/bpf.h

Reported-by: Samanta Navarro
Signed-off-by: Jakub Wilk
Signed-off-by: Alexei Starovoitov
Acked-by: Yonghong Song
Link: https://lore.kernel.org/bpf/20201007055717.7319-1-jwilk@jwilk.net
---
 tools/include/uapi/linux/bpf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools/include')

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index c446394135be..d83561e8cd2c 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2253,7 +2253,7 @@ union bpf_attr {
 * Description
 * This helper is used in programs implementing policies at the
 * skb socket level. If the sk_buff *skb* is allowed to pass (i.e.
- * if the verdeict eBPF program returns **SK_PASS**), redirect it
+ * if the verdict eBPF program returns **SK_PASS**), redirect it
 * to the socket referenced by *map* (of type
 * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and
 * egress interfaces can be used for redirection. The
-- cgit v1.2.3


From eca43ee6c46db92dd850ce659316b0680d70e137 Mon Sep 17 00:00:00 2001
From: "Nikita V. Shirokov"
Date: Fri, 9 Oct 2020 07:03:25 +0000
Subject: bpf: Add tcp_notsent_lowat bpf setsockopt

Adding support for TCP_NOTSENT_LOWAT sockoption
(https://lwn.net/Articles/560082/) in tcp bpf programs.

Signed-off-by: Nikita V. Shirokov
Signed-off-by: Daniel Borkmann
Acked-by: Yonghong Song
Link: https://lore.kernel.org/bpf/20201009070325.226855-1-tehnerd@tehnerd.com
---
 tools/include/uapi/linux/bpf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'tools/include')

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index d83561e8cd2c..42d2df799397 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1698,7 +1698,7 @@ union bpf_attr {
 * **TCP_CONGESTION**, **TCP_BPF_IW**,
 * **TCP_BPF_SNDCWND_CLAMP**, **TCP_SAVE_SYN**,
 * **TCP_KEEPIDLE**, **TCP_KEEPINTVL**, **TCP_KEEPCNT**,
- * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**.
+ * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**.
 * * **IPPROTO_IP**, which supports *optname* **IP_TOS**.
 * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**.
 * Return
-- cgit v1.2.3


From dd2ce6a5373c6f5c830be54be10775458a8bd312 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Sun, 11 Oct 2020 01:40:01 +0200
Subject: bpf: Improve bpf_redirect_neigh helper description

Follow-up to address David's feedback that we should better describe
internals of the bpf_redirect_neigh() helper.

Suggested-by: David Ahern
Signed-off-by: Daniel Borkmann
Signed-off-by: Alexei Starovoitov
Reviewed-by: David Ahern
Link: https://lore.kernel.org/bpf/20201010234006.7075-2-daniel@iogearbox.net
---
 tools/include/uapi/linux/bpf.h | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'tools/include')

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 42d2df799397..4272cc53d478 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3679,10 +3679,14 @@ union bpf_attr {
 * Redirect the packet to another net device of index *ifindex*
 * and fill in L2 addresses from neighboring subsystem. This helper
 * is somewhat similar to **bpf_redirect**\ (), except that it
- * fills in e.g. MAC addresses based on the L3 information from
- * the packet. This helper is supported for IPv4 and IPv6 protocols.
+ * populates L2 addresses as well, meaning, internally, the helper
+ * performs a FIB lookup based on the skb's networking header to
+ * get the address of the next hop and then relies on the neighbor
+ * lookup for the L2 address of the nexthop.
+ *
 * The *flags* argument is reserved and must be 0. The helper is
- * currently only supported for tc BPF program types.
+ * currently only supported for tc BPF program types, and enabled
+ * for IPv4 and IPv6 protocols.
 * Return
 * The helper returns **TC_ACT_REDIRECT** on success or
 * **TC_ACT_SHOT** on error.
-- cgit v1.2.3


From 9aa1206e8f48222f35a0c809f33b2f4aaa1e2661 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Sun, 11 Oct 2020 01:40:02 +0200
Subject: bpf: Add redirect_peer helper

Add an efficient ingress to ingress netns switch that can be used out
of tc BPF programs in order to redirect traffic from host ns ingress
into a container veth device ingress without having to go via CPU
backlog queue [0]. For local containers this can also be utilized and
path via CPU backlog queue only needs to be taken once, not twice.
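
For illustration only (not from the original patch), a minimal tc
sketch attached at the host-facing veth ingress; HOST_VETH_IFINDEX is
a hypothetical constant naming that device:

	SEC("tc")
	int tc_host_ingress(struct __sk_buff *skb)
	{
		/* Switch to the veth peer's ingress inside the container
		 * netns, skipping the CPU backlog queue entirely.
		 */
		return bpf_redirect_peer(HOST_VETH_IFINDEX, 0);
	}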
On a high level this borrows from ipvlan which does a similar switch in
__netif_receive_skb_core() and then iterates via another_round. This
helps to reduce latency for mentioned use cases.

Pod to remote pod with redirect(), TCP_RR [1]:

  # percpu_netperf 10.217.1.33
          RT_LATENCY:         122.450   (per CPU:  122.666  122.401  122.333  122.401 )
        MEAN_LATENCY:         121.210   (per CPU:  121.100  121.260  121.320  121.160 )
      STDDEV_LATENCY:         120.040   (per CPU:  119.420  119.910  125.460  115.370 )
         MIN_LATENCY:          46.500   (per CPU:   47.000   47.000   47.000   45.000 )
         P50_LATENCY:         118.500   (per CPU:  118.000  119.000  118.000  119.000 )
         P90_LATENCY:         127.500   (per CPU:  127.000  128.000  127.000  128.000 )
         P99_LATENCY:         130.750   (per CPU:  131.000  131.000  129.000  132.000 )
    TRANSACTION_RATE:       32666.400   (per CPU: 8152.200 8169.842 8174.439 8169.897 )

Pod to remote pod with redirect_peer(), TCP_RR:

  # percpu_netperf 10.217.1.33
          RT_LATENCY:          44.449   (per CPU:   43.767   43.127   45.279   45.622 )
        MEAN_LATENCY:          45.065   (per CPU:   44.030   45.530   45.190   45.510 )
      STDDEV_LATENCY:          84.823   (per CPU:   66.770   97.290   84.380   90.850 )
         MIN_LATENCY:          33.500   (per CPU:   33.000   33.000   34.000   34.000 )
         P50_LATENCY:          43.250   (per CPU:   43.000   43.000   43.000   44.000 )
         P90_LATENCY:          46.750   (per CPU:   46.000   47.000   47.000   47.000 )
         P99_LATENCY:          52.750   (per CPU:   51.000   54.000   53.000   53.000 )
    TRANSACTION_RATE:       90039.500   (per CPU: 22848.186 23187.089 22085.077 21919.130 )

[0] https://linuxplumbersconf.org/event/7/contributions/674/attachments/568/1002/plumbers_2020_cilium_load_balancer.pdf
[1] https://github.com/borkmann/netperf_scripts/blob/master/percpu_netperf

Signed-off-by: Daniel Borkmann
Signed-off-by: Alexei Starovoitov
Link: https://lore.kernel.org/bpf/20201010234006.7075-3-daniel@iogearbox.net
---
 tools/include/uapi/linux/bpf.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'tools/include')

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 4272cc53d478..b97bc5abb3b8 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3719,6 +3719,22 @@ union bpf_attr {
 * never return NULL.
 * Return
 * A pointer pointing to the kernel percpu variable on this cpu.
+ *
+ * long bpf_redirect_peer(u32 ifindex, u64 flags)
+ * Description
+ * Redirect the packet to another net device of index *ifindex*.
+ * This helper is somewhat similar to **bpf_redirect**\ (), except
+ * that the redirection happens to the *ifindex*' peer device and
+ * the netns switch takes place from ingress to ingress without
+ * going through the CPU's backlog queue.
+ *
+ * The *flags* argument is reserved and must be 0. The helper is
+ * currently only supported for tc BPF program types at the ingress
+ * hook and for veth device types. The peer device must reside in a
+ * different network namespace.
+ * Return
+ * The helper returns **TC_ACT_REDIRECT** on success or
+ * **TC_ACT_SHOT** on error.
 */
 #define __BPF_FUNC_MAPPER(FN) \
 FN(unspec), \
@@ -3876,6 +3892,7 @@ union bpf_attr {
 FN(redirect_neigh), \
 FN(bpf_per_cpu_ptr), \
 FN(bpf_this_cpu_ptr), \
+ FN(redirect_peer), \
 /* */

 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
-- cgit v1.2.3


From 4a8f87e60f6db40e640f1db555d063b2c4dea5f1 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Sun, 11 Oct 2020 01:40:03 +0200
Subject: bpf: Allow for map-in-map with dynamic inner array map entries

Recent work in f4d05259213f ("bpf: Add map_meta_equal map ops") and
134fede4eecf ("bpf: Relax max_entries check for most of the inner map
types") added support for dynamic inner max elements for most
map-in-map types. Exceptions were maps like array or prog array where
the map_gen_lookup() callback uses the maps' max_entries field as a
constant when emitting instructions.

We recently implemented Maglev consistent hashing into Cilium's load
balancer which uses map-in-map with an outer map being hash and inner
being array holding the Maglev backend table for each service. This
has been designed this way in order to reduce overall memory
consumption given the outer hash map allows us to avoid preallocating
a large, flat memory area for all services. Also, the number of
service mappings is not always known a-priori.

The use case for dynamic inner array map entries is to further reduce
memory overhead, for example, some services might just have a small
number of back ends while others could have a large number. Right now
the Maglev backend table for small and large numbers of backends would
need to have the same inner array map entries, which adds a lot of
unneeded overhead.

Dynamic inner array map entries can be realized by avoiding the
inlined code generation for their lookup. The lookup will still be
efficient since it will be calling into array_map_lookup_elem()
directly and thus avoiding retpoline. The patch adds a BPF_F_INNER_MAP
flag to map creation which therefore skips inline code generation and
relaxes the array_map_meta_equal() check to ignore both maps'
max_entries. This also still allows faster lookups for map-in-map when
BPF_F_INNER_MAP is not specified and hence dynamic max_entries is not
needed.

Example code generation where inner map is dynamic sized array:

  # bpftool p d x i 125
  int handle__sys_enter(void * ctx):
  ; int handle__sys_enter(void *ctx)
     0: (b4) w1 = 0
  ; int key = 0;
     1: (63) *(u32 *)(r10 -4) = r1
     2: (bf) r2 = r10
  ;
     3: (07) r2 += -4
  ; inner_map = bpf_map_lookup_elem(&outer_arr_dyn, &key);
     4: (18) r1 = map[id:468]
     6: (07) r1 += 272
     7: (61) r0 = *(u32 *)(r2 +0)
     8: (35) if r0 >= 0x3 goto pc+5
     9: (67) r0 <<= 3
    10: (0f) r0 += r1
    11: (79) r0 = *(u64 *)(r0 +0)
    12: (15) if r0 == 0x0 goto pc+1
    13: (05) goto pc+1
    14: (b7) r0 = 0
    15: (b4) w6 = -1
  ; if (!inner_map)
    16: (15) if r0 == 0x0 goto pc+6
    17: (bf) r2 = r10
  ;
    18: (07) r2 += -4
  ; val = bpf_map_lookup_elem(inner_map, &key);
    19: (bf) r1 = r0                               |  No inlining but instead
    20: (85) call array_map_lookup_elem#149280     |  call to array_map_lookup_elem()
  ; return val ? *val : -1;                        |  for inner array lookup.
    21: (15) if r0 == 0x0 goto pc+1
  ; return val ? *val : -1;
    22: (61) r6 = *(u32 *)(r0 +0)
  ; }
    23: (bc) w0 = w6
    24: (95) exit

Signed-off-by: Daniel Borkmann
Signed-off-by: Alexei Starovoitov
Acked-by: Andrii Nakryiko
Link: https://lore.kernel.org/bpf/20201010234006.7075-4-daniel@iogearbox.net
---
 tools/include/uapi/linux/bpf.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'tools/include')

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index b97bc5abb3b8..bf5a99d803e4 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -435,6 +435,9 @@ enum {
 /* Share perf_event among processes */
 BPF_F_PRESERVE_ELEMS = (1U << 11),
+
+/* Create a map that is suitable to be an inner map with dynamic max entries */
+ BPF_F_INNER_MAP = (1U << 12),
 };

 /* Flags for BPF_PROG_QUERY. */
-- cgit v1.2.3
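
As a closing illustration (not from the patch), a BTF-defined
map-in-map sketch assuming libbpf's __array() support; the inner
prototype carries BPF_F_INNER_MAP, so inner arrays with differing
max_entries can be inserted at runtime:

	struct inner_arr {
		__uint(type, BPF_MAP_TYPE_ARRAY);
		__uint(map_flags, BPF_F_INNER_MAP); /* skip inlined lookup */
		__uint(max_entries, 1); /* runtime inner sizes may differ */
		__type(key, __u32);
		__type(value, __u64);
	} inner SEC(".maps");

	struct {
		__uint(type, BPF_MAP_TYPE_HASH_OF_MAPS);
		__uint(max_entries, 256);
		__type(key, __u32);
		__array(values, struct inner_arr);
	} outer SEC(".maps");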