From 36fbf1e52bd3ff8a5cb604955eedfc9350c2e6cc Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 30 Sep 2019 11:48:16 +0200 Subject: net: rtnetlink: add linkprop commands to add and delete alternative ifnames Add two commands to add and delete list of link properties. Implement the first property type along - alternative ifnames. Each net device can have multiple alternative names. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/if.h | 1 + include/uapi/linux/if_link.h | 2 ++ include/uapi/linux/rtnetlink.h | 7 +++++++ 3 files changed, 10 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/if.h b/include/uapi/linux/if.h index 7fea0fd7d6f5..4bf33344aab1 100644 --- a/include/uapi/linux/if.h +++ b/include/uapi/linux/if.h @@ -33,6 +33,7 @@ #define IFNAMSIZ 16 #endif /* __UAPI_DEF_IF_IFNAMSIZ */ #define IFALIASZ 256 +#define ALTIFNAMSIZ 128 #include /* For glibc compatibility. An empty enum does not compile. */ diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 4a8c02cafa9a..8aec8769d944 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -167,6 +167,8 @@ enum { IFLA_NEW_IFINDEX, IFLA_MIN_MTU, IFLA_MAX_MTU, + IFLA_PROP_LIST, + IFLA_ALT_IFNAME, /* Alternative ifname */ __IFLA_MAX }; diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index ce2a623abb75..1418a8362bb7 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -164,6 +164,13 @@ enum { RTM_GETNEXTHOP, #define RTM_GETNEXTHOP RTM_GETNEXTHOP + RTM_NEWLINKPROP = 108, +#define RTM_NEWLINKPROP RTM_NEWLINKPROP + RTM_DELLINKPROP, +#define RTM_DELLINKPROP RTM_DELLINKPROP + RTM_GETLINKPROP, +#define RTM_GETLINKPROP RTM_GETLINKPROP + __RTM_MAX, #define RTM_MAX (((__RTM_MAX + 3) & ~3) - 1) }; -- cgit v1.2.3 From 4b76f9ed47074990d85c2ec52288a7d09c3ea357 Mon Sep 17 00:00:00 2001 From: Sunil Dutt Date: Fri, 13 Sep 2019 18:11:44 +0530 Subject: nl80211: Document the expectation for NL80211_ATTR_IE in NL80211_CMD_CONNECT This commit documents the expectation for NL80211_ATTR_IE when included in NL80211_CMD_CONNECT, as following. Driver shall not modify the IEs specified through NL80211_ATTR_IE if NL80211_ATTR_MAC is included. However, if NL80211_ATTR_MAC_HINT is included, these IEs through NL80211_ATTR_IE are specified by the user space based on the best possible BSS selected. Thus, if the driver ends up selecting a different BSS, it can modify these IEs accordingly (e.g. userspace asks the driver to perform PMKSA caching with BSS1 and the driver ends up selecting BSS2 with different PMKSA cache entry. RSNIE has to get updated with the apt PMKID). Signed-off-by: Sunil Dutt Link: https://lore.kernel.org/r/1568378504-15179-1-git-send-email-usdutt@codeaurora.org Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index beee59c831a7..64135ab3a7ac 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -571,6 +571,14 @@ * set of BSSID,frequency parameters is used (i.e., either the enforcing * %NL80211_ATTR_MAC,%NL80211_ATTR_WIPHY_FREQ or the less strict * %NL80211_ATTR_MAC_HINT and %NL80211_ATTR_WIPHY_FREQ_HINT). + * Driver shall not modify the IEs specified through %NL80211_ATTR_IE if + * %NL80211_ATTR_MAC is included. However, if %NL80211_ATTR_MAC_HINT is + * included, these IEs through %NL80211_ATTR_IE are specified by the user + * space based on the best possible BSS selected. Thus, if the driver ends + * up selecting a different BSS, it can modify these IEs accordingly (e.g. + * userspace asks the driver to perform PMKSA caching with BSS1 and the + * driver ends up selecting BSS2 with different PMKSA cache entry; RSNIE + * has to get updated with the apt PMKID). * %NL80211_ATTR_PREV_BSSID can be used to request a reassociation within * the ESS in case the device is already associated and an association with * a different BSS is desired. -- cgit v1.2.3 From 070c63f20f6c739a3c534555f56c7327536bfcc2 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Oct 2019 11:49:39 +0200 Subject: net: devlink: allow to change namespaces during reload All devlink instances are created in init_net and stay there for a lifetime. Allow user to be able to move devlink instances into namespaces during devlink reload operation. That ensures proper re-instantiation of driver objects, including netdevices. Signed-off-by: Jiri Pirko Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/uapi/linux/devlink.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 580b7a2e40e1..b558ea88b766 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -421,6 +421,10 @@ enum devlink_attr { DEVLINK_ATTR_RELOAD_FAILED, /* u8 0 or 1 */ + DEVLINK_ATTR_NETNS_FD, /* u32 */ + DEVLINK_ATTR_NETNS_PID, /* u32 */ + DEVLINK_ATTR_NETNS_ID, /* u32 */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, -- cgit v1.2.3 From d6547f2a2cfc8b145b59291d3e4b072891f34882 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 3 Oct 2019 23:29:24 +0300 Subject: net, uapi: fix -Wpointer-arith warnings Add casts to fix these warnings: ./usr/include/linux/netfilter_arp/arp_tables.h:200:19: error: pointer of type 'void *' used in arithmetic [-Werror=pointer-arith] ./usr/include/linux/netfilter_bridge/ebtables.h:197:19: error: pointer of type 'void *' used in arithmetic [-Werror=pointer-arith] ./usr/include/linux/netfilter_ipv4/ip_tables.h:223:19: error: pointer of type 'void *' used in arithmetic [-Werror=pointer-arith] ./usr/include/linux/netfilter_ipv6/ip6_tables.h:263:19: error: pointer of type 'void *' used in arithmetic [-Werror=pointer-arith] ./usr/include/linux/tipc_config.h:310:28: error: pointer of type 'void *' used in arithmetic [-Werror=pointer-arith] ./usr/include/linux/tipc_config.h:410:24: error: pointer of type 'void *' used in arithmetic [-Werror=pointer-arith] ./usr/include/linux/virtio_ring.h:170:16: error: pointer of type 'void *' used in arithmetic [-Werror=pointer-arith] Those are theoretical probably but kernel doesn't control compiler flags in userspace. Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- include/uapi/linux/netfilter_arp/arp_tables.h | 2 +- include/uapi/linux/netfilter_bridge/ebtables.h | 2 +- include/uapi/linux/netfilter_ipv4/ip_tables.h | 2 +- include/uapi/linux/netfilter_ipv6/ip6_tables.h | 2 +- include/uapi/linux/tipc_config.h | 4 ++-- include/uapi/linux/virtio_ring.h | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter_arp/arp_tables.h b/include/uapi/linux/netfilter_arp/arp_tables.h index a2a0927d9bd6..bbf5af2b67a8 100644 --- a/include/uapi/linux/netfilter_arp/arp_tables.h +++ b/include/uapi/linux/netfilter_arp/arp_tables.h @@ -199,7 +199,7 @@ struct arpt_get_entries { /* Helper functions */ static __inline__ struct xt_entry_target *arpt_get_target(struct arpt_entry *e) { - return (void *)e + e->target_offset; + return (struct xt_entry_target *)((char *)e + e->target_offset); } /* diff --git a/include/uapi/linux/netfilter_bridge/ebtables.h b/include/uapi/linux/netfilter_bridge/ebtables.h index 8076c940ffeb..a494cf43a755 100644 --- a/include/uapi/linux/netfilter_bridge/ebtables.h +++ b/include/uapi/linux/netfilter_bridge/ebtables.h @@ -194,7 +194,7 @@ struct ebt_entry { static __inline__ struct ebt_entry_target * ebt_get_target(struct ebt_entry *e) { - return (void *)e + e->target_offset; + return (struct ebt_entry_target *)((char *)e + e->target_offset); } /* {g,s}etsockopt numbers */ diff --git a/include/uapi/linux/netfilter_ipv4/ip_tables.h b/include/uapi/linux/netfilter_ipv4/ip_tables.h index 6aaeb14bfce1..50c7fee625ae 100644 --- a/include/uapi/linux/netfilter_ipv4/ip_tables.h +++ b/include/uapi/linux/netfilter_ipv4/ip_tables.h @@ -222,7 +222,7 @@ struct ipt_get_entries { static __inline__ struct xt_entry_target * ipt_get_target(struct ipt_entry *e) { - return (void *)e + e->target_offset; + return (struct xt_entry_target *)((char *)e + e->target_offset); } /* diff --git a/include/uapi/linux/netfilter_ipv6/ip6_tables.h b/include/uapi/linux/netfilter_ipv6/ip6_tables.h index 031d0a43bed2..d9e364f96a5c 100644 --- a/include/uapi/linux/netfilter_ipv6/ip6_tables.h +++ b/include/uapi/linux/netfilter_ipv6/ip6_tables.h @@ -262,7 +262,7 @@ struct ip6t_get_entries { static __inline__ struct xt_entry_target * ip6t_get_target(struct ip6t_entry *e) { - return (void *)e + e->target_offset; + return (struct xt_entry_target *)((char *)e + e->target_offset); } /* diff --git a/include/uapi/linux/tipc_config.h b/include/uapi/linux/tipc_config.h index 4955e1a9f1bc..4dfc05651c98 100644 --- a/include/uapi/linux/tipc_config.h +++ b/include/uapi/linux/tipc_config.h @@ -309,7 +309,7 @@ static inline int TLV_SET(void *tlv, __u16 type, void *data, __u16 len) tlv_ptr->tlv_len = htons(tlv_len); if (len && data) { memcpy(TLV_DATA(tlv_ptr), data, len); - memset(TLV_DATA(tlv_ptr) + len, 0, TLV_SPACE(len) - tlv_len); + memset((char *)TLV_DATA(tlv_ptr) + len, 0, TLV_SPACE(len) - tlv_len); } return TLV_SPACE(len); } @@ -409,7 +409,7 @@ static inline int TCM_SET(void *msg, __u16 cmd, __u16 flags, tcm_hdr->tcm_flags = htons(flags); if (data_len && data) { memcpy(TCM_DATA(msg), data, data_len); - memset(TCM_DATA(msg) + data_len, 0, TCM_SPACE(data_len) - msg_len); + memset((char *)TCM_DATA(msg) + data_len, 0, TCM_SPACE(data_len) - msg_len); } return TCM_SPACE(data_len); } diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h index 4c4e24c291a5..559f42e73315 100644 --- a/include/uapi/linux/virtio_ring.h +++ b/include/uapi/linux/virtio_ring.h @@ -169,7 +169,7 @@ static inline void vring_init(struct vring *vr, unsigned int num, void *p, { vr->num = num; vr->desc = p; - vr->avail = p + num*sizeof(struct vring_desc); + vr->avail = (struct vring_avail *)((char *)p + num * sizeof(struct vring_desc)); vr->used = (void *)(((uintptr_t)&vr->avail->ring[num] + sizeof(__virtio16) + align-1) & ~(align - 1)); } -- cgit v1.2.3 From d26b698dd3cd52f5a3277446a87e5e0198c99cd0 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Oct 2019 16:19:24 -0700 Subject: net/tls: add skeleton of MIB statistics Add a skeleton structure for adding TLS statistics. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/uapi/linux/snmp.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 549a31c29f7d..4abd57948ad4 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -323,4 +323,11 @@ enum __LINUX_MIB_XFRMMAX }; +/* linux TLS mib definitions */ +enum +{ + LINUX_MIB_TLSNUM = 0, + __LINUX_MIB_TLSMAX +}; + #endif /* _LINUX_SNMP_H */ -- cgit v1.2.3 From b32fd3cc31d723bf2ab859667be3612c0086ec72 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Oct 2019 16:19:25 -0700 Subject: net/tls: add statistics for installed sessions Add SNMP stats for number of sockets with successfully installed sessions. Break them down to software and hardware ones. Note that if hardware offload fails stack uses software implementation, and counts the session appropriately. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/uapi/linux/snmp.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 4abd57948ad4..1b4613b5af70 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -327,6 +327,14 @@ enum enum { LINUX_MIB_TLSNUM = 0, + LINUX_MIB_TLSCURRTXSW, /* TlsCurrTxSw */ + LINUX_MIB_TLSCURRRXSW, /* TlsCurrRxSw */ + LINUX_MIB_TLSCURRTXDEVICE, /* TlsCurrTxDevice */ + LINUX_MIB_TLSCURRRXDEVICE, /* TlsCurrRxDevice */ + LINUX_MIB_TLSTXSW, /* TlsTxSw */ + LINUX_MIB_TLSRXSW, /* TlsRxSw */ + LINUX_MIB_TLSTXDEVICE, /* TlsTxDevice */ + LINUX_MIB_TLSRXDEVICE, /* TlsRxDevice */ __LINUX_MIB_TLSMAX }; -- cgit v1.2.3 From 5c5ec66858062a857cf51f57cbe52b36330f7ae6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Oct 2019 16:19:26 -0700 Subject: net/tls: add TlsDecryptError stat Add a statistic for TLS record decryption errors. Since devices are supposed to pass records as-is when they encounter errors this statistic will count bad records in both pure software and inline crypto configurations. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/uapi/linux/snmp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 1b4613b5af70..c9e4963e26f0 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -335,6 +335,7 @@ enum LINUX_MIB_TLSRXSW, /* TlsRxSw */ LINUX_MIB_TLSTXDEVICE, /* TlsTxDevice */ LINUX_MIB_TLSRXDEVICE, /* TlsRxDevice */ + LINUX_MIB_TLSDECRYPTERROR, /* TlsDecryptError */ __LINUX_MIB_TLSMAX }; -- cgit v1.2.3 From a4d26fdbc2a5414bb1b67198656cc7e24a4a3c3a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Oct 2019 16:19:27 -0700 Subject: net/tls: add TlsDeviceRxResync statistic Add a statistic for number of RX resyncs sent down to the NIC. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/uapi/linux/snmp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index c9e4963e26f0..7eee233e78d2 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -336,6 +336,7 @@ enum LINUX_MIB_TLSTXDEVICE, /* TlsTxDevice */ LINUX_MIB_TLSRXDEVICE, /* TlsRxDevice */ LINUX_MIB_TLSDECRYPTERROR, /* TlsDecryptError */ + LINUX_MIB_TLSRXDEVICERESYNC, /* TlsRxDeviceResync */ __LINUX_MIB_TLSMAX }; -- cgit v1.2.3 From 5f0e5412781b01708f622d00c0b3f77b9dca7367 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 6 Oct 2019 20:07:36 -0700 Subject: uapi/bpf: fix helper docs Various small fixes to BPF helper documentation comments, enabling automatic header generation with a list of BPF helpers. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 77c6be96d676..a65c3b0c6935 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -794,7 +794,7 @@ union bpf_attr { * A 64-bit integer containing the current GID and UID, and * created as such: *current_gid* **<< 32 \|** *current_uid*. * - * int bpf_get_current_comm(char *buf, u32 size_of_buf) + * int bpf_get_current_comm(void *buf, u32 size_of_buf) * Description * Copy the **comm** attribute of the current task into *buf* of * *size_of_buf*. The **comm** attribute contains the name of @@ -1023,7 +1023,7 @@ union bpf_attr { * The realm of the route for the packet associated to *skb*, or 0 * if none was found. * - * int bpf_perf_event_output(struct pt_regs *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * int bpf_perf_event_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) * Description * Write raw *data* blob into a special BPF perf event held by * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf @@ -1068,7 +1068,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len) + * int bpf_skb_load_bytes(const void *skb, u32 offset, void *to, u32 len) * Description * This helper was provided as an easy way to load data from a * packet. It can be used to load *len* bytes from *offset* from @@ -1085,7 +1085,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_get_stackid(struct pt_regs *ctx, struct bpf_map *map, u64 flags) + * int bpf_get_stackid(void *ctx, struct bpf_map *map, u64 flags) * Description * Walk a user or a kernel stack and return its id. To achieve * this, the helper needs *ctx*, which is a pointer to the context @@ -1154,7 +1154,7 @@ union bpf_attr { * The checksum result, or a negative error code in case of * failure. * - * int bpf_skb_get_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size) + * int bpf_skb_get_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) * Description * Retrieve tunnel options metadata for the packet associated to * *skb*, and store the raw tunnel option data to the buffer *opt* @@ -1172,7 +1172,7 @@ union bpf_attr { * Return * The size of the option data retrieved. * - * int bpf_skb_set_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size) + * int bpf_skb_set_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) * Description * Set tunnel options metadata for the packet associated to *skb* * to the option data contained in the raw buffer *opt* of *size*. @@ -1511,7 +1511,7 @@ union bpf_attr { * Return * 0 * - * int bpf_setsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, char *optval, int optlen) + * int bpf_setsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, void *optval, int optlen) * Description * Emulate a call to **setsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1595,7 +1595,7 @@ union bpf_attr { * Return * **XDP_REDIRECT** on success, or **XDP_ABORTED** on error. * - * int bpf_sk_redirect_map(struct bpf_map *map, u32 key, u64 flags) + * int bpf_sk_redirect_map(struct sk_buff *skb, struct bpf_map *map, u32 key, u64 flags) * Description * Redirect the packet to the socket referenced by *map* (of type * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and @@ -1715,7 +1715,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_getsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, char *optval, int optlen) + * int bpf_getsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, void *optval, int optlen) * Description * Emulate a call to **getsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1947,7 +1947,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_get_stack(struct pt_regs *regs, void *buf, u32 size, u64 flags) + * int bpf_get_stack(void *ctx, void *buf, u32 size, u64 flags) * Description * Return a user or a kernel stack in bpf program provided buffer. * To achieve this, the helper needs *ctx*, which is a pointer @@ -1980,7 +1980,7 @@ union bpf_attr { * A non-negative value equal to or less than *size* on success, * or a negative error in case of failure. * - * int bpf_skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header) + * int bpf_skb_load_bytes_relative(const void *skb, u32 offset, void *to, u32 len, u32 start_header) * Description * This helper is similar to **bpf_skb_load_bytes**\ () in that * it provides an easy way to load *len* bytes from *offset* @@ -2033,7 +2033,7 @@ union bpf_attr { * * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the * packet is not forwarded or needs assist from full stack * - * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags) + * int bpf_sock_hash_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) * Description * Add an entry to, or update a sockhash *map* referencing sockets. * The *skops* is used as a new value for the entry associated to @@ -2392,7 +2392,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_msg_push_data(struct sk_buff *skb, u32 start, u32 len, u64 flags) + * int bpf_msg_push_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) * Description * For socket policies, insert *len* bytes into *msg* at offset * *start*. @@ -2408,9 +2408,9 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 pop, u64 flags) + * int bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) * Description - * Will remove *pop* bytes from a *msg* starting at byte *start*. + * Will remove *len* bytes from a *msg* starting at byte *start*. * This may result in **ENOMEM** errors under certain situations if * an allocation and copy are required due to a full ring buffer. * However, the helper will try to avoid doing the allocation @@ -2505,7 +2505,7 @@ union bpf_attr { * A **struct bpf_tcp_sock** pointer on success, or **NULL** in * case of failure. * - * int bpf_skb_ecn_set_ce(struct sk_buf *skb) + * int bpf_skb_ecn_set_ce(struct sk_buff *skb) * Description * Set ECN (Explicit Congestion Notification) field of IP header * to **CE** (Congestion Encountered) if current value is **ECT** -- cgit v1.2.3 From b6e6b5f1da7e8d092f86a4351802c27c0170c5a5 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 8 Oct 2019 19:27:36 +0800 Subject: sctp: add SCTP_SEND_FAILED_EVENT event This patch is to add a new event SCTP_SEND_FAILED_EVENT described in rfc6458#section-6.1.11. It's a update of SCTP_SEND_FAILED event: struct sctp_sndrcvinfo ssf_info is replaced with struct sctp_sndinfo ssfe_info in struct sctp_send_failed_event. SCTP_SEND_FAILED is being deprecated, but we don't remove it in this patch. Both are being processed in sctp_datamsg_destroy() when the corresp event flag is set. Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: Jakub Kicinski --- include/uapi/linux/sctp.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 6d5b164af55c..6bce7f9837a9 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -449,6 +449,16 @@ struct sctp_send_failed { __u8 ssf_data[0]; }; +struct sctp_send_failed_event { + __u16 ssf_type; + __u16 ssf_flags; + __u32 ssf_length; + __u32 ssf_error; + struct sctp_sndinfo ssfe_info; + sctp_assoc_t ssf_assoc_id; + __u8 ssf_data[0]; +}; + /* * ssf_flags: 16 bits (unsigned integer) * @@ -605,6 +615,7 @@ struct sctp_event_subscribe { __u8 sctp_stream_reset_event; __u8 sctp_assoc_reset_event; __u8 sctp_stream_change_event; + __u8 sctp_send_failure_event_event; }; /* @@ -632,6 +643,7 @@ union sctp_notification { struct sctp_stream_reset_event sn_strreset_event; struct sctp_assoc_reset_event sn_assocreset_event; struct sctp_stream_change_event sn_strchange_event; + struct sctp_send_failed_event sn_send_failed_event; }; /* Section 5.3.1 @@ -667,7 +679,9 @@ enum sctp_sn_type { #define SCTP_ASSOC_RESET_EVENT SCTP_ASSOC_RESET_EVENT SCTP_STREAM_CHANGE_EVENT, #define SCTP_STREAM_CHANGE_EVENT SCTP_STREAM_CHANGE_EVENT - SCTP_SN_TYPE_MAX = SCTP_STREAM_CHANGE_EVENT, + SCTP_SEND_FAILED_EVENT, +#define SCTP_SEND_FAILED_EVENT SCTP_SEND_FAILED_EVENT + SCTP_SN_TYPE_MAX = SCTP_SEND_FAILED_EVENT, #define SCTP_SN_TYPE_MAX SCTP_SN_TYPE_MAX }; -- cgit v1.2.3 From 14af7fd1d4279c8db7fbbb3ca0df3b13179eb502 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sat, 12 Oct 2019 18:27:57 +0200 Subject: ethtool: Add support for 400Gbps (50Gbps per lane) link modes Add support for 400Gbps speed, link modes of 50Gbps per lane Signed-off-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/uapi/linux/ethtool.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 8938b76c4ee3..d4591792f0b4 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1507,6 +1507,11 @@ enum ethtool_link_mode_bit_indices { ETHTOOL_LINK_MODE_200000baseCR4_Full_BIT = 66, ETHTOOL_LINK_MODE_100baseT1_Full_BIT = 67, ETHTOOL_LINK_MODE_1000baseT1_Full_BIT = 68, + ETHTOOL_LINK_MODE_400000baseKR8_Full_BIT = 69, + ETHTOOL_LINK_MODE_400000baseSR8_Full_BIT = 70, + ETHTOOL_LINK_MODE_400000baseLR8_ER8_FR8_Full_BIT = 71, + ETHTOOL_LINK_MODE_400000baseDR8_Full_BIT = 72, + ETHTOOL_LINK_MODE_400000baseCR8_Full_BIT = 73, /* must be last entry */ __ETHTOOL_LINK_MODE_MASK_NBITS @@ -1618,6 +1623,7 @@ enum ethtool_link_mode_bit_indices { #define SPEED_56000 56000 #define SPEED_100000 100000 #define SPEED_200000 200000 +#define SPEED_400000 400000 #define SPEED_UNKNOWN -1 -- cgit v1.2.3 From ccfe29eb29c2edcea6552072ef00ff4117f53e83 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 15 Oct 2019 20:24:58 -0700 Subject: bpf: Add attach_btf_id attribute to program load Add attach_btf_id attribute to prog_load command. It's similar to existing expected_attach_type attribute which is used in several cgroup based program types. Unfortunately expected_attach_type is ignored for tracing programs and cannot be reused for new purpose. Hence introduce attach_btf_id to verify bpf programs against given in-kernel BTF type id at load time. It is strictly checked to be valid for raw_tp programs only. In a later patches it will become: btf_id == 0 semantics of existing raw_tp progs. btd_id > 0 raw_tp with BTF and additional type safety. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191016032505.2089704-5-ast@kernel.org --- include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a65c3b0c6935..3bb2cd1de341 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -420,6 +420,7 @@ union bpf_attr { __u32 line_info_rec_size; /* userspace bpf_line_info size */ __aligned_u64 line_info; /* line info */ __u32 line_info_cnt; /* number of bpf_line_info records */ + __u32 attach_btf_id; /* in-kernel BTF type id to attach to */ }; struct { /* anonymous struct used by BPF_OBJ_* commands */ -- cgit v1.2.3 From a7658e1a4164ce2b9eb4a11aadbba38586e93bd6 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 15 Oct 2019 20:25:04 -0700 Subject: bpf: Check types of arguments passed into helpers Introduce new helper that reuses existing skb perf_event output implementation, but can be called from raw_tracepoint programs that receive 'struct sk_buff *' as tracepoint argument or can walk other kernel data structures to skb pointer. In order to do that teach verifier to resolve true C types of bpf helpers into in-kernel BTF ids. The type of kernel pointer passed by raw tracepoint into bpf program will be tracked by the verifier all the way until it's passed into helper function. For example: kfree_skb() kernel function calls trace_kfree_skb(skb, loc); bpf programs receives that skb pointer and may eventually pass it into bpf_skb_output() bpf helper which in-kernel is implemented via bpf_skb_event_output() kernel function. Its first argument in the kernel is 'struct sk_buff *'. The verifier makes sure that types match all the way. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191016032505.2089704-11-ast@kernel.org --- include/uapi/linux/bpf.h | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3bb2cd1de341..4af8b0819a32 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2751,6 +2751,30 @@ union bpf_attr { * **-EOPNOTSUPP** kernel configuration does not enable SYN cookies * * **-EPROTONOSUPPORT** IP packet version is not 4 or 6 + * + * int bpf_skb_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * Description + * Write raw *data* blob into a special BPF perf event held by + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf + * event must have the following attributes: **PERF_SAMPLE_RAW** + * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and + * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. + * + * The *flags* are used to indicate the index in *map* for which + * the value must be put, masked with **BPF_F_INDEX_MASK**. + * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** + * to indicate that the index of the current CPU core should be + * used. + * + * The value to write, of *size*, is passed through eBPF stack and + * pointed by *data*. + * + * *ctx* is a pointer to in-kernel struct sk_buff. + * + * This helper is similar to **bpf_perf_event_output**\ () but + * restricted to raw_tracepoint bpf programs. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2863,7 +2887,8 @@ union bpf_attr { FN(sk_storage_get), \ FN(sk_storage_delete), \ FN(send_signal), \ - FN(tcp_gen_syncookie), + FN(tcp_gen_syncookie), \ + FN(skb_output), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From d54725cd11a57c30f650260cfb0a92c268bdc3e0 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 16 Oct 2019 14:30:05 +0200 Subject: netfilter: nf_tables: support for multiple devices per netdev hook This patch allows you to register one netdev basechain to multiple devices. This adds a new NFTA_HOOK_DEVS netlink attribute to specify the list of netdevices. Basechains store a list of hooks. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index ed8881ad18ed..81fed16fe2b2 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -144,12 +144,14 @@ enum nft_list_attributes { * @NFTA_HOOK_HOOKNUM: netfilter hook number (NLA_U32) * @NFTA_HOOK_PRIORITY: netfilter hook priority (NLA_U32) * @NFTA_HOOK_DEV: netdevice name (NLA_STRING) + * @NFTA_HOOK_DEVS: list of netdevices (NLA_NESTED) */ enum nft_hook_attributes { NFTA_HOOK_UNSPEC, NFTA_HOOK_HOOKNUM, NFTA_HOOK_PRIORITY, NFTA_HOOK_DEV, + NFTA_HOOK_DEVS, __NFTA_HOOK_MAX }; #define NFTA_HOOK_MAX (__NFTA_HOOK_MAX - 1) -- cgit v1.2.3 From 480274787d7e3458bc5a7cfbbbe07033984ad711 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Wed, 23 Oct 2019 11:09:26 -0400 Subject: tcp: add TCP_INFO status for failed client TFO The TCPI_OPT_SYN_DATA bit as part of tcpi_options currently reports whether or not data-in-SYN was ack'd on both the client and server side. We'd like to gather more information on the client-side in the failure case in order to indicate the reason for the failure. This can be useful for not only debugging TFO, but also for creating TFO socket policies. For example, if a middle box removes the TFO option or drops a data-in-SYN, we can can detect this case, and turn off TFO for these connections saving the extra retransmits. The newly added tcpi_fastopen_client_fail status is 2 bits and has the following 4 states: 1) TFO_STATUS_UNSPEC Catch-all state which includes when TFO is disabled via black hole detection, which is indicated via LINUX_MIB_TCPFASTOPENBLACKHOLE. 2) TFO_COOKIE_UNAVAILABLE If TFO_CLIENT_NO_COOKIE mode is off, this state indicates that no cookie is available in the cache. 3) TFO_DATA_NOT_ACKED Data was sent with SYN, we received a SYN/ACK but it did not cover the data portion. Cookie is not accepted by server because the cookie may be invalid or the server may be overloaded. 4) TFO_SYN_RETRANSMITTED Data was sent with SYN, we received a SYN/ACK which did not cover the data after at least 1 additional SYN was sent (without data). It may be the case that a middle-box is dropping data-in-SYN packets. Thus, it would be more efficient to not use TFO on this connection to avoid extra retransmits during connection establishment. These new fields do not cover all the cases where TFO may fail, but other failures, such as SYN/ACK + data being dropped, will result in the connection not becoming established. And a connection blackhole after session establishment shows up as a stalled connection. Signed-off-by: Jason Baron Cc: Eric Dumazet Cc: Neal Cardwell Cc: Christoph Paasch Cc: Yuchung Cheng Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/uapi/linux/tcp.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 81e697978e8b..74af1f759cee 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -155,6 +155,14 @@ enum { TCP_QUEUES_NR, }; +/* why fastopen failed from client perspective */ +enum tcp_fastopen_client_fail { + TFO_STATUS_UNSPEC, /* catch-all */ + TFO_COOKIE_UNAVAILABLE, /* if not in TFO_CLIENT_NO_COOKIE mode */ + TFO_DATA_NOT_ACKED, /* SYN-ACK did not ack SYN data */ + TFO_SYN_RETRANSMITTED, /* SYN-ACK did not ack SYN data after timeout */ +}; + /* for TCP_INFO socket option */ #define TCPI_OPT_TIMESTAMPS 1 #define TCPI_OPT_SACK 2 @@ -211,7 +219,7 @@ struct tcp_info { __u8 tcpi_backoff; __u8 tcpi_options; __u8 tcpi_snd_wscale : 4, tcpi_rcv_wscale : 4; - __u8 tcpi_delivery_rate_app_limited:1; + __u8 tcpi_delivery_rate_app_limited:1, tcpi_fastopen_client_fail:2; __u32 tcpi_rto; __u32 tcpi_ato; -- cgit v1.2.3 From c199ce4f9dd896c716aece33e6750be34aea1151 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 24 Oct 2019 17:22:01 +0200 Subject: net: Fix misspellings of "configure" and "configuration" Fix various misspellings of "configuration" and "configure". Signed-off-by: Geert Uytterhoeven Acked-by: Kalle Valo Signed-off-by: David S. Miller --- include/uapi/linux/dcbnl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/dcbnl.h b/include/uapi/linux/dcbnl.h index 69df19aa8e72..a791a94013a6 100644 --- a/include/uapi/linux/dcbnl.h +++ b/include/uapi/linux/dcbnl.h @@ -286,7 +286,7 @@ struct dcbmsg { * @DCB_CMD_GNUMTCS: get the number of traffic classes currently supported * @DCB_CMD_SNUMTCS: set the number of traffic classes * @DCB_CMD_GBCN: set backward congestion notification configuration - * @DCB_CMD_SBCN: get backward congestion notification configration. + * @DCB_CMD_SBCN: get backward congestion notification configuration. * @DCB_CMD_GAPP: get application protocol configuration * @DCB_CMD_SAPP: set application protocol configuration * @DCB_CMD_IEEE_SET: set IEEE 802.1Qaz configuration -- cgit v1.2.3 From c0bceb97db9efc72629dd00cd0d9812f24d4ba2d Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Wed, 30 Oct 2019 14:00:41 +0100 Subject: tipc: add smart nagle feature We introduce a feature that works like a combination of TCP_NAGLE and TCP_CORK, but without some of the weaknesses of those. In particular, we will not observe long delivery delays because of delayed acks, since the algorithm itself decides if and when acks are to be sent from the receiving peer. - The nagle property as such is determined by manipulating a new 'maxnagle' field in struct tipc_sock. If certain conditions are met, 'maxnagle' will define max size of the messages which can be bundled. If it is set to zero no messages are ever bundled, implying that the nagle property is disabled. - A socket with the nagle property enabled enters nagle mode when more than 4 messages have been sent out without receiving any data message from the peer. - A socket leaves nagle mode whenever it receives a data message from the peer. In nagle mode, messages smaller than 'maxnagle' are accumulated in the socket write queue. The last buffer in the queue is marked with a new 'ack_required' bit, which forces the receiving peer to send a CONN_ACK message back to the sender upon reception. The accumulated contents of the write queue is transmitted when one of the following events or conditions occur. - A CONN_ACK message is received from the peer. - A data message is received from the peer. - A SOCK_WAKEUP pseudo message is received from the link level. - The write queue contains more than 64 1k blocks of data. - The connection is being shut down. - There is no CONN_ACK message to expect. I.e., there is currently no outstanding message where the 'ack_required' bit was set. As a consequence, the first message added after we enter nagle mode is always sent directly with this bit set. This new feature gives a 50-100% improvement of throughput for small (i.e., less than MTU size) messages, while it might add up to one RTT to latency time when the socket is in nagle mode. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- include/uapi/linux/tipc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h index 7df026ea6aff..76421b878767 100644 --- a/include/uapi/linux/tipc.h +++ b/include/uapi/linux/tipc.h @@ -191,6 +191,7 @@ struct sockaddr_tipc { #define TIPC_GROUP_JOIN 135 /* Takes struct tipc_group_req* */ #define TIPC_GROUP_LEAVE 136 /* No argument */ #define TIPC_SOCK_RECVQ_USED 137 /* Default: none (read only) */ +#define TIPC_NODELAY 138 /* Default: false */ /* * Flag values -- cgit v1.2.3 From abbb0d33632ce931ca9c814813ee131351f6b92f Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Wed, 30 Oct 2019 16:09:05 +0200 Subject: net: sched: extend TCA_ACT space with TCA_ACT_FLAGS Extend TCA_ACT space with nla_bitfield32 flags. Add TCA_ACT_FLAGS_NO_PERCPU_STATS as the only allowed flag. Parse the flags in tcf_action_init_1() and pass resulting value as additional argument to a_o->init(). Signed-off-by: Vlad Buslov Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index a6aa466fac9e..c6ad22f76ede 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -16,9 +16,14 @@ enum { TCA_ACT_STATS, TCA_ACT_PAD, TCA_ACT_COOKIE, + TCA_ACT_FLAGS, __TCA_ACT_MAX }; +#define TCA_ACT_FLAGS_NO_PERCPU_STATS 1 /* Don't use percpu allocator for + * actions stats. + */ + #define TCA_ACT_MAX __TCA_ACT_MAX #define TCA_OLD_COMPAT (TCA_ACT_MAX+1) #define TCA_ACT_MAX_PRIO 32 -- cgit v1.2.3 From f1b9509c2fb0ef4db8d22dac9aef8e856a5d81f6 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 30 Oct 2019 15:32:11 -0700 Subject: bpf: Replace prog_raw_tp+btf_id with prog_tracing The bpf program type raw_tp together with 'expected_attach_type' was the most appropriate api to indicate BTF-enabled raw_tp programs. But during development it became apparent that 'expected_attach_type' cannot be used and new 'attach_btf_id' field had to be introduced. Which means that the information is duplicated in two fields where one of them is ignored. Clean it up by introducing new program type where both 'expected_attach_type' and 'attach_btf_id' fields have specific meaning. In the future 'expected_attach_type' will be extended with other attach points that have similar semantics to raw_tp. This patch is replacing BTF-enabled BPF_PROG_TYPE_RAW_TRACEPOINT with prog_type = BPF_RPOG_TYPE_TRACING expected_attach_type = BPF_TRACE_RAW_TP attach_btf_id = btf_id of raw tracepoint inside the kernel Future patches will add expected_attach_type = BPF_TRACE_FENTRY or BPF_TRACE_FEXIT where programs have the same input context and the same helpers, but different attach points. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191030223212.953010-2-ast@kernel.org --- include/uapi/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4af8b0819a32..a6bf19dabaab 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -173,6 +173,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_CGROUP_SYSCTL, BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, BPF_PROG_TYPE_CGROUP_SOCKOPT, + BPF_PROG_TYPE_TRACING, }; enum bpf_attach_type { @@ -199,6 +200,7 @@ enum bpf_attach_type { BPF_CGROUP_UDP6_RECVMSG, BPF_CGROUP_GETSOCKOPT, BPF_CGROUP_SETSOCKOPT, + BPF_TRACE_RAW_TP, __MAX_BPF_ATTACH_TYPE }; -- cgit v1.2.3 From 6ae08ae3dea2cfa03dd3665a3c8475c2d429ef47 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 2 Nov 2019 00:17:59 +0100 Subject: bpf: Add probe_read_{user, kernel} and probe_read_{user, kernel}_str helpers The current bpf_probe_read() and bpf_probe_read_str() helpers are broken in that they assume they can be used for probing memory access for kernel space addresses /as well as/ user space addresses. However, plain use of probe_kernel_read() for both cases will attempt to always access kernel space address space given access is performed under KERNEL_DS and some archs in-fact have overlapping address spaces where a kernel pointer and user pointer would have the /same/ address value and therefore accessing application memory via bpf_probe_read{,_str}() would read garbage values. Lets fix BPF side by making use of recently added 3d7081822f7f ("uaccess: Add non-pagefault user-space read functions"). Unfortunately, the only way to fix this status quo is to add dedicated bpf_probe_read_{user,kernel}() and bpf_probe_read_{user,kernel}_str() helpers. The bpf_probe_read{,_str}() helpers are kept as-is to retain their current behavior. The two *_user() variants attempt the access always under USER_DS set, the two *_kernel() variants will -EFAULT when accessing user memory if the underlying architecture has non-overlapping address ranges, also avoiding throwing the kernel warning via 00c42373d397 ("x86-64: add warning for non-canonical user access address dereferences"). Fixes: a5e8c07059d0 ("bpf: add bpf_probe_read_str helper") Fixes: 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes") Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/796ee46e948bc808d54891a1108435f8652c6ca4.1572649915.git.daniel@iogearbox.net --- include/uapi/linux/bpf.h | 122 +++++++++++++++++++++++++++++++---------------- 1 file changed, 82 insertions(+), 40 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a6bf19dabaab..df6809a76404 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -563,10 +563,13 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_probe_read(void *dst, u32 size, const void *src) + * int bpf_probe_read(void *dst, u32 size, const void *unsafe_ptr) * Description * For tracing programs, safely attempt to read *size* bytes from - * address *src* and store the data in *dst*. + * kernel space address *unsafe_ptr* and store the data in *dst*. + * + * Generally, use bpf_probe_read_user() or bpf_probe_read_kernel() + * instead. * Return * 0 on success, or a negative error in case of failure. * @@ -1428,45 +1431,14 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_probe_read_str(void *dst, int size, const void *unsafe_ptr) + * int bpf_probe_read_str(void *dst, u32 size, const void *unsafe_ptr) * Description - * Copy a NUL terminated string from an unsafe address - * *unsafe_ptr* to *dst*. The *size* should include the - * terminating NUL byte. In case the string length is smaller than - * *size*, the target is not padded with further NUL bytes. If the - * string length is larger than *size*, just *size*-1 bytes are - * copied and the last byte is set to NUL. - * - * On success, the length of the copied string is returned. This - * makes this helper useful in tracing programs for reading - * strings, and more importantly to get its length at runtime. See - * the following snippet: - * - * :: - * - * SEC("kprobe/sys_open") - * void bpf_sys_open(struct pt_regs *ctx) - * { - * char buf[PATHLEN]; // PATHLEN is defined to 256 - * int res = bpf_probe_read_str(buf, sizeof(buf), - * ctx->di); - * - * // Consume buf, for example push it to - * // userspace via bpf_perf_event_output(); we - * // can use res (the string length) as event - * // size, after checking its boundaries. - * } - * - * In comparison, using **bpf_probe_read()** helper here instead - * to read the string would require to estimate the length at - * compile time, and would often result in copying more memory - * than necessary. + * Copy a NUL terminated string from an unsafe kernel address + * *unsafe_ptr* to *dst*. See bpf_probe_read_kernel_str() for + * more details. * - * Another useful use case is when parsing individual process - * arguments or individual environment variables navigating - * *current*\ **->mm->arg_start** and *current*\ - * **->mm->env_start**: using this helper and the return value, - * one can quickly iterate at the right offset of the memory area. + * Generally, use bpf_probe_read_user_str() or bpf_probe_read_kernel_str() + * instead. * Return * On success, the strictly positive length of the string, * including the trailing NUL character. On error, a negative @@ -2777,6 +2749,72 @@ union bpf_attr { * restricted to raw_tracepoint bpf programs. * Return * 0 on success, or a negative error in case of failure. + * + * int bpf_probe_read_user(void *dst, u32 size, const void *unsafe_ptr) + * Description + * Safely attempt to read *size* bytes from user space address + * *unsafe_ptr* and store the data in *dst*. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr) + * Description + * Safely attempt to read *size* bytes from kernel space address + * *unsafe_ptr* and store the data in *dst*. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_probe_read_user_str(void *dst, u32 size, const void *unsafe_ptr) + * Description + * Copy a NUL terminated string from an unsafe user address + * *unsafe_ptr* to *dst*. The *size* should include the + * terminating NUL byte. In case the string length is smaller than + * *size*, the target is not padded with further NUL bytes. If the + * string length is larger than *size*, just *size*-1 bytes are + * copied and the last byte is set to NUL. + * + * On success, the length of the copied string is returned. This + * makes this helper useful in tracing programs for reading + * strings, and more importantly to get its length at runtime. See + * the following snippet: + * + * :: + * + * SEC("kprobe/sys_open") + * void bpf_sys_open(struct pt_regs *ctx) + * { + * char buf[PATHLEN]; // PATHLEN is defined to 256 + * int res = bpf_probe_read_user_str(buf, sizeof(buf), + * ctx->di); + * + * // Consume buf, for example push it to + * // userspace via bpf_perf_event_output(); we + * // can use res (the string length) as event + * // size, after checking its boundaries. + * } + * + * In comparison, using **bpf_probe_read_user()** helper here + * instead to read the string would require to estimate the length + * at compile time, and would often result in copying more memory + * than necessary. + * + * Another useful use case is when parsing individual process + * arguments or individual environment variables navigating + * *current*\ **->mm->arg_start** and *current*\ + * **->mm->env_start**: using this helper and the return value, + * one can quickly iterate at the right offset of the memory area. + * Return + * On success, the strictly positive length of the string, + * including the trailing NUL character. On error, a negative + * value. + * + * int bpf_probe_read_kernel_str(void *dst, u32 size, const void *unsafe_ptr) + * Description + * Copy a NUL terminated string from an unsafe kernel address *unsafe_ptr* + * to *dst*. Same semantics as with bpf_probe_read_user_str() apply. + * Return + * On success, the strictly positive length of the string, including + * the trailing NUL character. On error, a negative value. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2890,7 +2928,11 @@ union bpf_attr { FN(sk_storage_delete), \ FN(send_signal), \ FN(tcp_gen_syncookie), \ - FN(skb_output), + FN(skb_output), \ + FN(probe_read_user), \ + FN(probe_read_kernel), \ + FN(probe_read_user_str), \ + FN(probe_read_kernel_str), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From b6520fce073b619e6f2c0d510bb3481c9386c70b Mon Sep 17 00:00:00 2001 From: Kristian Evensen Date: Thu, 26 Sep 2019 12:06:45 +0200 Subject: netfilter: ipset: Add wildcard support to net,iface The net,iface equal functions currently compares the full interface names. In several cases, wildcard (or prefix) matching is useful. For example, when converting a large iptables rule-set to make use of ipset, I was able to significantly reduce the number of set elements by making use of wildcard matching. Wildcard matching is enabled by adding "wildcard" when adding an element to a set. Internally, this causes the IPSET_FLAG_IFACE_WILDCARD-flag to be set. When this flag is set, only the initial part of the interface name is used for comparison. Wildcard matching is done per element and not per set, as there are many cases where mixing wildcard and non-wildcard elements are useful. This means that is up to the user to handle (avoid) overlapping interface names. Signed-off-by: Kristian Evensen Signed-off-by: Jozsef Kadlecsik --- include/uapi/linux/netfilter/ipset/ip_set.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/ipset/ip_set.h b/include/uapi/linux/netfilter/ipset/ip_set.h index eea166c52c36..11a72a938eb1 100644 --- a/include/uapi/linux/netfilter/ipset/ip_set.h +++ b/include/uapi/linux/netfilter/ipset/ip_set.h @@ -205,6 +205,8 @@ enum ipset_cadt_flags { IPSET_FLAG_WITH_FORCEADD = (1 << IPSET_FLAG_BIT_WITH_FORCEADD), IPSET_FLAG_BIT_WITH_SKBINFO = 6, IPSET_FLAG_WITH_SKBINFO = (1 << IPSET_FLAG_BIT_WITH_SKBINFO), + IPSET_FLAG_BIT_IFACE_WILDCARD = 7, + IPSET_FLAG_IFACE_WILDCARD = (1 << IPSET_FLAG_BIT_IFACE_WILDCARD), IPSET_FLAG_CADT_MAX = 15, }; -- cgit v1.2.3 From 4d390c287b2f3fbd0bb64c52c1a9418f790986e1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 4 Nov 2019 19:13:13 -0800 Subject: net_sched: do not export gnet_stats_basic_packed to uapi gnet_stats_basic_packed was really meant to be private kernel structure. If this proves to be a problem, we will have to rename the in-kernel version. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/uapi/linux/gen_stats.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/gen_stats.h b/include/uapi/linux/gen_stats.h index 065408e16a80..4eaacdf452e3 100644 --- a/include/uapi/linux/gen_stats.h +++ b/include/uapi/linux/gen_stats.h @@ -26,10 +26,6 @@ struct gnet_stats_basic { __u64 bytes; __u32 packets; }; -struct gnet_stats_basic_packed { - __u64 bytes; - __u32 packets; -} __attribute__ ((packed)); /** * struct gnet_stats_rate_est - rate estimator -- cgit v1.2.3 From b33e699fe43aa63f29113311f69357e119ef5276 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 4 Nov 2019 19:13:15 -0800 Subject: net_sched: add TCA_STATS_PKT64 attribute Now the kernel uses 64bit packet counters in scheduler layer, we want to export these counters to user space. Instead risking breaking user space by adding fields to struct gnet_stats_basic, add a new TCA_STATS_PKT64. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/uapi/linux/gen_stats.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/gen_stats.h b/include/uapi/linux/gen_stats.h index 4eaacdf452e3..852f234f1fd6 100644 --- a/include/uapi/linux/gen_stats.h +++ b/include/uapi/linux/gen_stats.h @@ -13,6 +13,7 @@ enum { TCA_STATS_RATE_EST64, TCA_STATS_PAD, TCA_STATS_BASIC_HW, + TCA_STATS_PKT64, __TCA_STATS_MAX, }; #define TCA_STATS_MAX (__TCA_STATS_MAX - 1) -- cgit v1.2.3 From 4ece477870774698e6e73d5821a3dd1605ca123b Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 6 Nov 2019 17:01:05 +0800 Subject: lwtunnel: add options setting and dumping for geneve To add options setting and dumping, .build_state(), .fill_encap() and .get_encap_size() in ip_tun_lwt_ops needs to be extended: ip_tun_build_state(): ip_tun_parse_opts(): ip_tun_parse_opts_geneve() ip_tun_fill_encap_info(): ip_tun_fill_encap_opts(): ip_tun_fill_encap_opts_geneve() ip_tun_encap_nlsize() ip_tun_opts_nlsize(): if (tun_flags & TUNNEL_GENEVE_OPT) ip_tun_parse_opts(), ip_tun_fill_encap_opts() and ip_tun_opts_nlsize() processes LWTUNNEL_IP_OPTS. ip_tun_parse_opts_geneve(), ip_tun_fill_encap_opts_geneve() and if (tun_flags & TUNNEL_GENEVE_OPT) processes LWTUNNEL_IP_OPTS_GENEVE. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/uapi/linux/lwtunnel.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h index de696ca12f2c..b595ab219036 100644 --- a/include/uapi/linux/lwtunnel.h +++ b/include/uapi/linux/lwtunnel.h @@ -27,6 +27,7 @@ enum lwtunnel_ip_t { LWTUNNEL_IP_TOS, LWTUNNEL_IP_FLAGS, LWTUNNEL_IP_PAD, + LWTUNNEL_IP_OPTS, __LWTUNNEL_IP_MAX, }; @@ -41,11 +42,30 @@ enum lwtunnel_ip6_t { LWTUNNEL_IP6_TC, LWTUNNEL_IP6_FLAGS, LWTUNNEL_IP6_PAD, + LWTUNNEL_IP6_OPTS, __LWTUNNEL_IP6_MAX, }; #define LWTUNNEL_IP6_MAX (__LWTUNNEL_IP6_MAX - 1) +enum { + LWTUNNEL_IP_OPTS_UNSPEC, + LWTUNNEL_IP_OPTS_GENEVE, + __LWTUNNEL_IP_OPTS_MAX, +}; + +#define LWTUNNEL_IP_OPTS_MAX (__LWTUNNEL_IP_OPTS_MAX - 1) + +enum { + LWTUNNEL_IP_OPT_GENEVE_UNSPEC, + LWTUNNEL_IP_OPT_GENEVE_CLASS, + LWTUNNEL_IP_OPT_GENEVE_TYPE, + LWTUNNEL_IP_OPT_GENEVE_DATA, + __LWTUNNEL_IP_OPT_GENEVE_MAX, +}; + +#define LWTUNNEL_IP_OPT_GENEVE_MAX (__LWTUNNEL_IP_OPT_GENEVE_MAX - 1) + enum { LWT_BPF_PROG_UNSPEC, LWT_BPF_PROG_FD, -- cgit v1.2.3 From edf31cbb1502481da181a09148adb33e12599185 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 6 Nov 2019 17:01:06 +0800 Subject: lwtunnel: add options setting and dumping for vxlan Based on the code framework built on the last patch, to support setting and dumping for vxlan, we only need to add ip_tun_parse_opts_vxlan() for .build_state and ip_tun_fill_encap_opts_vxlan() for .fill_encap and if (tun_flags & TUNNEL_VXLAN_OPT) for .get_encap_size. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/uapi/linux/lwtunnel.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h index b595ab219036..638b7b108453 100644 --- a/include/uapi/linux/lwtunnel.h +++ b/include/uapi/linux/lwtunnel.h @@ -51,6 +51,7 @@ enum lwtunnel_ip6_t { enum { LWTUNNEL_IP_OPTS_UNSPEC, LWTUNNEL_IP_OPTS_GENEVE, + LWTUNNEL_IP_OPTS_VXLAN, __LWTUNNEL_IP_OPTS_MAX, }; @@ -66,6 +67,14 @@ enum { #define LWTUNNEL_IP_OPT_GENEVE_MAX (__LWTUNNEL_IP_OPT_GENEVE_MAX - 1) +enum { + LWTUNNEL_IP_OPT_VXLAN_UNSPEC, + LWTUNNEL_IP_OPT_VXLAN_GBP, + __LWTUNNEL_IP_OPT_VXLAN_MAX, +}; + +#define LWTUNNEL_IP_OPT_VXLAN_MAX (__LWTUNNEL_IP_OPT_VXLAN_MAX - 1) + enum { LWT_BPF_PROG_UNSPEC, LWT_BPF_PROG_FD, -- cgit v1.2.3 From b0a21810bd5e1f92e3379899cc8ca9fe144ee8b3 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 6 Nov 2019 17:01:07 +0800 Subject: lwtunnel: add options setting and dumping for erspan Based on the code framework built on the last patch, to support setting and dumping for vxlan, we only need to add ip_tun_parse_opts_erspan() for .build_state and ip_tun_fill_encap_opts_erspan() for .fill_encap and if (tun_flags & TUNNEL_ERSPAN_OPT) for .get_encap_size. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/uapi/linux/lwtunnel.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h index 638b7b108453..f6035f737193 100644 --- a/include/uapi/linux/lwtunnel.h +++ b/include/uapi/linux/lwtunnel.h @@ -52,6 +52,7 @@ enum { LWTUNNEL_IP_OPTS_UNSPEC, LWTUNNEL_IP_OPTS_GENEVE, LWTUNNEL_IP_OPTS_VXLAN, + LWTUNNEL_IP_OPTS_ERSPAN, __LWTUNNEL_IP_OPTS_MAX, }; @@ -75,6 +76,17 @@ enum { #define LWTUNNEL_IP_OPT_VXLAN_MAX (__LWTUNNEL_IP_OPT_VXLAN_MAX - 1) +enum { + LWTUNNEL_IP_OPT_ERSPAN_UNSPEC, + LWTUNNEL_IP_OPT_ERSPAN_VER, + LWTUNNEL_IP_OPT_ERSPAN_INDEX, + LWTUNNEL_IP_OPT_ERSPAN_DIR, + LWTUNNEL_IP_OPT_ERSPAN_HWID, + __LWTUNNEL_IP_OPT_ERSPAN_MAX, +}; + +#define LWTUNNEL_IP_OPT_ERSPAN_MAX (__LWTUNNEL_IP_OPT_ERSPAN_MAX - 1) + enum { LWT_BPF_PROG_UNSPEC, LWT_BPF_PROG_FD, -- cgit v1.2.3 From 14f34e36b36ceede9877ca422a62fcac17b52023 Mon Sep 17 00:00:00 2001 From: Gurumoorthi Gnanasambandhan Date: Thu, 31 Oct 2019 23:46:40 +0200 Subject: cfg80211: VLAN offload support for set_key and set_sta_vlan This provides an alternative mechanism for AP VLAN support where a single netdev is used with VLAN tagged frames instead of separate netdevs for each VLAN without tagged frames from the WLAN driver. By setting NL80211_EXT_FEATURE_VLAN_OFFLOAD flag the driver indicates support for a single netdev with VLAN tagged frames. Separate VLAN-specific netdevs can be added using RTM_NEWLINK/IFLA_VLAN_ID similarly to Ethernet. NL80211_CMD_NEW_KEY (for group keys), NL80211_CMD_NEW_STATION, and NL80211_CMD_SET_STATION will optionally specify vlan_id using NL80211_ATTR_VLAN_ID. Signed-off-by: Gurumoorthi Gnanasambandhan Signed-off-by: Jouni Malinen Link: https://lore.kernel.org/r/20191031214640.5012-1-jouni@codeaurora.org Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 64135ab3a7ac..341e0e8cae46 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -248,6 +248,22 @@ * %NL80211_ATTR_SAE_PASSWORD. */ +/** + * DOC: VLAN offload support for setting group keys and binding STAs to VLANs + * + * By setting @NL80211_EXT_FEATURE_VLAN_OFFLOAD flag drivers can indicate they + * support offloading VLAN functionality in a manner where the driver exposes a + * single netdev that uses VLAN tagged frames and separate VLAN-specific netdevs + * can then be added using RTM_NEWLINK/IFLA_VLAN_ID similarly to the Ethernet + * case. Frames received from stations that are not assigned to any VLAN are + * delivered on the main netdev and frames to such stations can be sent through + * that main netdev. + * + * %NL80211_CMD_NEW_KEY (for group keys), %NL80211_CMD_NEW_STATION, and + * %NL80211_CMD_SET_STATION will optionally specify vlan_id using + * %NL80211_ATTR_VLAN_ID. + */ + /** * enum nl80211_commands - supported nl80211 commands * @@ -2381,6 +2397,9 @@ enum nl80211_commands { * the allowed channel bandwidth configurations. (u8 attribute) * Defined by IEEE P802.11ay/D4.0 section 9.4.2.251, Table 13. * + * @NL80211_ATTR_VLAN_ID: VLAN ID (1..4094) for the station and VLAN group key + * (u16). + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -2843,6 +2862,8 @@ enum nl80211_attrs { NL80211_ATTR_WIPHY_EDMG_CHANNELS, NL80211_ATTR_WIPHY_EDMG_BW_CONFIG, + NL80211_ATTR_VLAN_ID, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -5492,6 +5513,10 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_SAE_OFFLOAD: Device wants to do SAE authentication in * station mode (SAE password is passed as part of the connect command). * + * @NL80211_EXT_FEATURE_VLAN_OFFLOAD: The driver supports a single netdev + * with VLAN tagged frames and separate VLAN-specific netdevs added using + * vconfig similarly to the Ethernet case. + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -5537,6 +5562,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_EXT_KEY_ID, NL80211_EXT_FEATURE_STA_TX_PWR, NL80211_EXT_FEATURE_SAE_OFFLOAD, + NL80211_EXT_FEATURE_VLAN_OFFLOAD, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, -- cgit v1.2.3 From 134bdac397661a5841d9f27f508190c68b26232b Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Fri, 8 Nov 2019 12:05:10 +0700 Subject: tipc: add new AEAD key structure for user API The new structure 'tipc_aead_key' is added to the 'tipc.h' for user to be able to transfer a key to TIPC in kernel. Netlink will be used for this purpose in the later commits. Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: Tuong Lien Signed-off-by: David S. Miller --- include/uapi/linux/tipc.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h index 76421b878767..add01db1daef 100644 --- a/include/uapi/linux/tipc.h +++ b/include/uapi/linux/tipc.h @@ -233,6 +233,27 @@ struct tipc_sioc_nodeid_req { char node_id[TIPC_NODEID_LEN]; }; +/* + * TIPC Crypto, AEAD + */ +#define TIPC_AEAD_ALG_NAME (32) + +struct tipc_aead_key { + char alg_name[TIPC_AEAD_ALG_NAME]; + unsigned int keylen; /* in bytes */ + char key[]; +}; + +#define TIPC_AEAD_KEYLEN_MIN (16 + 4) +#define TIPC_AEAD_KEYLEN_MAX (32 + 4) +#define TIPC_AEAD_KEY_SIZE_MAX (sizeof(struct tipc_aead_key) + \ + TIPC_AEAD_KEYLEN_MAX) + +static inline int tipc_aead_key_size(struct tipc_aead_key *key) +{ + return sizeof(*key) + key->keylen; +} + /* The macros and functions below are deprecated: */ -- cgit v1.2.3 From e1f32190cf7ddd55778b460e7d44af3f76529698 Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Fri, 8 Nov 2019 12:05:12 +0700 Subject: tipc: add support for AEAD key setting via netlink This commit adds two netlink commands to TIPC in order for user to be able to set or remove AEAD keys: - TIPC_NL_KEY_SET - TIPC_NL_KEY_FLUSH When the 'KEY_SET' is given along with the key data, the key will be initiated and attached to TIPC crypto. On the other hand, the 'KEY_FLUSH' command will remove all existing keys if any. Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: Tuong Lien Signed-off-by: David S. Miller --- include/uapi/linux/tipc_netlink.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h index efb958fd167d..6c2194ab745b 100644 --- a/include/uapi/linux/tipc_netlink.h +++ b/include/uapi/linux/tipc_netlink.h @@ -63,6 +63,8 @@ enum { TIPC_NL_PEER_REMOVE, TIPC_NL_BEARER_ADD, TIPC_NL_UDP_GET_REMOTEIP, + TIPC_NL_KEY_SET, + TIPC_NL_KEY_FLUSH, __TIPC_NL_CMD_MAX, TIPC_NL_CMD_MAX = __TIPC_NL_CMD_MAX - 1 @@ -160,6 +162,8 @@ enum { TIPC_NLA_NODE_UNSPEC, TIPC_NLA_NODE_ADDR, /* u32 */ TIPC_NLA_NODE_UP, /* flag */ + TIPC_NLA_NODE_ID, /* data */ + TIPC_NLA_NODE_KEY, /* data */ __TIPC_NLA_NODE_MAX, TIPC_NLA_NODE_MAX = __TIPC_NLA_NODE_MAX - 1 -- cgit v1.2.3 From aef587be42925f92418083f08852d0011b2766ca Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 8 Nov 2019 13:20:32 +0800 Subject: sctp: add pf_expose per netns and sock and asoc As said in rfc7829, section 3, point 12: The SCTP stack SHOULD expose the PF state of its destination addresses to the ULP as well as provide the means to notify the ULP of state transitions of its destination addresses from active to PF, and vice versa. However, it is recommended that an SCTP stack implementing SCTP-PF also allows for the ULP to be kept ignorant of the PF state of its destinations and the associated state transitions, thus allowing for retention of the simpler state transition model of [RFC4960] in the ULP. Not only does it allow to expose the PF state to ULP, but also allow to ignore sctp-pf to ULP. So this patch is to add pf_expose per netns, sock and asoc. And in sctp_assoc_control_transport(), ulp_notify will be set to false if asoc->expose is not 'enabled' in next patch. It also allows a user to change pf_expose per netns by sysctl, and pf_expose per sock and asoc will be initialized with it. Note that pf_expose also works for SCTP_GET_PEER_ADDR_INFO sockopt, to not allow a user to query the state of a sctp-pf peer address when pf_expose is 'disabled', as said in section 7.3. v1->v2: - Fix a build warning noticed by Nathan Chancellor. v2->v3: - set pf_expose to UNUSED by default to keep compatible with old applications. v3->v4: - add a new entry for pf_expose on ip-sysctl.txt, as Marcelo suggested. - change this patch to 1/5, and move sctp_assoc_control_transport change into 2/5, as Marcelo suggested. - use SCTP_PF_EXPOSE_UNSET instead of SCTP_PF_EXPOSE_UNUSED, and set SCTP_PF_EXPOSE_UNSET to 0 in enum, as Marcelo suggested. Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/uapi/linux/sctp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 6bce7f9837a9..765f41a080b4 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -933,6 +933,7 @@ struct sctp_paddrinfo { enum sctp_spinfo_state { SCTP_INACTIVE, SCTP_PF, +#define SCTP_POTENTIALLY_FAILED SCTP_PF SCTP_ACTIVE, SCTP_UNCONFIRMED, SCTP_UNKNOWN = 0xffff /* Value used for transport state unknown */ -- cgit v1.2.3 From 768e15182dcb809e39c338290dda10c4e271d133 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 8 Nov 2019 13:20:33 +0800 Subject: sctp: add SCTP_ADDR_POTENTIALLY_FAILED notification SCTP Quick failover draft section 5.1, point 5 has been removed from rfc7829. Instead, "the sender SHOULD (i) notify the Upper Layer Protocol (ULP) about this state transition", as said in section 3.2, point 8. So this patch is to add SCTP_ADDR_POTENTIALLY_FAILED, defined in section 7.1, "which is reported if the affected address becomes PF". Also remove transport cwnd's update when moving from PF back to ACTIVE , which is no longer in rfc7829 either. Note that ulp_notify will be set to false if asoc->expose is not 'enabled', according to last patch. v2->v3: - define SCTP_ADDR_PF SCTP_ADDR_POTENTIALLY_FAILED. v3->v4: - initialize spc_state with SCTP_ADDR_AVAILABLE, as Marcelo suggested. - check asoc->pf_expose in sctp_assoc_control_transport(), as Marcelo suggested. Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/uapi/linux/sctp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 765f41a080b4..d99b428ac34e 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -410,6 +410,8 @@ enum sctp_spc_state { SCTP_ADDR_ADDED, SCTP_ADDR_MADE_PRIM, SCTP_ADDR_CONFIRMED, + SCTP_ADDR_POTENTIALLY_FAILED, +#define SCTP_ADDR_PF SCTP_ADDR_POTENTIALLY_FAILED }; -- cgit v1.2.3 From 8d2a6935d842f12c25611b165eace778adb09a53 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 8 Nov 2019 13:20:34 +0800 Subject: sctp: add SCTP_EXPOSE_POTENTIALLY_FAILED_STATE sockopt This is a sockopt defined in section 7.3 of rfc7829: "Exposing the Potentially Failed Path State", by which users can change pf_expose per sock and asoc. The new sockopt SCTP_EXPOSE_POTENTIALLY_FAILED_STATE is also known as SCTP_EXPOSE_PF_STATE for short. v2->v3: - return -EINVAL if params.assoc_value > SCTP_PF_EXPOSE_MAX. - define SCTP_EXPOSE_PF_STATE SCTP_EXPOSE_POTENTIALLY_FAILED_STATE. v3->v4: - improve changelog. Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/uapi/linux/sctp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index d99b428ac34e..a190e4a7f546 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -137,6 +137,8 @@ typedef __s32 sctp_assoc_t; #define SCTP_ASCONF_SUPPORTED 128 #define SCTP_AUTH_SUPPORTED 129 #define SCTP_ECN_SUPPORTED 130 +#define SCTP_EXPOSE_POTENTIALLY_FAILED_STATE 131 +#define SCTP_EXPOSE_PF_STATE SCTP_EXPOSE_POTENTIALLY_FAILED_STATE /* PR-SCTP policies */ #define SCTP_PR_SCTP_NONE 0x0000 -- cgit v1.2.3 From d467ac0a38551a5904878b1f5a2fe20a040c0e11 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 8 Nov 2019 13:20:36 +0800 Subject: sctp: add SCTP_PEER_ADDR_THLDS_V2 sockopt Section 7.2 of rfc7829: "Peer Address Thresholds (SCTP_PEER_ADDR_THLDS) Socket Option" extends 'struct sctp_paddrthlds' with 'spt_pathcpthld' added to allow a user to change ps_retrans per sock/asoc/transport, as other 2 paddrthlds: pf_retrans, pathmaxrxt. Note: to not break the user's program, here to support pf_retrans dump and setting by adding a new sockopt SCTP_PEER_ADDR_THLDS_V2, and a new structure sctp_paddrthlds_v2 instead of extending sctp_paddrthlds. Also, when setting ps_retrans, the value is not allowed to be greater than pf_retrans. v1->v2: - use SCTP_PEER_ADDR_THLDS_V2 to set/get pf_retrans instead, as Marcelo and David Laight suggested. Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/uapi/linux/sctp.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index a190e4a7f546..28ad40d9acba 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -105,6 +105,7 @@ typedef __s32 sctp_assoc_t; #define SCTP_DEFAULT_SNDINFO 34 #define SCTP_AUTH_DEACTIVATE_KEY 35 #define SCTP_REUSE_PORT 36 +#define SCTP_PEER_ADDR_THLDS_V2 37 /* Internal Socket Options. Some of the sctp library functions are * implemented using these socket options. @@ -1087,6 +1088,15 @@ struct sctp_paddrthlds { __u16 spt_pathpfthld; }; +/* Use a new structure with spt_pathcpthld for back compatibility */ +struct sctp_paddrthlds_v2 { + sctp_assoc_t spt_assoc_id; + struct sockaddr_storage spt_address; + __u16 spt_pathmaxrxt; + __u16 spt_pathpfthld; + __u16 spt_pathcpthld; +}; + /* * Socket Option for Getting the Association/Stream-Specific PR-SCTP Status */ -- cgit v1.2.3 From 8bb69f3b2918788435cbd5834c66682642c09fba Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 12 Nov 2019 00:29:55 +0100 Subject: netfilter: nf_tables: add flowtable offload control plane This patch adds the NFTA_FLOWTABLE_FLAGS attribute that allows users to specify the NF_FLOWTABLE_HW_OFFLOAD flag. This patch also adds a new setup interface for the flowtable type to perform the flowtable offload block callback configuration. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 81fed16fe2b2..bb9b049310df 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1518,6 +1518,7 @@ enum nft_object_attributes { * @NFTA_FLOWTABLE_HOOK: netfilter hook configuration(NLA_U32) * @NFTA_FLOWTABLE_USE: number of references to this flow table (NLA_U32) * @NFTA_FLOWTABLE_HANDLE: object handle (NLA_U64) + * @NFTA_FLOWTABLE_FLAGS: flags (NLA_U32) */ enum nft_flowtable_attributes { NFTA_FLOWTABLE_UNSPEC, @@ -1527,6 +1528,7 @@ enum nft_flowtable_attributes { NFTA_FLOWTABLE_USE, NFTA_FLOWTABLE_HANDLE, NFTA_FLOWTABLE_PAD, + NFTA_FLOWTABLE_FLAGS, __NFTA_FLOWTABLE_MAX }; #define NFTA_FLOWTABLE_MAX (__NFTA_FLOWTABLE_MAX - 1) -- cgit v1.2.3 From bd1903b7c4596ba6f7677d0dfefd05ba5876707d Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Wed, 13 Nov 2019 23:04:49 +0800 Subject: net: openvswitch: add hash info to upcall When using the kernel datapath, the upcall don't include skb hash info relatived. That will introduce some problem, because the hash of skb is important in kernel stack. For example, VXLAN module uses it to select UDP src port. The tx queue selection may also use the hash in stack. Hash is computed in different ways. Hash is random for a TCP socket, and hash may be computed in hardware, or software stack. Recalculation hash is not easy. Hash of TCP socket is computed: tcp_v4_connect -> sk_set_txhash (is random) __tcp_transmit_skb -> skb_set_hash_from_sk There will be one upcall, without information of skb hash, to ovs-vswitchd, for the first packet of a TCP session. The rest packets will be processed in Open vSwitch modules, hash kept. If this tcp session is forward to VXLAN module, then the UDP src port of first tcp packet is different from rest packets. TCP packets may come from the host or dockers, to Open vSwitch. To fix it, we store the hash info to upcall, and restore hash when packets sent back. +---------------+ +-------------------------+ | Docker/VMs | | ovs-vswitchd | +----+----------+ +-+--------------------+--+ | ^ | | | | | | upcall v restore packet hash (not recalculate) | +-+--------------------+--+ | tap netdev | | vxlan module +---------------> +--> Open vSwitch ko +--> or internal type | | +-------------------------+ Reported-at: https://mail.openvswitch.org/pipermail/ovs-dev/2019-October/364062.html Signed-off-by: Tonghao Zhang Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 1887a451c388..a87b44cd5590 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -173,6 +173,7 @@ enum ovs_packet_cmd { * @OVS_PACKET_ATTR_LEN: Packet size before truncation. * %OVS_PACKET_ATTR_USERSPACE action specify the Maximum received fragment * size. + * @OVS_PACKET_ATTR_HASH: Packet hash info (e.g. hash, sw_hash and l4_hash in skb). * * These attributes follow the &struct ovs_header within the Generic Netlink * payload for %OVS_PACKET_* commands. @@ -190,7 +191,8 @@ enum ovs_packet_attr { OVS_PACKET_ATTR_PROBE, /* Packet operation is a feature probe, error logging should be suppressed. */ OVS_PACKET_ATTR_MRU, /* Maximum received IP fragment size. */ - OVS_PACKET_ATTR_LEN, /* Packet size before truncation. */ + OVS_PACKET_ATTR_LEN, /* Packet size before truncation. */ + OVS_PACKET_ATTR_HASH, /* Packet hash. */ __OVS_PACKET_ATTR_MAX }; -- cgit v1.2.3 From fec56f5890d93fc2ed74166c397dc186b1c25951 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Nov 2019 10:57:04 -0800 Subject: bpf: Introduce BPF trampoline Introduce BPF trampoline concept to allow kernel code to call into BPF programs with practically zero overhead. The trampoline generation logic is architecture dependent. It's converting native calling convention into BPF calling convention. BPF ISA is 64-bit (even on 32-bit architectures). The registers R1 to R5 are used to pass arguments into BPF functions. The main BPF program accepts only single argument "ctx" in R1. Whereas CPU native calling convention is different. x86-64 is passing first 6 arguments in registers and the rest on the stack. x86-32 is passing first 3 arguments in registers. sparc64 is passing first 6 in registers. And so on. The trampolines between BPF and kernel already exist. BPF_CALL_x macros in include/linux/filter.h statically compile trampolines from BPF into kernel helpers. They convert up to five u64 arguments into kernel C pointers and integers. On 64-bit architectures this BPF_to_kernel trampolines are nops. On 32-bit architecture they're meaningful. The opposite job kernel_to_BPF trampolines is done by CAST_TO_U64 macros and __bpf_trace_##call() shim functions in include/trace/bpf_probe.h. They convert kernel function arguments into array of u64s that BPF program consumes via R1=ctx pointer. This patch set is doing the same job as __bpf_trace_##call() static trampolines, but dynamically for any kernel function. There are ~22k global kernel functions that are attachable via nop at function entry. The function arguments and types are described in BTF. The job of btf_distill_func_proto() function is to extract useful information from BTF into "function model" that architecture dependent trampoline generators will use to generate assembly code to cast kernel function arguments into array of u64s. For example the kernel function eth_type_trans has two pointers. They will be casted to u64 and stored into stack of generated trampoline. The pointer to that stack space will be passed into BPF program in R1. On x86-64 such generated trampoline will consume 16 bytes of stack and two stores of %rdi and %rsi into stack. The verifier will make sure that only two u64 are accessed read-only by BPF program. The verifier will also recognize the precise type of the pointers being accessed and will not allow typecasting of the pointer to a different type within BPF program. The tracing use case in the datacenter demonstrated that certain key kernel functions have (like tcp_retransmit_skb) have 2 or more kprobes that are always active. Other functions have both kprobe and kretprobe. So it is essential to keep both kernel code and BPF programs executing at maximum speed. Hence generated BPF trampoline is re-generated every time new program is attached or detached to maintain maximum performance. To avoid the high cost of retpoline the attached BPF programs are called directly. __bpf_prog_enter/exit() are used to support per-program execution stats. In the future this logic will be optimized further by adding support for bpf_stats_enabled_key inside generated assembly code. Introduction of preemptible and sleepable BPF programs will completely remove the need to call to __bpf_prog_enter/exit(). Detach of a BPF program from the trampoline should not fail. To avoid memory allocation in detach path the half of the page is used as a reserve and flipped after each attach/detach. 2k bytes is enough to call 40+ BPF programs directly which is enough for BPF tracing use cases. This limit can be increased in the future. BPF_TRACE_FENTRY programs have access to raw kernel function arguments while BPF_TRACE_FEXIT programs have access to kernel return value as well. Often kprobe BPF program remembers function arguments in a map while kretprobe fetches arguments from a map and analyzes them together with return value. BPF_TRACE_FEXIT accelerates this typical use case. Recursion prevention for kprobe BPF programs is done via per-cpu bpf_prog_active counter. In practice that turned out to be a mistake. It caused programs to randomly skip execution. The tracing tools missed results they were looking for. Hence BPF trampoline doesn't provide builtin recursion prevention. It's a job of BPF program itself and will be addressed in the follow up patches. BPF trampoline is intended to be used beyond tracing and fentry/fexit use cases in the future. For example to remove retpoline cost from XDP programs. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20191114185720.1641606-5-ast@kernel.org --- include/uapi/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index df6809a76404..69c200e6e696 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -201,6 +201,8 @@ enum bpf_attach_type { BPF_CGROUP_GETSOCKOPT, BPF_CGROUP_SETSOCKOPT, BPF_TRACE_RAW_TP, + BPF_TRACE_FENTRY, + BPF_TRACE_FEXIT, __MAX_BPF_ATTACH_TYPE }; -- cgit v1.2.3 From 5b92a28aae4dd0f88778d540ecfdcdaec5a41723 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Nov 2019 10:57:17 -0800 Subject: bpf: Support attaching tracing BPF program to other BPF programs Allow FENTRY/FEXIT BPF programs to attach to other BPF programs of any type including their subprograms. This feature allows snooping on input and output packets in XDP, TC programs including their return values. In order to do that the verifier needs to track types not only of vmlinux, but types of other BPF programs as well. The verifier also needs to translate uapi/linux/bpf.h types used by networking programs into kernel internal BTF types used by FENTRY/FEXIT BPF programs. In some cases LLVM optimizations can remove arguments from BPF subprograms without adjusting BTF info that LLVM backend knows. When BTF info disagrees with actual types that the verifiers sees the BPF trampoline has to fallback to conservative and treat all arguments as u64. The FENTRY/FEXIT program can still attach to such subprograms, but it won't be able to recognize pointer types like 'struct sk_buff *' and it won't be able to pass them to bpf_skb_output() for dumping packets to user space. The FENTRY/FEXIT program would need to use bpf_probe_read_kernel() instead. The BPF_PROG_LOAD command is extended with attach_prog_fd field. When it's set to zero the attach_btf_id is one vmlinux BTF type ids. When attach_prog_fd points to previously loaded BPF program the attach_btf_id is BTF type id of main function or one of its subprograms. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20191114185720.1641606-18-ast@kernel.org --- include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 69c200e6e696..4842a134b202 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -425,6 +425,7 @@ union bpf_attr { __aligned_u64 line_info; /* line info */ __u32 line_info_cnt; /* number of bpf_line_info records */ __u32 attach_btf_id; /* in-kernel BTF type id to attach to */ + __u32 attach_prog_fd; /* 0 to attach to vmlinux */ }; struct { /* anonymous struct used by BPF_OBJ_* commands */ -- cgit v1.2.3 From fc9702273e2edb90400a34b3be76f7b08fa3344b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 17 Nov 2019 09:28:04 -0800 Subject: bpf: Add mmap() support for BPF_MAP_TYPE_ARRAY Add ability to memory-map contents of BPF array map. This is extremely useful for working with BPF global data from userspace programs. It allows to avoid typical bpf_map_{lookup,update}_elem operations, improving both performance and usability. There had to be special considerations for map freezing, to avoid having writable memory view into a frozen map. To solve this issue, map freezing and mmap-ing is happening under mutex now: - if map is already frozen, no writable mapping is allowed; - if map has writable memory mappings active (accounted in map->writecnt), map freezing will keep failing with -EBUSY; - once number of writable memory mappings drops to zero, map freezing can be performed again. Only non-per-CPU plain arrays are supported right now. Maps with spinlocks can't be memory mapped either. For BPF_F_MMAPABLE array, memory allocation has to be done through vmalloc() to be mmap()'able. We also need to make sure that array data memory is page-sized and page-aligned, so we over-allocate memory in such a way that struct bpf_array is at the end of a single page of memory with array->value being aligned with the start of the second page. On deallocation we need to accomodate this memory arrangement to free vmalloc()'ed memory correctly. One important consideration regarding how memory-mapping subsystem functions. Memory-mapping subsystem provides few optional callbacks, among them open() and close(). close() is called for each memory region that is unmapped, so that users can decrease their reference counters and free up resources, if necessary. open() is *almost* symmetrical: it's called for each memory region that is being mapped, **except** the very first one. So bpf_map_mmap does initial refcnt bump, while open() will do any extra ones after that. Thus number of close() calls is equal to number of open() calls plus one more. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Song Liu Acked-by: John Fastabend Acked-by: Johannes Weiner Link: https://lore.kernel.org/bpf/20191117172806.2195367-4-andriin@fb.com --- include/uapi/linux/bpf.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4842a134b202..dbbcf0b02970 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -348,6 +348,9 @@ enum bpf_attach_type { /* Clone map from listener for newly accepted socket */ #define BPF_F_CLONE (1U << 9) +/* Enable memory-mapping BPF map */ +#define BPF_F_MMAPABLE (1U << 10) + /* flags for BPF_PROG_QUERY */ #define BPF_F_QUERY_EFFECTIVE (1U << 0) -- cgit v1.2.3 From cec2975f2b7058c42330a0f8164d94c6b7c8c446 Mon Sep 17 00:00:00 2001 From: Gautam Ramakrishnan Date: Wed, 20 Nov 2019 19:43:54 +0530 Subject: net: sched: pie: enable timestamp based delay calculation RFC 8033 suggests an alternative approach to calculate the queue delay in PIE by using a timestamp on every enqueued packet. This patch adds an implementation of that approach and sets it as the default method to calculate queue delay. The previous method (based on Little's law) to calculate queue delay is set as optional. Signed-off-by: Gautam Ramakrishnan Signed-off-by: Leslie Monis Signed-off-by: Mohit P. Tahiliani Acked-by: Dave Taht Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 5011259b8f67..9f1a72876212 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -950,19 +950,25 @@ enum { TCA_PIE_BETA, TCA_PIE_ECN, TCA_PIE_BYTEMODE, + TCA_PIE_DQ_RATE_ESTIMATOR, __TCA_PIE_MAX }; #define TCA_PIE_MAX (__TCA_PIE_MAX - 1) struct tc_pie_xstats { - __u64 prob; /* current probability */ - __u32 delay; /* current delay in ms */ - __u32 avg_dq_rate; /* current average dq_rate in bits/pie_time */ - __u32 packets_in; /* total number of packets enqueued */ - __u32 dropped; /* packets dropped due to pie_action */ - __u32 overlimit; /* dropped due to lack of space in queue */ - __u32 maxq; /* maximum queue size */ - __u32 ecn_mark; /* packets marked with ecn*/ + __u64 prob; /* current probability */ + __u32 delay; /* current delay in ms */ + __u32 avg_dq_rate; /* current average dq_rate in + * bits/pie_time + */ + __u32 dq_rate_estimating; /* is avg_dq_rate being calculated? */ + __u32 packets_in; /* total number of packets enqueued */ + __u32 dropped; /* packets dropped due to pie_action */ + __u32 overlimit; /* dropped due to lack of space + * in queue + */ + __u32 maxq; /* maximum queue size */ + __u32 ecn_mark; /* packets marked with ecn*/ }; /* CBS */ -- cgit v1.2.3 From 91e6015b082b08a74e5d9d326f651e5890a93519 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 20 Nov 2019 22:38:16 +0100 Subject: bpf: Emit audit messages upon successful prog load and unload Allow for audit messages to be emitted upon BPF program load and unload for having a timeline of events. The load itself is in syscall context, so additional info about the process initiating the BPF prog creation can be logged and later directly correlated to the unload event. The only info really needed from BPF side is the globally unique prog ID where then audit user space tooling can query / dump all info needed about the specific BPF program right upon load event and enrich the record, thus these changes needed here can be kept small and non-intrusive to the core. Raw example output: # auditctl -D # auditctl -a always,exit -F arch=x86_64 -S bpf # ausearch --start recent -m 1334 [...] ---- time->Wed Nov 20 12:45:51 2019 type=PROCTITLE msg=audit(1574271951.590:8974): proctitle="./test_verifier" type=SYSCALL msg=audit(1574271951.590:8974): arch=c000003e syscall=321 success=yes exit=14 a0=5 a1=7ffe2d923e80 a2=78 a3=0 items=0 ppid=742 pid=949 auid=0 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=pts0 ses=2 comm="test_verifier" exe="/root/bpf-next/tools/testing/selftests/bpf/test_verifier" subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key=(null) type=UNKNOWN[1334] msg=audit(1574271951.590:8974): auid=0 uid=0 gid=0 ses=2 subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 pid=949 comm="test_verifier" exe="/root/bpf-next/tools/testing/selftests/bpf/test_verifier" prog-id=3260 event=LOAD ---- time->Wed Nov 20 12:45:51 2019 type=UNKNOWN[1334] msg=audit(1574271951.590:8975): prog-id=3260 event=UNLOAD ---- [...] Signed-off-by: Daniel Borkmann Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191120213816.8186-1-jolsa@kernel.org --- include/uapi/linux/audit.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index c89c6495983d..32a5db900f47 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -116,6 +116,7 @@ #define AUDIT_FANOTIFY 1331 /* Fanotify access decision */ #define AUDIT_TIME_INJOFFSET 1332 /* Timekeeping offset injected */ #define AUDIT_TIME_ADJNTPVAL 1333 /* NTP value adjustment */ +#define AUDIT_BPF 1334 /* BPF subsystem */ #define AUDIT_AVC 1400 /* SE Linux avc denial or grant */ #define AUDIT_SELINUX_ERR 1401 /* Internal SE Linux Errors */ -- cgit v1.2.3 From fca3f91cc38ad866c995fb099d961b31cd687849 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 21 Nov 2019 18:03:26 +0800 Subject: net: sched: add vxlan option support to act_tunnel_key This patch is to allow setting vxlan options using the act_tunnel_key action. Different from geneve options, only one option can be set. And also, geneve options and vxlan options can't be set at the same time. gbp is the only param for vxlan options: # ip link add name vxlan0 type vxlan dstport 0 external # tc qdisc add dev eth0 ingress # tc filter add dev eth0 protocol ip parent ffff: \ flower indev eth0 \ ip_proto udp \ action tunnel_key \ set src_ip 10.0.99.192 \ dst_ip 10.0.99.193 \ dst_port 6081 \ id 11 \ vxlan_opts 01020304 \ action mirred egress redirect dev vxlan0 v1->v2: - add .strict_start_type for enc_opts_policy as Jakub noticed. - use Duplicate instead of Wrong in err msg for extack as Jakub suggested. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/uapi/linux/tc_act/tc_tunnel_key.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/tc_act/tc_tunnel_key.h b/include/uapi/linux/tc_act/tc_tunnel_key.h index 41c8b462c177..f302c2a76953 100644 --- a/include/uapi/linux/tc_act/tc_tunnel_key.h +++ b/include/uapi/linux/tc_act/tc_tunnel_key.h @@ -50,6 +50,10 @@ enum { * TCA_TUNNEL_KEY_ENC_OPTS_ * attributes */ + TCA_TUNNEL_KEY_ENC_OPTS_VXLAN, /* Nested + * TCA_TUNNEL_KEY_ENC_OPTS_ + * attributes + */ __TCA_TUNNEL_KEY_ENC_OPTS_MAX, }; @@ -67,4 +71,13 @@ enum { #define TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX \ (__TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX - 1) +enum { + TCA_TUNNEL_KEY_ENC_OPT_VXLAN_UNSPEC, + TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP, /* u32 */ + __TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX, +}; + +#define TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX \ + (__TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX - 1) + #endif -- cgit v1.2.3 From e20d4ff2acd7db2ffce64a6ddbdaeec43a8eec19 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 21 Nov 2019 18:03:27 +0800 Subject: net: sched: add erspan option support to act_tunnel_key This patch is to allow setting erspan options using the act_tunnel_key action. Different from geneve options, only one option can be set. And also, geneve options, vxlan options or erspan options can't be set at the same time. Options are expressed as ver:index:dir:hwid, when ver is set to 1, index will be applied while dir and hwid will be ignored, and when ver is set to 2, dir and hwid will be used while index will be ignored. # ip link add name erspan1 type erspan external # tc qdisc add dev eth0 ingress # tc filter add dev eth0 protocol ip parent ffff: \ flower indev eth0 \ ip_proto udp \ action tunnel_key \ set src_ip 10.0.99.192 \ dst_ip 10.0.99.193 \ dst_port 6081 \ id 11 \ erspan_opts 1:2:0:0 \ action mirred egress redirect dev erspan1 v1->v2: - do the validation when dst is not yet allocated as Jakub suggested. - use Duplicate instead of Wrong in err msg for extack. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/uapi/linux/tc_act/tc_tunnel_key.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/tc_act/tc_tunnel_key.h b/include/uapi/linux/tc_act/tc_tunnel_key.h index f302c2a76953..3f10dc4e7a4b 100644 --- a/include/uapi/linux/tc_act/tc_tunnel_key.h +++ b/include/uapi/linux/tc_act/tc_tunnel_key.h @@ -54,6 +54,10 @@ enum { * TCA_TUNNEL_KEY_ENC_OPTS_ * attributes */ + TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN, /* Nested + * TCA_TUNNEL_KEY_ENC_OPTS_ + * attributes + */ __TCA_TUNNEL_KEY_ENC_OPTS_MAX, }; @@ -80,4 +84,16 @@ enum { #define TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX \ (__TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX - 1) +enum { + TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_UNSPEC, + TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER, /* u8 */ + TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX, /* be32 */ + TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR, /* u8 */ + TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID, /* u8 */ + __TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX, +}; + +#define TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX \ + (__TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX - 1) + #endif -- cgit v1.2.3 From d8f9dfae49ce4ffb772dc10dd6578dc815b34c12 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 21 Nov 2019 18:03:28 +0800 Subject: net: sched: allow flower to match vxlan options This patch is to allow matching gbp option in vxlan. The options can be described in the form GBP/GBP_MASK, where GBP is represented as a 32bit hexadecimal value. Different from geneve, only one option can be set. And also, geneve options and vxlan options can't be set at the same time. # ip link add name vxlan0 type vxlan dstport 0 external # tc qdisc add dev vxlan0 ingress # tc filter add dev vxlan0 protocol ip parent ffff: \ flower \ enc_src_ip 10.0.99.192 \ enc_dst_ip 10.0.99.193 \ enc_key_id 11 \ vxlan_opts 01020304/ffffffff \ ip_proto udp \ action mirred egress redirect dev eth0 v1->v2: - add .strict_start_type for enc_opts_policy as Jakub noticed. - use Duplicate instead of Wrong in err msg for extack as Jakub suggested. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index c6ad22f76ede..929825d710e2 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -571,6 +571,10 @@ enum { * TCA_FLOWER_KEY_ENC_OPT_GENEVE_ * attributes */ + TCA_FLOWER_KEY_ENC_OPTS_VXLAN, /* Nested + * TCA_FLOWER_KEY_ENC_OPT_VXLAN_ + * attributes + */ __TCA_FLOWER_KEY_ENC_OPTS_MAX, }; @@ -588,6 +592,15 @@ enum { #define TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX \ (__TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX - 1) +enum { + TCA_FLOWER_KEY_ENC_OPT_VXLAN_UNSPEC, + TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP, /* u32 */ + __TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX, +}; + +#define TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX \ + (__TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX - 1) + enum { TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT = (1 << 0), TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST = (1 << 1), -- cgit v1.2.3 From 79b1011cb33d166f531a1347a17e6602954e4eb1 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 21 Nov 2019 18:03:29 +0800 Subject: net: sched: allow flower to match erspan options This patch is to allow matching options in erspan. The options can be described in the form: VER:INDEX:DIR:HWID/VER:INDEX_MASK:DIR_MASK:HWID_MASK. When ver is set to 1, index will be applied while dir and hwid will be ignored, and when ver is set to 2, dir and hwid will be used while index will be ignored. Different from geneve, only one option can be set. And also, geneve options, vxlan options or erspan options can't be set at the same time. # ip link add name erspan1 type erspan external # tc qdisc add dev erspan1 ingress # tc filter add dev erspan1 protocol ip parent ffff: \ flower \ enc_src_ip 10.0.99.192 \ enc_dst_ip 10.0.99.193 \ enc_key_id 11 \ erspan_opts 1:12:0:0/1:ffff:0:0 \ ip_proto udp \ action mirred egress redirect dev eth0 v1->v2: - improve some err msgs of extack. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 929825d710e2..449a63971451 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -575,6 +575,10 @@ enum { * TCA_FLOWER_KEY_ENC_OPT_VXLAN_ * attributes */ + TCA_FLOWER_KEY_ENC_OPTS_ERSPAN, /* Nested + * TCA_FLOWER_KEY_ENC_OPT_ERSPAN_ + * attributes + */ __TCA_FLOWER_KEY_ENC_OPTS_MAX, }; @@ -601,6 +605,18 @@ enum { #define TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX \ (__TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX - 1) +enum { + TCA_FLOWER_KEY_ENC_OPT_ERSPAN_UNSPEC, + TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER, /* u8 */ + TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX, /* be32 */ + TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR, /* u8 */ + TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID, /* u8 */ + __TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX, +}; + +#define TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX \ + (__TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX - 1) + enum { TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT = (1 << 0), TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST = (1 << 1), -- cgit v1.2.3 From 84bb46cd62283cc371769ec1f77ff7924099f584 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 23 Nov 2019 09:54:58 -0800 Subject: Revert "bpf: Emit audit messages upon successful prog load and unload" This commit reverts commit 91e6015b082b ("bpf: Emit audit messages upon successful prog load and unload") and its follow up commit 7599a896f2e4 ("audit: Move audit_log_task declaration under CONFIG_AUDITSYSCALL") as requested by Paul Moore. The change needs close review on linux-audit, tests etc. Signed-off-by: Jakub Kicinski --- include/uapi/linux/audit.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index 32a5db900f47..c89c6495983d 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -116,7 +116,6 @@ #define AUDIT_FANOTIFY 1331 /* Fanotify access decision */ #define AUDIT_TIME_INJOFFSET 1332 /* Timekeeping offset injected */ #define AUDIT_TIME_ADJNTPVAL 1333 /* NTP value adjustment */ -#define AUDIT_BPF 1334 /* BPF subsystem */ #define AUDIT_AVC 1400 /* SE Linux avc denial or grant */ #define AUDIT_SELINUX_ERR 1401 /* Internal SE Linux Errors */ -- cgit v1.2.3