From 36fbf1e52bd3ff8a5cb604955eedfc9350c2e6cc Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Mon, 30 Sep 2019 11:48:16 +0200 Subject: net: rtnetlink: add linkprop commands to add and delete alternative ifnames Add two commands to add and delete list of link properties. Implement the first property type along - alternative ifnames. Each net device can have multiple alternative names. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/if.h | 1 + include/uapi/linux/if_link.h | 2 ++ include/uapi/linux/rtnetlink.h | 7 +++++++ 3 files changed, 10 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/if.h b/include/uapi/linux/if.h index 7fea0fd7d6f5..4bf33344aab1 100644 --- a/include/uapi/linux/if.h +++ b/include/uapi/linux/if.h @@ -33,6 +33,7 @@ #define IFNAMSIZ 16 #endif /* __UAPI_DEF_IF_IFNAMSIZ */ #define IFALIASZ 256 +#define ALTIFNAMSIZ 128 #include /* For glibc compatibility. An empty enum does not compile. */ diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 4a8c02cafa9a..8aec8769d944 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -167,6 +167,8 @@ enum { IFLA_NEW_IFINDEX, IFLA_MIN_MTU, IFLA_MAX_MTU, + IFLA_PROP_LIST, + IFLA_ALT_IFNAME, /* Alternative ifname */ __IFLA_MAX }; diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index ce2a623abb75..1418a8362bb7 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -164,6 +164,13 @@ enum { RTM_GETNEXTHOP, #define RTM_GETNEXTHOP RTM_GETNEXTHOP + RTM_NEWLINKPROP = 108, +#define RTM_NEWLINKPROP RTM_NEWLINKPROP + RTM_DELLINKPROP, +#define RTM_DELLINKPROP RTM_DELLINKPROP + RTM_GETLINKPROP, +#define RTM_GETLINKPROP RTM_GETLINKPROP + __RTM_MAX, #define RTM_MAX (((__RTM_MAX + 3) & ~3) - 1) }; -- cgit v1.2.3 From 4b76f9ed47074990d85c2ec52288a7d09c3ea357 Mon Sep 17 00:00:00 2001 From: Sunil Dutt Date: Fri, 13 Sep 2019 18:11:44 +0530 Subject: nl80211: Document the expectation for NL80211_ATTR_IE in NL80211_CMD_CONNECT This commit documents the expectation for NL80211_ATTR_IE when included in NL80211_CMD_CONNECT, as following. Driver shall not modify the IEs specified through NL80211_ATTR_IE if NL80211_ATTR_MAC is included. However, if NL80211_ATTR_MAC_HINT is included, these IEs through NL80211_ATTR_IE are specified by the user space based on the best possible BSS selected. Thus, if the driver ends up selecting a different BSS, it can modify these IEs accordingly (e.g. userspace asks the driver to perform PMKSA caching with BSS1 and the driver ends up selecting BSS2 with different PMKSA cache entry. RSNIE has to get updated with the apt PMKID). Signed-off-by: Sunil Dutt Link: https://lore.kernel.org/r/1568378504-15179-1-git-send-email-usdutt@codeaurora.org Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index beee59c831a7..64135ab3a7ac 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -571,6 +571,14 @@ * set of BSSID,frequency parameters is used (i.e., either the enforcing * %NL80211_ATTR_MAC,%NL80211_ATTR_WIPHY_FREQ or the less strict * %NL80211_ATTR_MAC_HINT and %NL80211_ATTR_WIPHY_FREQ_HINT). + * Driver shall not modify the IEs specified through %NL80211_ATTR_IE if + * %NL80211_ATTR_MAC is included. However, if %NL80211_ATTR_MAC_HINT is + * included, these IEs through %NL80211_ATTR_IE are specified by the user + * space based on the best possible BSS selected. Thus, if the driver ends + * up selecting a different BSS, it can modify these IEs accordingly (e.g. + * userspace asks the driver to perform PMKSA caching with BSS1 and the + * driver ends up selecting BSS2 with different PMKSA cache entry; RSNIE + * has to get updated with the apt PMKID). * %NL80211_ATTR_PREV_BSSID can be used to request a reassociation within * the ESS in case the device is already associated and an association with * a different BSS is desired. -- cgit v1.2.3 From 070c63f20f6c739a3c534555f56c7327536bfcc2 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 3 Oct 2019 11:49:39 +0200 Subject: net: devlink: allow to change namespaces during reload All devlink instances are created in init_net and stay there for a lifetime. Allow user to be able to move devlink instances into namespaces during devlink reload operation. That ensures proper re-instantiation of driver objects, including netdevices. Signed-off-by: Jiri Pirko Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/uapi/linux/devlink.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 580b7a2e40e1..b558ea88b766 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -421,6 +421,10 @@ enum devlink_attr { DEVLINK_ATTR_RELOAD_FAILED, /* u8 0 or 1 */ + DEVLINK_ATTR_NETNS_FD, /* u32 */ + DEVLINK_ATTR_NETNS_PID, /* u32 */ + DEVLINK_ATTR_NETNS_ID, /* u32 */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, -- cgit v1.2.3 From d6547f2a2cfc8b145b59291d3e4b072891f34882 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 3 Oct 2019 23:29:24 +0300 Subject: net, uapi: fix -Wpointer-arith warnings Add casts to fix these warnings: ./usr/include/linux/netfilter_arp/arp_tables.h:200:19: error: pointer of type 'void *' used in arithmetic [-Werror=pointer-arith] ./usr/include/linux/netfilter_bridge/ebtables.h:197:19: error: pointer of type 'void *' used in arithmetic [-Werror=pointer-arith] ./usr/include/linux/netfilter_ipv4/ip_tables.h:223:19: error: pointer of type 'void *' used in arithmetic [-Werror=pointer-arith] ./usr/include/linux/netfilter_ipv6/ip6_tables.h:263:19: error: pointer of type 'void *' used in arithmetic [-Werror=pointer-arith] ./usr/include/linux/tipc_config.h:310:28: error: pointer of type 'void *' used in arithmetic [-Werror=pointer-arith] ./usr/include/linux/tipc_config.h:410:24: error: pointer of type 'void *' used in arithmetic [-Werror=pointer-arith] ./usr/include/linux/virtio_ring.h:170:16: error: pointer of type 'void *' used in arithmetic [-Werror=pointer-arith] Those are theoretical probably but kernel doesn't control compiler flags in userspace. Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- include/uapi/linux/netfilter_arp/arp_tables.h | 2 +- include/uapi/linux/netfilter_bridge/ebtables.h | 2 +- include/uapi/linux/netfilter_ipv4/ip_tables.h | 2 +- include/uapi/linux/netfilter_ipv6/ip6_tables.h | 2 +- include/uapi/linux/tipc_config.h | 4 ++-- include/uapi/linux/virtio_ring.h | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter_arp/arp_tables.h b/include/uapi/linux/netfilter_arp/arp_tables.h index a2a0927d9bd6..bbf5af2b67a8 100644 --- a/include/uapi/linux/netfilter_arp/arp_tables.h +++ b/include/uapi/linux/netfilter_arp/arp_tables.h @@ -199,7 +199,7 @@ struct arpt_get_entries { /* Helper functions */ static __inline__ struct xt_entry_target *arpt_get_target(struct arpt_entry *e) { - return (void *)e + e->target_offset; + return (struct xt_entry_target *)((char *)e + e->target_offset); } /* diff --git a/include/uapi/linux/netfilter_bridge/ebtables.h b/include/uapi/linux/netfilter_bridge/ebtables.h index 8076c940ffeb..a494cf43a755 100644 --- a/include/uapi/linux/netfilter_bridge/ebtables.h +++ b/include/uapi/linux/netfilter_bridge/ebtables.h @@ -194,7 +194,7 @@ struct ebt_entry { static __inline__ struct ebt_entry_target * ebt_get_target(struct ebt_entry *e) { - return (void *)e + e->target_offset; + return (struct ebt_entry_target *)((char *)e + e->target_offset); } /* {g,s}etsockopt numbers */ diff --git a/include/uapi/linux/netfilter_ipv4/ip_tables.h b/include/uapi/linux/netfilter_ipv4/ip_tables.h index 6aaeb14bfce1..50c7fee625ae 100644 --- a/include/uapi/linux/netfilter_ipv4/ip_tables.h +++ b/include/uapi/linux/netfilter_ipv4/ip_tables.h @@ -222,7 +222,7 @@ struct ipt_get_entries { static __inline__ struct xt_entry_target * ipt_get_target(struct ipt_entry *e) { - return (void *)e + e->target_offset; + return (struct xt_entry_target *)((char *)e + e->target_offset); } /* diff --git a/include/uapi/linux/netfilter_ipv6/ip6_tables.h b/include/uapi/linux/netfilter_ipv6/ip6_tables.h index 031d0a43bed2..d9e364f96a5c 100644 --- a/include/uapi/linux/netfilter_ipv6/ip6_tables.h +++ b/include/uapi/linux/netfilter_ipv6/ip6_tables.h @@ -262,7 +262,7 @@ struct ip6t_get_entries { static __inline__ struct xt_entry_target * ip6t_get_target(struct ip6t_entry *e) { - return (void *)e + e->target_offset; + return (struct xt_entry_target *)((char *)e + e->target_offset); } /* diff --git a/include/uapi/linux/tipc_config.h b/include/uapi/linux/tipc_config.h index 4955e1a9f1bc..4dfc05651c98 100644 --- a/include/uapi/linux/tipc_config.h +++ b/include/uapi/linux/tipc_config.h @@ -309,7 +309,7 @@ static inline int TLV_SET(void *tlv, __u16 type, void *data, __u16 len) tlv_ptr->tlv_len = htons(tlv_len); if (len && data) { memcpy(TLV_DATA(tlv_ptr), data, len); - memset(TLV_DATA(tlv_ptr) + len, 0, TLV_SPACE(len) - tlv_len); + memset((char *)TLV_DATA(tlv_ptr) + len, 0, TLV_SPACE(len) - tlv_len); } return TLV_SPACE(len); } @@ -409,7 +409,7 @@ static inline int TCM_SET(void *msg, __u16 cmd, __u16 flags, tcm_hdr->tcm_flags = htons(flags); if (data_len && data) { memcpy(TCM_DATA(msg), data, data_len); - memset(TCM_DATA(msg) + data_len, 0, TCM_SPACE(data_len) - msg_len); + memset((char *)TCM_DATA(msg) + data_len, 0, TCM_SPACE(data_len) - msg_len); } return TCM_SPACE(data_len); } diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h index 4c4e24c291a5..559f42e73315 100644 --- a/include/uapi/linux/virtio_ring.h +++ b/include/uapi/linux/virtio_ring.h @@ -169,7 +169,7 @@ static inline void vring_init(struct vring *vr, unsigned int num, void *p, { vr->num = num; vr->desc = p; - vr->avail = p + num*sizeof(struct vring_desc); + vr->avail = (struct vring_avail *)((char *)p + num * sizeof(struct vring_desc)); vr->used = (void *)(((uintptr_t)&vr->avail->ring[num] + sizeof(__virtio16) + align-1) & ~(align - 1)); } -- cgit v1.2.3 From d26b698dd3cd52f5a3277446a87e5e0198c99cd0 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Oct 2019 16:19:24 -0700 Subject: net/tls: add skeleton of MIB statistics Add a skeleton structure for adding TLS statistics. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/uapi/linux/snmp.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 549a31c29f7d..4abd57948ad4 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -323,4 +323,11 @@ enum __LINUX_MIB_XFRMMAX }; +/* linux TLS mib definitions */ +enum +{ + LINUX_MIB_TLSNUM = 0, + __LINUX_MIB_TLSMAX +}; + #endif /* _LINUX_SNMP_H */ -- cgit v1.2.3 From b32fd3cc31d723bf2ab859667be3612c0086ec72 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Oct 2019 16:19:25 -0700 Subject: net/tls: add statistics for installed sessions Add SNMP stats for number of sockets with successfully installed sessions. Break them down to software and hardware ones. Note that if hardware offload fails stack uses software implementation, and counts the session appropriately. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/uapi/linux/snmp.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 4abd57948ad4..1b4613b5af70 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -327,6 +327,14 @@ enum enum { LINUX_MIB_TLSNUM = 0, + LINUX_MIB_TLSCURRTXSW, /* TlsCurrTxSw */ + LINUX_MIB_TLSCURRRXSW, /* TlsCurrRxSw */ + LINUX_MIB_TLSCURRTXDEVICE, /* TlsCurrTxDevice */ + LINUX_MIB_TLSCURRRXDEVICE, /* TlsCurrRxDevice */ + LINUX_MIB_TLSTXSW, /* TlsTxSw */ + LINUX_MIB_TLSRXSW, /* TlsRxSw */ + LINUX_MIB_TLSTXDEVICE, /* TlsTxDevice */ + LINUX_MIB_TLSRXDEVICE, /* TlsRxDevice */ __LINUX_MIB_TLSMAX }; -- cgit v1.2.3 From 5c5ec66858062a857cf51f57cbe52b36330f7ae6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Oct 2019 16:19:26 -0700 Subject: net/tls: add TlsDecryptError stat Add a statistic for TLS record decryption errors. Since devices are supposed to pass records as-is when they encounter errors this statistic will count bad records in both pure software and inline crypto configurations. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/uapi/linux/snmp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 1b4613b5af70..c9e4963e26f0 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -335,6 +335,7 @@ enum LINUX_MIB_TLSRXSW, /* TlsRxSw */ LINUX_MIB_TLSTXDEVICE, /* TlsTxDevice */ LINUX_MIB_TLSRXDEVICE, /* TlsRxDevice */ + LINUX_MIB_TLSDECRYPTERROR, /* TlsDecryptError */ __LINUX_MIB_TLSMAX }; -- cgit v1.2.3 From a4d26fdbc2a5414bb1b67198656cc7e24a4a3c3a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Oct 2019 16:19:27 -0700 Subject: net/tls: add TlsDeviceRxResync statistic Add a statistic for number of RX resyncs sent down to the NIC. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/uapi/linux/snmp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index c9e4963e26f0..7eee233e78d2 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -336,6 +336,7 @@ enum LINUX_MIB_TLSTXDEVICE, /* TlsTxDevice */ LINUX_MIB_TLSRXDEVICE, /* TlsRxDevice */ LINUX_MIB_TLSDECRYPTERROR, /* TlsDecryptError */ + LINUX_MIB_TLSRXDEVICERESYNC, /* TlsRxDeviceResync */ __LINUX_MIB_TLSMAX }; -- cgit v1.2.3 From 5f0e5412781b01708f622d00c0b3f77b9dca7367 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 6 Oct 2019 20:07:36 -0700 Subject: uapi/bpf: fix helper docs Various small fixes to BPF helper documentation comments, enabling automatic header generation with a list of BPF helpers. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 77c6be96d676..a65c3b0c6935 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -794,7 +794,7 @@ union bpf_attr { * A 64-bit integer containing the current GID and UID, and * created as such: *current_gid* **<< 32 \|** *current_uid*. * - * int bpf_get_current_comm(char *buf, u32 size_of_buf) + * int bpf_get_current_comm(void *buf, u32 size_of_buf) * Description * Copy the **comm** attribute of the current task into *buf* of * *size_of_buf*. The **comm** attribute contains the name of @@ -1023,7 +1023,7 @@ union bpf_attr { * The realm of the route for the packet associated to *skb*, or 0 * if none was found. * - * int bpf_perf_event_output(struct pt_regs *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * int bpf_perf_event_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) * Description * Write raw *data* blob into a special BPF perf event held by * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf @@ -1068,7 +1068,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len) + * int bpf_skb_load_bytes(const void *skb, u32 offset, void *to, u32 len) * Description * This helper was provided as an easy way to load data from a * packet. It can be used to load *len* bytes from *offset* from @@ -1085,7 +1085,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_get_stackid(struct pt_regs *ctx, struct bpf_map *map, u64 flags) + * int bpf_get_stackid(void *ctx, struct bpf_map *map, u64 flags) * Description * Walk a user or a kernel stack and return its id. To achieve * this, the helper needs *ctx*, which is a pointer to the context @@ -1154,7 +1154,7 @@ union bpf_attr { * The checksum result, or a negative error code in case of * failure. * - * int bpf_skb_get_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size) + * int bpf_skb_get_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) * Description * Retrieve tunnel options metadata for the packet associated to * *skb*, and store the raw tunnel option data to the buffer *opt* @@ -1172,7 +1172,7 @@ union bpf_attr { * Return * The size of the option data retrieved. * - * int bpf_skb_set_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size) + * int bpf_skb_set_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) * Description * Set tunnel options metadata for the packet associated to *skb* * to the option data contained in the raw buffer *opt* of *size*. @@ -1511,7 +1511,7 @@ union bpf_attr { * Return * 0 * - * int bpf_setsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, char *optval, int optlen) + * int bpf_setsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, void *optval, int optlen) * Description * Emulate a call to **setsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1595,7 +1595,7 @@ union bpf_attr { * Return * **XDP_REDIRECT** on success, or **XDP_ABORTED** on error. * - * int bpf_sk_redirect_map(struct bpf_map *map, u32 key, u64 flags) + * int bpf_sk_redirect_map(struct sk_buff *skb, struct bpf_map *map, u32 key, u64 flags) * Description * Redirect the packet to the socket referenced by *map* (of type * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and @@ -1715,7 +1715,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_getsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, char *optval, int optlen) + * int bpf_getsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, void *optval, int optlen) * Description * Emulate a call to **getsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1947,7 +1947,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_get_stack(struct pt_regs *regs, void *buf, u32 size, u64 flags) + * int bpf_get_stack(void *ctx, void *buf, u32 size, u64 flags) * Description * Return a user or a kernel stack in bpf program provided buffer. * To achieve this, the helper needs *ctx*, which is a pointer @@ -1980,7 +1980,7 @@ union bpf_attr { * A non-negative value equal to or less than *size* on success, * or a negative error in case of failure. * - * int bpf_skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header) + * int bpf_skb_load_bytes_relative(const void *skb, u32 offset, void *to, u32 len, u32 start_header) * Description * This helper is similar to **bpf_skb_load_bytes**\ () in that * it provides an easy way to load *len* bytes from *offset* @@ -2033,7 +2033,7 @@ union bpf_attr { * * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the * packet is not forwarded or needs assist from full stack * - * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags) + * int bpf_sock_hash_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) * Description * Add an entry to, or update a sockhash *map* referencing sockets. * The *skops* is used as a new value for the entry associated to @@ -2392,7 +2392,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_msg_push_data(struct sk_buff *skb, u32 start, u32 len, u64 flags) + * int bpf_msg_push_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) * Description * For socket policies, insert *len* bytes into *msg* at offset * *start*. @@ -2408,9 +2408,9 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 pop, u64 flags) + * int bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) * Description - * Will remove *pop* bytes from a *msg* starting at byte *start*. + * Will remove *len* bytes from a *msg* starting at byte *start*. * This may result in **ENOMEM** errors under certain situations if * an allocation and copy are required due to a full ring buffer. * However, the helper will try to avoid doing the allocation @@ -2505,7 +2505,7 @@ union bpf_attr { * A **struct bpf_tcp_sock** pointer on success, or **NULL** in * case of failure. * - * int bpf_skb_ecn_set_ce(struct sk_buf *skb) + * int bpf_skb_ecn_set_ce(struct sk_buff *skb) * Description * Set ECN (Explicit Congestion Notification) field of IP header * to **CE** (Congestion Encountered) if current value is **ECT** -- cgit v1.2.3 From b6e6b5f1da7e8d092f86a4351802c27c0170c5a5 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 8 Oct 2019 19:27:36 +0800 Subject: sctp: add SCTP_SEND_FAILED_EVENT event This patch is to add a new event SCTP_SEND_FAILED_EVENT described in rfc6458#section-6.1.11. It's a update of SCTP_SEND_FAILED event: struct sctp_sndrcvinfo ssf_info is replaced with struct sctp_sndinfo ssfe_info in struct sctp_send_failed_event. SCTP_SEND_FAILED is being deprecated, but we don't remove it in this patch. Both are being processed in sctp_datamsg_destroy() when the corresp event flag is set. Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: Jakub Kicinski --- include/uapi/linux/sctp.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 6d5b164af55c..6bce7f9837a9 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -449,6 +449,16 @@ struct sctp_send_failed { __u8 ssf_data[0]; }; +struct sctp_send_failed_event { + __u16 ssf_type; + __u16 ssf_flags; + __u32 ssf_length; + __u32 ssf_error; + struct sctp_sndinfo ssfe_info; + sctp_assoc_t ssf_assoc_id; + __u8 ssf_data[0]; +}; + /* * ssf_flags: 16 bits (unsigned integer) * @@ -605,6 +615,7 @@ struct sctp_event_subscribe { __u8 sctp_stream_reset_event; __u8 sctp_assoc_reset_event; __u8 sctp_stream_change_event; + __u8 sctp_send_failure_event_event; }; /* @@ -632,6 +643,7 @@ union sctp_notification { struct sctp_stream_reset_event sn_strreset_event; struct sctp_assoc_reset_event sn_assocreset_event; struct sctp_stream_change_event sn_strchange_event; + struct sctp_send_failed_event sn_send_failed_event; }; /* Section 5.3.1 @@ -667,7 +679,9 @@ enum sctp_sn_type { #define SCTP_ASSOC_RESET_EVENT SCTP_ASSOC_RESET_EVENT SCTP_STREAM_CHANGE_EVENT, #define SCTP_STREAM_CHANGE_EVENT SCTP_STREAM_CHANGE_EVENT - SCTP_SN_TYPE_MAX = SCTP_STREAM_CHANGE_EVENT, + SCTP_SEND_FAILED_EVENT, +#define SCTP_SEND_FAILED_EVENT SCTP_SEND_FAILED_EVENT + SCTP_SN_TYPE_MAX = SCTP_SEND_FAILED_EVENT, #define SCTP_SN_TYPE_MAX SCTP_SN_TYPE_MAX }; -- cgit v1.2.3 From 14af7fd1d4279c8db7fbbb3ca0df3b13179eb502 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sat, 12 Oct 2019 18:27:57 +0200 Subject: ethtool: Add support for 400Gbps (50Gbps per lane) link modes Add support for 400Gbps speed, link modes of 50Gbps per lane Signed-off-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/uapi/linux/ethtool.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 8938b76c4ee3..d4591792f0b4 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1507,6 +1507,11 @@ enum ethtool_link_mode_bit_indices { ETHTOOL_LINK_MODE_200000baseCR4_Full_BIT = 66, ETHTOOL_LINK_MODE_100baseT1_Full_BIT = 67, ETHTOOL_LINK_MODE_1000baseT1_Full_BIT = 68, + ETHTOOL_LINK_MODE_400000baseKR8_Full_BIT = 69, + ETHTOOL_LINK_MODE_400000baseSR8_Full_BIT = 70, + ETHTOOL_LINK_MODE_400000baseLR8_ER8_FR8_Full_BIT = 71, + ETHTOOL_LINK_MODE_400000baseDR8_Full_BIT = 72, + ETHTOOL_LINK_MODE_400000baseCR8_Full_BIT = 73, /* must be last entry */ __ETHTOOL_LINK_MODE_MASK_NBITS @@ -1618,6 +1623,7 @@ enum ethtool_link_mode_bit_indices { #define SPEED_56000 56000 #define SPEED_100000 100000 #define SPEED_200000 200000 +#define SPEED_400000 400000 #define SPEED_UNKNOWN -1 -- cgit v1.2.3 From ccfe29eb29c2edcea6552072ef00ff4117f53e83 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 15 Oct 2019 20:24:58 -0700 Subject: bpf: Add attach_btf_id attribute to program load Add attach_btf_id attribute to prog_load command. It's similar to existing expected_attach_type attribute which is used in several cgroup based program types. Unfortunately expected_attach_type is ignored for tracing programs and cannot be reused for new purpose. Hence introduce attach_btf_id to verify bpf programs against given in-kernel BTF type id at load time. It is strictly checked to be valid for raw_tp programs only. In a later patches it will become: btf_id == 0 semantics of existing raw_tp progs. btd_id > 0 raw_tp with BTF and additional type safety. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191016032505.2089704-5-ast@kernel.org --- include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a65c3b0c6935..3bb2cd1de341 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -420,6 +420,7 @@ union bpf_attr { __u32 line_info_rec_size; /* userspace bpf_line_info size */ __aligned_u64 line_info; /* line info */ __u32 line_info_cnt; /* number of bpf_line_info records */ + __u32 attach_btf_id; /* in-kernel BTF type id to attach to */ }; struct { /* anonymous struct used by BPF_OBJ_* commands */ -- cgit v1.2.3 From a7658e1a4164ce2b9eb4a11aadbba38586e93bd6 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 15 Oct 2019 20:25:04 -0700 Subject: bpf: Check types of arguments passed into helpers Introduce new helper that reuses existing skb perf_event output implementation, but can be called from raw_tracepoint programs that receive 'struct sk_buff *' as tracepoint argument or can walk other kernel data structures to skb pointer. In order to do that teach verifier to resolve true C types of bpf helpers into in-kernel BTF ids. The type of kernel pointer passed by raw tracepoint into bpf program will be tracked by the verifier all the way until it's passed into helper function. For example: kfree_skb() kernel function calls trace_kfree_skb(skb, loc); bpf programs receives that skb pointer and may eventually pass it into bpf_skb_output() bpf helper which in-kernel is implemented via bpf_skb_event_output() kernel function. Its first argument in the kernel is 'struct sk_buff *'. The verifier makes sure that types match all the way. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191016032505.2089704-11-ast@kernel.org --- include/uapi/linux/bpf.h | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3bb2cd1de341..4af8b0819a32 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2751,6 +2751,30 @@ union bpf_attr { * **-EOPNOTSUPP** kernel configuration does not enable SYN cookies * * **-EPROTONOSUPPORT** IP packet version is not 4 or 6 + * + * int bpf_skb_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * Description + * Write raw *data* blob into a special BPF perf event held by + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf + * event must have the following attributes: **PERF_SAMPLE_RAW** + * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and + * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. + * + * The *flags* are used to indicate the index in *map* for which + * the value must be put, masked with **BPF_F_INDEX_MASK**. + * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** + * to indicate that the index of the current CPU core should be + * used. + * + * The value to write, of *size*, is passed through eBPF stack and + * pointed by *data*. + * + * *ctx* is a pointer to in-kernel struct sk_buff. + * + * This helper is similar to **bpf_perf_event_output**\ () but + * restricted to raw_tracepoint bpf programs. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2863,7 +2887,8 @@ union bpf_attr { FN(sk_storage_get), \ FN(sk_storage_delete), \ FN(send_signal), \ - FN(tcp_gen_syncookie), + FN(tcp_gen_syncookie), \ + FN(skb_output), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From 1a9167a214f560a23c5050ce6dfebae489528f0d Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Wed, 19 Jun 2019 13:01:27 -0300 Subject: KVM: PPC: Report single stepping capability When calling the KVM_SET_GUEST_DEBUG ioctl, userspace might request the next instruction to be single stepped via the KVM_GUESTDBG_SINGLESTEP control bit of the kvm_guest_debug structure. This patch adds the KVM_CAP_PPC_GUEST_DEBUG_SSTEP capability in order to inform userspace about the state of single stepping support. We currently don't have support for guest single stepping implemented in Book3S HV so the capability is only present for Book3S PR and BookE. Signed-off-by: Fabiano Rosas Signed-off-by: Paul Mackerras --- include/uapi/linux/kvm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 52641d8ca9e8..ce8cfcc51aec 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1000,6 +1000,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_PMU_EVENT_FILTER 173 #define KVM_CAP_ARM_IRQ_LINE_LAYOUT_2 174 #define KVM_CAP_HYPERV_DIRECT_TLBFLUSH 175 +#define KVM_CAP_PPC_GUEST_DEBUG_SSTEP 176 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From c726200dd106d4c58a281eea7159b8ba28a4ab34 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Fri, 11 Oct 2019 13:07:05 +0200 Subject: KVM: arm/arm64: Allow reporting non-ISV data aborts to userspace For a long time, if a guest accessed memory outside of a memslot using any of the load/store instructions in the architecture which doesn't supply decoding information in the ESR_EL2 (the ISV bit is not set), the kernel would print the following message and terminate the VM as a result of returning -ENOSYS to userspace: load/store instruction decoding not implemented The reason behind this message is that KVM assumes that all accesses outside a memslot is an MMIO access which should be handled by userspace, and we originally expected to eventually implement some sort of decoding of load/store instructions where the ISV bit was not set. However, it turns out that many of the instructions which don't provide decoding information on abort are not safe to use for MMIO accesses, and the remaining few that would potentially make sense to use on MMIO accesses, such as those with register writeback, are not used in practice. It also turns out that fetching an instruction from guest memory can be a pretty horrible affair, involving stopping all CPUs on SMP systems, handling multiple corner cases of address translation in software, and more. It doesn't appear likely that we'll ever implement this in the kernel. What is much more common is that a user has misconfigured his/her guest and is actually not accessing an MMIO region, but just hitting some random hole in the IPA space. In this scenario, the error message above is almost misleading and has led to a great deal of confusion over the years. It is, nevertheless, ABI to userspace, and we therefore need to introduce a new capability that userspace explicitly enables to change behavior. This patch introduces KVM_CAP_ARM_NISV_TO_USER (NISV meaning Non-ISV) which does exactly that, and introduces a new exit reason to report the event to userspace. User space can then emulate an exception to the guest, restart the guest, suspend the guest, or take any other appropriate action as per the policy of the running system. Reported-by: Heinrich Schuchardt Signed-off-by: Christoffer Dall Reviewed-by: Alexander Graf Signed-off-by: Marc Zyngier --- include/uapi/linux/kvm.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 52641d8ca9e8..7336ee8d98d7 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -235,6 +235,7 @@ struct kvm_hyperv_exit { #define KVM_EXIT_S390_STSI 25 #define KVM_EXIT_IOAPIC_EOI 26 #define KVM_EXIT_HYPERV 27 +#define KVM_EXIT_ARM_NISV 28 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -394,6 +395,11 @@ struct kvm_run { } eoi; /* KVM_EXIT_HYPERV */ struct kvm_hyperv_exit hyperv; + /* KVM_EXIT_ARM_NISV */ + struct { + __u64 esr_iss; + __u64 fault_ipa; + } arm_nisv; /* Fix the size of the union. */ char padding[256]; }; @@ -1000,6 +1006,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_PMU_EVENT_FILTER 173 #define KVM_CAP_ARM_IRQ_LINE_LAYOUT_2 174 #define KVM_CAP_HYPERV_DIRECT_TLBFLUSH 175 +#define KVM_CAP_ARM_NISV_TO_USER 176 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From da345174ceca052469e4775e4ae263b5f27a9355 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Fri, 11 Oct 2019 13:07:06 +0200 Subject: KVM: arm/arm64: Allow user injection of external data aborts In some scenarios, such as buggy guest or incorrect configuration of the VMM and firmware description data, userspace will detect a memory access to a portion of the IPA, which is not mapped to any MMIO region. For this purpose, the appropriate action is to inject an external abort to the guest. The kernel already has functionality to inject an external abort, but we need to wire up a signal from user space that lets user space tell the kernel to do this. It turns out, we already have the set event functionality which we can perfectly reuse for this. Signed-off-by: Christoffer Dall Signed-off-by: Marc Zyngier --- include/uapi/linux/kvm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 7336ee8d98d7..65db5a4257ec 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1007,6 +1007,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_ARM_IRQ_LINE_LAYOUT_2 174 #define KVM_CAP_HYPERV_DIRECT_TLBFLUSH 175 #define KVM_CAP_ARM_NISV_TO_USER 176 +#define KVM_CAP_ARM_INJECT_EXT_DABT 177 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From 58772e9a3db72d032eeb12bc011bc5184a3925f4 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Mon, 21 Oct 2019 16:28:20 +0100 Subject: KVM: arm64: Provide VCPU attributes for stolen time Allow user space to inform the KVM host where in the physical memory map the paravirtualized time structures should be located. User space can set an attribute on the VCPU providing the IPA base address of the stolen time structure for that VCPU. This must be repeated for every VCPU in the VM. The address is given in terms of the physical address visible to the guest and must be 64 byte aligned. The guest will discover the address via a hypercall. Signed-off-by: Steven Price Signed-off-by: Marc Zyngier --- include/uapi/linux/kvm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 52641d8ca9e8..a540c8357049 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1227,6 +1227,8 @@ enum kvm_device_type { #define KVM_DEV_TYPE_ARM_VGIC_ITS KVM_DEV_TYPE_ARM_VGIC_ITS KVM_DEV_TYPE_XIVE, #define KVM_DEV_TYPE_XIVE KVM_DEV_TYPE_XIVE + KVM_DEV_TYPE_ARM_PV_TIME, +#define KVM_DEV_TYPE_ARM_PV_TIME KVM_DEV_TYPE_ARM_PV_TIME KVM_DEV_TYPE_MAX, }; -- cgit v1.2.3 From b612e5df4587c934bd056bf05f4a1deca4de4f75 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 14 Oct 2019 12:45:37 +0200 Subject: clone3: add CLONE_CLEAR_SIGHAND Reset all signal handlers of the child not set to SIG_IGN to SIG_DFL. Mutually exclusive with CLONE_SIGHAND to not disturb other thread's signal handler. In the spirit of closer cooperation between glibc developers and kernel developers (cf. [2]) this patchset came out of a discussion on the glibc mailing list for improving posix_spawn() (cf. [1], [3], [4]). Kernel support for this feature has been explicitly requested by glibc and I see no reason not to help them with this. The child helper process on Linux posix_spawn must ensure that no signal handlers are enabled, so the signal disposition must be either SIG_DFL or SIG_IGN. However, it requires a sigprocmask to obtain the current signal mask and at least _NSIG sigaction calls to reset the signal handlers for each posix_spawn call or complex state tracking that might lead to data corruption in glibc. Adding this flags lets glibc avoid these problems. [1]: https://www.sourceware.org/ml/libc-alpha/2019-10/msg00149.html [3]: https://www.sourceware.org/ml/libc-alpha/2019-10/msg00158.html [4]: https://www.sourceware.org/ml/libc-alpha/2019-10/msg00160.html [2]: https://lwn.net/Articles/799331/ '[...] by asking for better cooperation with the C-library projects in general. They should be copied on patches containing ABI changes, for example. I noted that there are often times where C-library developers wish the kernel community had done things differently; how could those be avoided in the future? Members of the audience suggested that more glibc developers should perhaps join the linux-api list. The other suggestion was to "copy Florian on everything".' Cc: Florian Weimer Cc: libc-alpha@sourceware.org Cc: linux-api@vger.kernel.org Signed-off-by: Christian Brauner Reviewed-by: Oleg Nesterov Link: https://lore.kernel.org/r/20191014104538.3096-1-christian.brauner@ubuntu.com --- include/uapi/linux/sched.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 99335e1f4a27..1d500ed03c63 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -33,6 +33,9 @@ #define CLONE_NEWNET 0x40000000 /* New network namespace */ #define CLONE_IO 0x80000000 /* Clone io context */ +/* Flags for the clone3() syscall. */ +#define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */ + #ifndef __ASSEMBLY__ /** * struct clone_args - arguments for the clone3 syscall -- cgit v1.2.3 From d54725cd11a57c30f650260cfb0a92c268bdc3e0 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 16 Oct 2019 14:30:05 +0200 Subject: netfilter: nf_tables: support for multiple devices per netdev hook This patch allows you to register one netdev basechain to multiple devices. This adds a new NFTA_HOOK_DEVS netlink attribute to specify the list of netdevices. Basechains store a list of hooks. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index ed8881ad18ed..81fed16fe2b2 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -144,12 +144,14 @@ enum nft_list_attributes { * @NFTA_HOOK_HOOKNUM: netfilter hook number (NLA_U32) * @NFTA_HOOK_PRIORITY: netfilter hook priority (NLA_U32) * @NFTA_HOOK_DEV: netdevice name (NLA_STRING) + * @NFTA_HOOK_DEVS: list of netdevices (NLA_NESTED) */ enum nft_hook_attributes { NFTA_HOOK_UNSPEC, NFTA_HOOK_HOOKNUM, NFTA_HOOK_PRIORITY, NFTA_HOOK_DEV, + NFTA_HOOK_DEVS, __NFTA_HOOK_MAX }; #define NFTA_HOOK_MAX (__NFTA_HOOK_MAX - 1) -- cgit v1.2.3 From 1d55fdc85799372ab3b0d2a6928e73439f8149aa Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Thu, 17 Oct 2019 22:35:11 +0000 Subject: crypto: ccp - Retry SEV INIT command in case of integrity check failure. SEV INIT command loads the SEV related persistent data from NVS and initializes the platform context. The firmware validates the persistent state. If validation fails, the firmware will reset the persisent state and return an integrity check failure status. At this point, a subsequent INIT command should succeed, so retry the command. The INIT command retry is only done during driver initialization. Additional enums along with SEV_RET_SECURE_DATA_INVALID are added to sev_ret_code to maintain continuity and relevance of enum values. Signed-off-by: Ashish Kalra Acked-by: David Rientjes Reviewed-by: Brijesh Singh Signed-off-by: Herbert Xu --- include/uapi/linux/psp-sev.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/psp-sev.h b/include/uapi/linux/psp-sev.h index 592a0c1b77c9..0549a5c622bf 100644 --- a/include/uapi/linux/psp-sev.h +++ b/include/uapi/linux/psp-sev.h @@ -58,6 +58,9 @@ typedef enum { SEV_RET_HWSEV_RET_PLATFORM, SEV_RET_HWSEV_RET_UNSAFE, SEV_RET_UNSUPPORTED, + SEV_RET_INVALID_PARAM, + SEV_RET_RESOURCE_LIMIT, + SEV_RET_SECURE_DATA_INVALID, SEV_RET_MAX, } sev_ret_code; -- cgit v1.2.3 From 9a7f12edf8848a9b355a86fc0183d7be932ef11e Mon Sep 17 00:00:00 2001 From: Eugene Syromiatnikov Date: Fri, 20 Sep 2019 17:58:21 +0200 Subject: fcntl: fix typo in RWH_WRITE_LIFE_NOT_SET r/w hint name According to commit message in the original commit c75b1d9421f8 ("fs: add fcntl() interface for setting/getting write life time hints"), as well as userspace library[1] and man page update[2], R/W hint constants are intended to have RWH_* prefix. However, RWF_WRITE_LIFE_NOT_SET retained "RWF_*" prefix used in the early versions of the proposed patch set[3]. Rename it and provide the old name as a synonym for the new one for backward compatibility. [1] https://github.com/axboe/fio/commit/bd553af6c849 [2] https://github.com/mkerrisk/man-pages/commit/580082a186fd [3] https://www.mail-archive.com/linux-block@vger.kernel.org/msg09638.html Fixes: c75b1d9421f8 ("fs: add fcntl() interface for setting/getting write life time hints") Acked-by: Song Liu Signed-off-by: Eugene Syromiatnikov Signed-off-by: Jens Axboe --- include/uapi/linux/fcntl.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index 1d338357df8a..1f97b33c840e 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -58,13 +58,20 @@ * Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be * used to clear any hints previously set. */ -#define RWF_WRITE_LIFE_NOT_SET 0 +#define RWH_WRITE_LIFE_NOT_SET 0 #define RWH_WRITE_LIFE_NONE 1 #define RWH_WRITE_LIFE_SHORT 2 #define RWH_WRITE_LIFE_MEDIUM 3 #define RWH_WRITE_LIFE_LONG 4 #define RWH_WRITE_LIFE_EXTREME 5 +/* + * The originally introduced spelling is remained from the first + * versions of the patch set that introduced the feature, see commit + * v4.13-rc1~212^2~51. + */ +#define RWF_WRITE_LIFE_NOT_SET RWH_WRITE_LIFE_NOT_SET + /* * Types of directory notifications that may be requested. */ -- cgit v1.2.3 From 480274787d7e3458bc5a7cfbbbe07033984ad711 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Wed, 23 Oct 2019 11:09:26 -0400 Subject: tcp: add TCP_INFO status for failed client TFO The TCPI_OPT_SYN_DATA bit as part of tcpi_options currently reports whether or not data-in-SYN was ack'd on both the client and server side. We'd like to gather more information on the client-side in the failure case in order to indicate the reason for the failure. This can be useful for not only debugging TFO, but also for creating TFO socket policies. For example, if a middle box removes the TFO option or drops a data-in-SYN, we can can detect this case, and turn off TFO for these connections saving the extra retransmits. The newly added tcpi_fastopen_client_fail status is 2 bits and has the following 4 states: 1) TFO_STATUS_UNSPEC Catch-all state which includes when TFO is disabled via black hole detection, which is indicated via LINUX_MIB_TCPFASTOPENBLACKHOLE. 2) TFO_COOKIE_UNAVAILABLE If TFO_CLIENT_NO_COOKIE mode is off, this state indicates that no cookie is available in the cache. 3) TFO_DATA_NOT_ACKED Data was sent with SYN, we received a SYN/ACK but it did not cover the data portion. Cookie is not accepted by server because the cookie may be invalid or the server may be overloaded. 4) TFO_SYN_RETRANSMITTED Data was sent with SYN, we received a SYN/ACK which did not cover the data after at least 1 additional SYN was sent (without data). It may be the case that a middle-box is dropping data-in-SYN packets. Thus, it would be more efficient to not use TFO on this connection to avoid extra retransmits during connection establishment. These new fields do not cover all the cases where TFO may fail, but other failures, such as SYN/ACK + data being dropped, will result in the connection not becoming established. And a connection blackhole after session establishment shows up as a stalled connection. Signed-off-by: Jason Baron Cc: Eric Dumazet Cc: Neal Cardwell Cc: Christoph Paasch Cc: Yuchung Cheng Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- include/uapi/linux/tcp.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 81e697978e8b..74af1f759cee 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -155,6 +155,14 @@ enum { TCP_QUEUES_NR, }; +/* why fastopen failed from client perspective */ +enum tcp_fastopen_client_fail { + TFO_STATUS_UNSPEC, /* catch-all */ + TFO_COOKIE_UNAVAILABLE, /* if not in TFO_CLIENT_NO_COOKIE mode */ + TFO_DATA_NOT_ACKED, /* SYN-ACK did not ack SYN data */ + TFO_SYN_RETRANSMITTED, /* SYN-ACK did not ack SYN data after timeout */ +}; + /* for TCP_INFO socket option */ #define TCPI_OPT_TIMESTAMPS 1 #define TCPI_OPT_SACK 2 @@ -211,7 +219,7 @@ struct tcp_info { __u8 tcpi_backoff; __u8 tcpi_options; __u8 tcpi_snd_wscale : 4, tcpi_rcv_wscale : 4; - __u8 tcpi_delivery_rate_app_limited:1; + __u8 tcpi_delivery_rate_app_limited:1, tcpi_fastopen_client_fail:2; __u32 tcpi_rto; __u32 tcpi_ato; -- cgit v1.2.3 From c199ce4f9dd896c716aece33e6750be34aea1151 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 24 Oct 2019 17:22:01 +0200 Subject: net: Fix misspellings of "configure" and "configuration" Fix various misspellings of "configuration" and "configure". Signed-off-by: Geert Uytterhoeven Acked-by: Kalle Valo Signed-off-by: David S. Miller --- include/uapi/linux/dcbnl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/dcbnl.h b/include/uapi/linux/dcbnl.h index 69df19aa8e72..a791a94013a6 100644 --- a/include/uapi/linux/dcbnl.h +++ b/include/uapi/linux/dcbnl.h @@ -286,7 +286,7 @@ struct dcbmsg { * @DCB_CMD_GNUMTCS: get the number of traffic classes currently supported * @DCB_CMD_SNUMTCS: set the number of traffic classes * @DCB_CMD_GBCN: set backward congestion notification configuration - * @DCB_CMD_SBCN: get backward congestion notification configration. + * @DCB_CMD_SBCN: get backward congestion notification configuration. * @DCB_CMD_GAPP: get application protocol configuration * @DCB_CMD_SAPP: set application protocol configuration * @DCB_CMD_IEEE_SET: set IEEE 802.1Qaz configuration -- cgit v1.2.3 From c3a31e605620c279163c14068a60869ea3fda203 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 3 Oct 2019 13:59:56 -0600 Subject: io_uring: add support for IORING_REGISTER_FILES_UPDATE Allows the application to remove/replace/add files to/from a file set. Passes in a struct: struct io_uring_files_update { __u32 offset; __s32 *fds; }; that holds an array of fds, size of array passed in through the usual nr_args part of the io_uring_register() system call. The logic is as follows: 1) If ->fds[i] is -1, the existing file at i + ->offset is removed from the set. 2) If ->fds[i] is a valid fd, the existing file at i + ->offset is replaced with ->fds[i]. For case #2, is the existing file is currently empty (fd == -1), the new fd is simply added to the array. Reviewed-by: Jeff Moyer Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index ea57526a5b89..4f532d9c0554 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -150,5 +150,11 @@ struct io_uring_params { #define IORING_UNREGISTER_FILES 3 #define IORING_REGISTER_EVENTFD 4 #define IORING_UNREGISTER_EVENTFD 5 +#define IORING_REGISTER_FILES_UPDATE 6 + +struct io_uring_files_update { + __u32 offset; + __s32 *fds; +}; #endif -- cgit v1.2.3 From 33a107f0a1b8df0ad925e39d8afc97bb78e0cec1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 4 Oct 2019 12:10:03 -0600 Subject: io_uring: allow application controlled CQ ring size We currently size the CQ ring as twice the SQ ring, to allow some flexibility in not overflowing the CQ ring. This is done because the SQE life time is different than that of the IO request itself, the SQE is consumed as soon as the kernel has seen the entry. Certain application don't need a huge SQ ring size, since they just submit IO in batches. But they may have a lot of requests pending, and hence need a big CQ ring to hold them all. By allowing the application to control the CQ ring size multiplier, we can cater to those applications more efficiently. If an application wants to define its own CQ ring size, it must set IORING_SETUP_CQSIZE in the setup flags, and fill out io_uring_params->cq_entries. The value must be a power of two. Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 4f532d9c0554..e0137ea6ad79 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -50,6 +50,7 @@ struct io_uring_sqe { #define IORING_SETUP_IOPOLL (1U << 0) /* io_context is polled */ #define IORING_SETUP_SQPOLL (1U << 1) /* SQ poll thread */ #define IORING_SETUP_SQ_AFF (1U << 2) /* sq_thread_cpu is valid */ +#define IORING_SETUP_CQSIZE (1U << 3) /* app defines CQ size */ #define IORING_OP_NOP 0 #define IORING_OP_READV 1 -- cgit v1.2.3 From a41525ab2e75987e809926352ebc6f1397da900e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 15 Oct 2019 16:48:15 -0600 Subject: io_uring: add support for absolute timeouts This is a pretty trivial addition on top of the relative timeouts we have now, but it's handy for ensuring tighter timing for those that are building scheduling primitives on top of io_uring. Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index e0137ea6ad79..b402dfee5e15 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -70,6 +70,11 @@ struct io_uring_sqe { */ #define IORING_FSYNC_DATASYNC (1U << 0) +/* + * sqe->timeout_flags + */ +#define IORING_TIMEOUT_ABS (1U << 0) + /* * IO completion data structure (Completion Queue Entry) */ -- cgit v1.2.3 From 11365043e5271fea4c92189a976833da477a3a44 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 16 Oct 2019 09:08:32 -0600 Subject: io_uring: add support for canceling timeout requests We might have cases where the need for a specific timeout is gone, add support for canceling an existing timeout operation. This works like the POLL_REMOVE command, where the application passes in the user_data of the timeout it wishes to cancel in the sqe->addr field. Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index b402dfee5e15..6dc5ced1c37a 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -64,6 +64,7 @@ struct io_uring_sqe { #define IORING_OP_SENDMSG 9 #define IORING_OP_RECVMSG 10 #define IORING_OP_TIMEOUT 11 +#define IORING_OP_TIMEOUT_REMOVE 12 /* * sqe->fsync_flags -- cgit v1.2.3 From 17f2fe35d080d8f64e86a60cdcd3a97edcbc213b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 17 Oct 2019 14:42:58 -0600 Subject: io_uring: add support for IORING_OP_ACCEPT This allows an application to call accept4() in an async fashion. Like other opcodes, we first try a non-blocking accept, then punt to async context if we have to. Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 6dc5ced1c37a..f82d90e617a6 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -19,7 +19,10 @@ struct io_uring_sqe { __u8 flags; /* IOSQE_ flags */ __u16 ioprio; /* ioprio for the request */ __s32 fd; /* file descriptor to do IO on */ - __u64 off; /* offset into file */ + union { + __u64 off; /* offset into file */ + __u64 addr2; + }; __u64 addr; /* pointer to buffer or iovecs */ __u32 len; /* buffer size or number of iovecs */ union { @@ -29,6 +32,7 @@ struct io_uring_sqe { __u32 sync_range_flags; __u32 msg_flags; __u32 timeout_flags; + __u32 accept_flags; }; __u64 user_data; /* data to be passed back at completion time */ union { @@ -65,6 +69,7 @@ struct io_uring_sqe { #define IORING_OP_RECVMSG 10 #define IORING_OP_TIMEOUT 11 #define IORING_OP_TIMEOUT_REMOVE 12 +#define IORING_OP_ACCEPT 13 /* * sqe->fsync_flags -- cgit v1.2.3 From c0bceb97db9efc72629dd00cd0d9812f24d4ba2d Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Wed, 30 Oct 2019 14:00:41 +0100 Subject: tipc: add smart nagle feature We introduce a feature that works like a combination of TCP_NAGLE and TCP_CORK, but without some of the weaknesses of those. In particular, we will not observe long delivery delays because of delayed acks, since the algorithm itself decides if and when acks are to be sent from the receiving peer. - The nagle property as such is determined by manipulating a new 'maxnagle' field in struct tipc_sock. If certain conditions are met, 'maxnagle' will define max size of the messages which can be bundled. If it is set to zero no messages are ever bundled, implying that the nagle property is disabled. - A socket with the nagle property enabled enters nagle mode when more than 4 messages have been sent out without receiving any data message from the peer. - A socket leaves nagle mode whenever it receives a data message from the peer. In nagle mode, messages smaller than 'maxnagle' are accumulated in the socket write queue. The last buffer in the queue is marked with a new 'ack_required' bit, which forces the receiving peer to send a CONN_ACK message back to the sender upon reception. The accumulated contents of the write queue is transmitted when one of the following events or conditions occur. - A CONN_ACK message is received from the peer. - A data message is received from the peer. - A SOCK_WAKEUP pseudo message is received from the link level. - The write queue contains more than 64 1k blocks of data. - The connection is being shut down. - There is no CONN_ACK message to expect. I.e., there is currently no outstanding message where the 'ack_required' bit was set. As a consequence, the first message added after we enter nagle mode is always sent directly with this bit set. This new feature gives a 50-100% improvement of throughput for small (i.e., less than MTU size) messages, while it might add up to one RTT to latency time when the socket is in nagle mode. Acked-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- include/uapi/linux/tipc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h index 7df026ea6aff..76421b878767 100644 --- a/include/uapi/linux/tipc.h +++ b/include/uapi/linux/tipc.h @@ -191,6 +191,7 @@ struct sockaddr_tipc { #define TIPC_GROUP_JOIN 135 /* Takes struct tipc_group_req* */ #define TIPC_GROUP_LEAVE 136 /* No argument */ #define TIPC_SOCK_RECVQ_USED 137 /* Default: none (read only) */ +#define TIPC_NODELAY 138 /* Default: false */ /* * Flag values -- cgit v1.2.3 From abbb0d33632ce931ca9c814813ee131351f6b92f Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Wed, 30 Oct 2019 16:09:05 +0200 Subject: net: sched: extend TCA_ACT space with TCA_ACT_FLAGS Extend TCA_ACT space with nla_bitfield32 flags. Add TCA_ACT_FLAGS_NO_PERCPU_STATS as the only allowed flag. Parse the flags in tcf_action_init_1() and pass resulting value as additional argument to a_o->init(). Signed-off-by: Vlad Buslov Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index a6aa466fac9e..c6ad22f76ede 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -16,9 +16,14 @@ enum { TCA_ACT_STATS, TCA_ACT_PAD, TCA_ACT_COOKIE, + TCA_ACT_FLAGS, __TCA_ACT_MAX }; +#define TCA_ACT_FLAGS_NO_PERCPU_STATS 1 /* Don't use percpu allocator for + * actions stats. + */ + #define TCA_ACT_MAX __TCA_ACT_MAX #define TCA_OLD_COMPAT (TCA_ACT_MAX+1) #define TCA_ACT_MAX_PRIO 32 -- cgit v1.2.3 From f1b9509c2fb0ef4db8d22dac9aef8e856a5d81f6 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 30 Oct 2019 15:32:11 -0700 Subject: bpf: Replace prog_raw_tp+btf_id with prog_tracing The bpf program type raw_tp together with 'expected_attach_type' was the most appropriate api to indicate BTF-enabled raw_tp programs. But during development it became apparent that 'expected_attach_type' cannot be used and new 'attach_btf_id' field had to be introduced. Which means that the information is duplicated in two fields where one of them is ignored. Clean it up by introducing new program type where both 'expected_attach_type' and 'attach_btf_id' fields have specific meaning. In the future 'expected_attach_type' will be extended with other attach points that have similar semantics to raw_tp. This patch is replacing BTF-enabled BPF_PROG_TYPE_RAW_TRACEPOINT with prog_type = BPF_RPOG_TYPE_TRACING expected_attach_type = BPF_TRACE_RAW_TP attach_btf_id = btf_id of raw tracepoint inside the kernel Future patches will add expected_attach_type = BPF_TRACE_FENTRY or BPF_TRACE_FEXIT where programs have the same input context and the same helpers, but different attach points. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20191030223212.953010-2-ast@kernel.org --- include/uapi/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4af8b0819a32..a6bf19dabaab 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -173,6 +173,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_CGROUP_SYSCTL, BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, BPF_PROG_TYPE_CGROUP_SOCKOPT, + BPF_PROG_TYPE_TRACING, }; enum bpf_attach_type { @@ -199,6 +200,7 @@ enum bpf_attach_type { BPF_CGROUP_UDP6_RECVMSG, BPF_CGROUP_GETSOCKOPT, BPF_CGROUP_SETSOCKOPT, + BPF_TRACE_RAW_TP, __MAX_BPF_ATTACH_TYPE }; -- cgit v1.2.3 From 62755e35dfb2b113c52b81cd96d01c20971c8e02 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 28 Oct 2019 21:49:21 -0600 Subject: io_uring: support for generic async request cancel This adds support for IORING_OP_ASYNC_CANCEL, which will attempt to cancel requests that have been punted to async context and are now in-flight. This works for regular read/write requests to files, as long as they haven't been started yet. For socket based IO (or things like accept4(2)), we can cancel work that is already running as well. To cancel a request, the sqe must have ->addr set to the user_data of the request it wishes to cancel. If the request is cancelled successfully, the original request is completed with -ECANCELED and the cancel request is completed with a result of 0. If the request was already running, the original may or may not complete in error. The cancel request will complete with -EALREADY for that case. And finally, if the request to cancel wasn't found, the cancel request is completed with -ENOENT. Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index f82d90e617a6..6877cf8894db 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -33,6 +33,7 @@ struct io_uring_sqe { __u32 msg_flags; __u32 timeout_flags; __u32 accept_flags; + __u32 cancel_flags; }; __u64 user_data; /* data to be passed back at completion time */ union { @@ -70,6 +71,7 @@ struct io_uring_sqe { #define IORING_OP_TIMEOUT 11 #define IORING_OP_TIMEOUT_REMOVE 12 #define IORING_OP_ACCEPT 13 +#define IORING_OP_ASYNC_CANCEL 14 /* * sqe->fsync_flags -- cgit v1.2.3 From 6ae08ae3dea2cfa03dd3665a3c8475c2d429ef47 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 2 Nov 2019 00:17:59 +0100 Subject: bpf: Add probe_read_{user, kernel} and probe_read_{user, kernel}_str helpers The current bpf_probe_read() and bpf_probe_read_str() helpers are broken in that they assume they can be used for probing memory access for kernel space addresses /as well as/ user space addresses. However, plain use of probe_kernel_read() for both cases will attempt to always access kernel space address space given access is performed under KERNEL_DS and some archs in-fact have overlapping address spaces where a kernel pointer and user pointer would have the /same/ address value and therefore accessing application memory via bpf_probe_read{,_str}() would read garbage values. Lets fix BPF side by making use of recently added 3d7081822f7f ("uaccess: Add non-pagefault user-space read functions"). Unfortunately, the only way to fix this status quo is to add dedicated bpf_probe_read_{user,kernel}() and bpf_probe_read_{user,kernel}_str() helpers. The bpf_probe_read{,_str}() helpers are kept as-is to retain their current behavior. The two *_user() variants attempt the access always under USER_DS set, the two *_kernel() variants will -EFAULT when accessing user memory if the underlying architecture has non-overlapping address ranges, also avoiding throwing the kernel warning via 00c42373d397 ("x86-64: add warning for non-canonical user access address dereferences"). Fixes: a5e8c07059d0 ("bpf: add bpf_probe_read_str helper") Fixes: 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes") Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/796ee46e948bc808d54891a1108435f8652c6ca4.1572649915.git.daniel@iogearbox.net --- include/uapi/linux/bpf.h | 122 +++++++++++++++++++++++++++++++---------------- 1 file changed, 82 insertions(+), 40 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a6bf19dabaab..df6809a76404 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -563,10 +563,13 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_probe_read(void *dst, u32 size, const void *src) + * int bpf_probe_read(void *dst, u32 size, const void *unsafe_ptr) * Description * For tracing programs, safely attempt to read *size* bytes from - * address *src* and store the data in *dst*. + * kernel space address *unsafe_ptr* and store the data in *dst*. + * + * Generally, use bpf_probe_read_user() or bpf_probe_read_kernel() + * instead. * Return * 0 on success, or a negative error in case of failure. * @@ -1428,45 +1431,14 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_probe_read_str(void *dst, int size, const void *unsafe_ptr) + * int bpf_probe_read_str(void *dst, u32 size, const void *unsafe_ptr) * Description - * Copy a NUL terminated string from an unsafe address - * *unsafe_ptr* to *dst*. The *size* should include the - * terminating NUL byte. In case the string length is smaller than - * *size*, the target is not padded with further NUL bytes. If the - * string length is larger than *size*, just *size*-1 bytes are - * copied and the last byte is set to NUL. - * - * On success, the length of the copied string is returned. This - * makes this helper useful in tracing programs for reading - * strings, and more importantly to get its length at runtime. See - * the following snippet: - * - * :: - * - * SEC("kprobe/sys_open") - * void bpf_sys_open(struct pt_regs *ctx) - * { - * char buf[PATHLEN]; // PATHLEN is defined to 256 - * int res = bpf_probe_read_str(buf, sizeof(buf), - * ctx->di); - * - * // Consume buf, for example push it to - * // userspace via bpf_perf_event_output(); we - * // can use res (the string length) as event - * // size, after checking its boundaries. - * } - * - * In comparison, using **bpf_probe_read()** helper here instead - * to read the string would require to estimate the length at - * compile time, and would often result in copying more memory - * than necessary. + * Copy a NUL terminated string from an unsafe kernel address + * *unsafe_ptr* to *dst*. See bpf_probe_read_kernel_str() for + * more details. * - * Another useful use case is when parsing individual process - * arguments or individual environment variables navigating - * *current*\ **->mm->arg_start** and *current*\ - * **->mm->env_start**: using this helper and the return value, - * one can quickly iterate at the right offset of the memory area. + * Generally, use bpf_probe_read_user_str() or bpf_probe_read_kernel_str() + * instead. * Return * On success, the strictly positive length of the string, * including the trailing NUL character. On error, a negative @@ -2777,6 +2749,72 @@ union bpf_attr { * restricted to raw_tracepoint bpf programs. * Return * 0 on success, or a negative error in case of failure. + * + * int bpf_probe_read_user(void *dst, u32 size, const void *unsafe_ptr) + * Description + * Safely attempt to read *size* bytes from user space address + * *unsafe_ptr* and store the data in *dst*. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr) + * Description + * Safely attempt to read *size* bytes from kernel space address + * *unsafe_ptr* and store the data in *dst*. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_probe_read_user_str(void *dst, u32 size, const void *unsafe_ptr) + * Description + * Copy a NUL terminated string from an unsafe user address + * *unsafe_ptr* to *dst*. The *size* should include the + * terminating NUL byte. In case the string length is smaller than + * *size*, the target is not padded with further NUL bytes. If the + * string length is larger than *size*, just *size*-1 bytes are + * copied and the last byte is set to NUL. + * + * On success, the length of the copied string is returned. This + * makes this helper useful in tracing programs for reading + * strings, and more importantly to get its length at runtime. See + * the following snippet: + * + * :: + * + * SEC("kprobe/sys_open") + * void bpf_sys_open(struct pt_regs *ctx) + * { + * char buf[PATHLEN]; // PATHLEN is defined to 256 + * int res = bpf_probe_read_user_str(buf, sizeof(buf), + * ctx->di); + * + * // Consume buf, for example push it to + * // userspace via bpf_perf_event_output(); we + * // can use res (the string length) as event + * // size, after checking its boundaries. + * } + * + * In comparison, using **bpf_probe_read_user()** helper here + * instead to read the string would require to estimate the length + * at compile time, and would often result in copying more memory + * than necessary. + * + * Another useful use case is when parsing individual process + * arguments or individual environment variables navigating + * *current*\ **->mm->arg_start** and *current*\ + * **->mm->env_start**: using this helper and the return value, + * one can quickly iterate at the right offset of the memory area. + * Return + * On success, the strictly positive length of the string, + * including the trailing NUL character. On error, a negative + * value. + * + * int bpf_probe_read_kernel_str(void *dst, u32 size, const void *unsafe_ptr) + * Description + * Copy a NUL terminated string from an unsafe kernel address *unsafe_ptr* + * to *dst*. Same semantics as with bpf_probe_read_user_str() apply. + * Return + * On success, the strictly positive length of the string, including + * the trailing NUL character. On error, a negative value. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2890,7 +2928,11 @@ union bpf_attr { FN(sk_storage_delete), \ FN(send_signal), \ FN(tcp_gen_syncookie), \ - FN(skb_output), + FN(skb_output), \ + FN(probe_read_user), \ + FN(probe_read_kernel), \ + FN(probe_read_user_str), \ + FN(probe_read_kernel_str), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From 51f421c85c880dcb37df11e672b384eaa4444328 Mon Sep 17 00:00:00 2001 From: Revanth Rajashekar Date: Thu, 31 Oct 2019 10:13:21 -0600 Subject: block: sed-opal: Add support to read/write opal tables generically This feature gives the user RW access to any opal table with admin1 authority. The flags described in the new structure determines if the user wants to read/write the data. Flags are checked for valid values in order to allow future features to be added to the ioctl. The user can provide the desired table's UID. Also, the ioctl provides a size and offset field and internally will loop data accesses to return the full data block. Read overrun is prevented by the initiator's sec_send_recv() backend. The ioctl provides a private field with the intention to accommodate any future expansions to the ioctl. Reviewed-by: Scott Bauer Reviewed-by: Jon Derrick Signed-off-by: Revanth Rajashekar Signed-off-by: Jens Axboe --- include/uapi/linux/sed-opal.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index c6d035fa1b6c..6f5af1a84213 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -113,6 +113,25 @@ struct opal_shadow_mbr { __u64 size; }; +/* Opal table operations */ +enum opal_table_ops { + OPAL_READ_TABLE, + OPAL_WRITE_TABLE, +}; + +#define OPAL_UID_LENGTH 8 +struct opal_read_write_table { + struct opal_key key; + const __u64 data; + const __u8 table_uid[OPAL_UID_LENGTH]; + __u64 offset; + __u64 size; +#define OPAL_TABLE_READ (1 << OPAL_READ_TABLE) +#define OPAL_TABLE_WRITE (1 << OPAL_WRITE_TABLE) + __u64 flags; + __u64 priv; +}; + #define IOC_OPAL_SAVE _IOW('p', 220, struct opal_lock_unlock) #define IOC_OPAL_LOCK_UNLOCK _IOW('p', 221, struct opal_lock_unlock) #define IOC_OPAL_TAKE_OWNERSHIP _IOW('p', 222, struct opal_key) @@ -128,5 +147,6 @@ struct opal_shadow_mbr { #define IOC_OPAL_PSID_REVERT_TPR _IOW('p', 232, struct opal_key) #define IOC_OPAL_MBR_DONE _IOW('p', 233, struct opal_mbr_done) #define IOC_OPAL_WRITE_SHADOW_MBR _IOW('p', 234, struct opal_shadow_mbr) +#define IOC_OPAL_GENERIC_TABLE_RW _IOW('p', 235, struct opal_read_write_table) #endif /* _UAPI_SED_OPAL_H */ -- cgit v1.2.3 From b6520fce073b619e6f2c0d510bb3481c9386c70b Mon Sep 17 00:00:00 2001 From: Kristian Evensen Date: Thu, 26 Sep 2019 12:06:45 +0200 Subject: netfilter: ipset: Add wildcard support to net,iface The net,iface equal functions currently compares the full interface names. In several cases, wildcard (or prefix) matching is useful. For example, when converting a large iptables rule-set to make use of ipset, I was able to significantly reduce the number of set elements by making use of wildcard matching. Wildcard matching is enabled by adding "wildcard" when adding an element to a set. Internally, this causes the IPSET_FLAG_IFACE_WILDCARD-flag to be set. When this flag is set, only the initial part of the interface name is used for comparison. Wildcard matching is done per element and not per set, as there are many cases where mixing wildcard and non-wildcard elements are useful. This means that is up to the user to handle (avoid) overlapping interface names. Signed-off-by: Kristian Evensen Signed-off-by: Jozsef Kadlecsik --- include/uapi/linux/netfilter/ipset/ip_set.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/ipset/ip_set.h b/include/uapi/linux/netfilter/ipset/ip_set.h index eea166c52c36..11a72a938eb1 100644 --- a/include/uapi/linux/netfilter/ipset/ip_set.h +++ b/include/uapi/linux/netfilter/ipset/ip_set.h @@ -205,6 +205,8 @@ enum ipset_cadt_flags { IPSET_FLAG_WITH_FORCEADD = (1 << IPSET_FLAG_BIT_WITH_FORCEADD), IPSET_FLAG_BIT_WITH_SKBINFO = 6, IPSET_FLAG_WITH_SKBINFO = (1 << IPSET_FLAG_BIT_WITH_SKBINFO), + IPSET_FLAG_BIT_IFACE_WILDCARD = 7, + IPSET_FLAG_IFACE_WILDCARD = (1 << IPSET_FLAG_BIT_IFACE_WILDCARD), IPSET_FLAG_CADT_MAX = 15, }; -- cgit v1.2.3 From 4d390c287b2f3fbd0bb64c52c1a9418f790986e1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 4 Nov 2019 19:13:13 -0800 Subject: net_sched: do not export gnet_stats_basic_packed to uapi gnet_stats_basic_packed was really meant to be private kernel structure. If this proves to be a problem, we will have to rename the in-kernel version. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/uapi/linux/gen_stats.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/gen_stats.h b/include/uapi/linux/gen_stats.h index 065408e16a80..4eaacdf452e3 100644 --- a/include/uapi/linux/gen_stats.h +++ b/include/uapi/linux/gen_stats.h @@ -26,10 +26,6 @@ struct gnet_stats_basic { __u64 bytes; __u32 packets; }; -struct gnet_stats_basic_packed { - __u64 bytes; - __u32 packets; -} __attribute__ ((packed)); /** * struct gnet_stats_rate_est - rate estimator -- cgit v1.2.3 From b33e699fe43aa63f29113311f69357e119ef5276 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 4 Nov 2019 19:13:15 -0800 Subject: net_sched: add TCA_STATS_PKT64 attribute Now the kernel uses 64bit packet counters in scheduler layer, we want to export these counters to user space. Instead risking breaking user space by adding fields to struct gnet_stats_basic, add a new TCA_STATS_PKT64. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/uapi/linux/gen_stats.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/gen_stats.h b/include/uapi/linux/gen_stats.h index 4eaacdf452e3..852f234f1fd6 100644 --- a/include/uapi/linux/gen_stats.h +++ b/include/uapi/linux/gen_stats.h @@ -13,6 +13,7 @@ enum { TCA_STATS_RATE_EST64, TCA_STATS_PAD, TCA_STATS_BASIC_HW, + TCA_STATS_PKT64, __TCA_STATS_MAX, }; #define TCA_STATS_MAX (__TCA_STATS_MAX - 1) -- cgit v1.2.3 From b103fb7653fff09e7a6fb6ba9398a41584e7ae36 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 24 Oct 2019 14:54:36 -0700 Subject: fscrypt: add support for IV_INO_LBLK_64 policies Inline encryption hardware compliant with the UFS v2.1 standard or with the upcoming version of the eMMC standard has the following properties: (1) Per I/O request, the encryption key is specified by a previously loaded keyslot. There might be only a small number of keyslots. (2) Per I/O request, the starting IV is specified by a 64-bit "data unit number" (DUN). IV bits 64-127 are assumed to be 0. The hardware automatically increments the DUN for each "data unit" of configurable size in the request, e.g. for each filesystem block. Property (1) makes it inefficient to use the traditional fscrypt per-file keys. Property (2) precludes the use of the existing DIRECT_KEY fscrypt policy flag, which needs at least 192 IV bits. Therefore, add a new fscrypt policy flag IV_INO_LBLK_64 which causes the encryption to modified as follows: - The encryption keys are derived from the master key, encryption mode number, and filesystem UUID. - The IVs are chosen as (inode_number << 32) | file_logical_block_num. For filenames encryption, file_logical_block_num is 0. Since the file nonces aren't used in the key derivation, many files may share the same encryption key. This is much more efficient on the target hardware. Including the inode number in the IVs and mixing the filesystem UUID into the keys ensures that data in different files is nevertheless still encrypted differently. Additionally, limiting the inode and block numbers to 32 bits and placing the block number in the low bits maintains compatibility with the 64-bit DUN convention (property (2) above). Since this scheme assumes that inode numbers are stable (which may preclude filesystem shrinking) and that inode and file logical block numbers are at most 32-bit, IV_INO_LBLK_64 will only be allowed on filesystems that meet these constraints. These are acceptable limitations for the cases where this format would actually be used. Note that IV_INO_LBLK_64 is an on-disk format, not an implementation. This patch just adds support for it using the existing filesystem layer encryption. A later patch will add support for inline encryption. Reviewed-by: Paul Crowley Co-developed-by: Satya Tangirala Signed-off-by: Satya Tangirala Signed-off-by: Eric Biggers --- include/uapi/linux/fscrypt.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/fscrypt.h b/include/uapi/linux/fscrypt.h index 39ccfe9311c3..1beb174ad950 100644 --- a/include/uapi/linux/fscrypt.h +++ b/include/uapi/linux/fscrypt.h @@ -17,7 +17,8 @@ #define FSCRYPT_POLICY_FLAGS_PAD_32 0x03 #define FSCRYPT_POLICY_FLAGS_PAD_MASK 0x03 #define FSCRYPT_POLICY_FLAG_DIRECT_KEY 0x04 -#define FSCRYPT_POLICY_FLAGS_VALID 0x07 +#define FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64 0x08 +#define FSCRYPT_POLICY_FLAGS_VALID 0x0F /* Encryption algorithms */ #define FSCRYPT_MODE_AES_256_XTS 1 -- cgit v1.2.3 From 4ece477870774698e6e73d5821a3dd1605ca123b Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 6 Nov 2019 17:01:05 +0800 Subject: lwtunnel: add options setting and dumping for geneve To add options setting and dumping, .build_state(), .fill_encap() and .get_encap_size() in ip_tun_lwt_ops needs to be extended: ip_tun_build_state(): ip_tun_parse_opts(): ip_tun_parse_opts_geneve() ip_tun_fill_encap_info(): ip_tun_fill_encap_opts(): ip_tun_fill_encap_opts_geneve() ip_tun_encap_nlsize() ip_tun_opts_nlsize(): if (tun_flags & TUNNEL_GENEVE_OPT) ip_tun_parse_opts(), ip_tun_fill_encap_opts() and ip_tun_opts_nlsize() processes LWTUNNEL_IP_OPTS. ip_tun_parse_opts_geneve(), ip_tun_fill_encap_opts_geneve() and if (tun_flags & TUNNEL_GENEVE_OPT) processes LWTUNNEL_IP_OPTS_GENEVE. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/uapi/linux/lwtunnel.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h index de696ca12f2c..b595ab219036 100644 --- a/include/uapi/linux/lwtunnel.h +++ b/include/uapi/linux/lwtunnel.h @@ -27,6 +27,7 @@ enum lwtunnel_ip_t { LWTUNNEL_IP_TOS, LWTUNNEL_IP_FLAGS, LWTUNNEL_IP_PAD, + LWTUNNEL_IP_OPTS, __LWTUNNEL_IP_MAX, }; @@ -41,11 +42,30 @@ enum lwtunnel_ip6_t { LWTUNNEL_IP6_TC, LWTUNNEL_IP6_FLAGS, LWTUNNEL_IP6_PAD, + LWTUNNEL_IP6_OPTS, __LWTUNNEL_IP6_MAX, }; #define LWTUNNEL_IP6_MAX (__LWTUNNEL_IP6_MAX - 1) +enum { + LWTUNNEL_IP_OPTS_UNSPEC, + LWTUNNEL_IP_OPTS_GENEVE, + __LWTUNNEL_IP_OPTS_MAX, +}; + +#define LWTUNNEL_IP_OPTS_MAX (__LWTUNNEL_IP_OPTS_MAX - 1) + +enum { + LWTUNNEL_IP_OPT_GENEVE_UNSPEC, + LWTUNNEL_IP_OPT_GENEVE_CLASS, + LWTUNNEL_IP_OPT_GENEVE_TYPE, + LWTUNNEL_IP_OPT_GENEVE_DATA, + __LWTUNNEL_IP_OPT_GENEVE_MAX, +}; + +#define LWTUNNEL_IP_OPT_GENEVE_MAX (__LWTUNNEL_IP_OPT_GENEVE_MAX - 1) + enum { LWT_BPF_PROG_UNSPEC, LWT_BPF_PROG_FD, -- cgit v1.2.3 From edf31cbb1502481da181a09148adb33e12599185 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 6 Nov 2019 17:01:06 +0800 Subject: lwtunnel: add options setting and dumping for vxlan Based on the code framework built on the last patch, to support setting and dumping for vxlan, we only need to add ip_tun_parse_opts_vxlan() for .build_state and ip_tun_fill_encap_opts_vxlan() for .fill_encap and if (tun_flags & TUNNEL_VXLAN_OPT) for .get_encap_size. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/uapi/linux/lwtunnel.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h index b595ab219036..638b7b108453 100644 --- a/include/uapi/linux/lwtunnel.h +++ b/include/uapi/linux/lwtunnel.h @@ -51,6 +51,7 @@ enum lwtunnel_ip6_t { enum { LWTUNNEL_IP_OPTS_UNSPEC, LWTUNNEL_IP_OPTS_GENEVE, + LWTUNNEL_IP_OPTS_VXLAN, __LWTUNNEL_IP_OPTS_MAX, }; @@ -66,6 +67,14 @@ enum { #define LWTUNNEL_IP_OPT_GENEVE_MAX (__LWTUNNEL_IP_OPT_GENEVE_MAX - 1) +enum { + LWTUNNEL_IP_OPT_VXLAN_UNSPEC, + LWTUNNEL_IP_OPT_VXLAN_GBP, + __LWTUNNEL_IP_OPT_VXLAN_MAX, +}; + +#define LWTUNNEL_IP_OPT_VXLAN_MAX (__LWTUNNEL_IP_OPT_VXLAN_MAX - 1) + enum { LWT_BPF_PROG_UNSPEC, LWT_BPF_PROG_FD, -- cgit v1.2.3 From b0a21810bd5e1f92e3379899cc8ca9fe144ee8b3 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 6 Nov 2019 17:01:07 +0800 Subject: lwtunnel: add options setting and dumping for erspan Based on the code framework built on the last patch, to support setting and dumping for vxlan, we only need to add ip_tun_parse_opts_erspan() for .build_state and ip_tun_fill_encap_opts_erspan() for .fill_encap and if (tun_flags & TUNNEL_ERSPAN_OPT) for .get_encap_size. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/uapi/linux/lwtunnel.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h index 638b7b108453..f6035f737193 100644 --- a/include/uapi/linux/lwtunnel.h +++ b/include/uapi/linux/lwtunnel.h @@ -52,6 +52,7 @@ enum { LWTUNNEL_IP_OPTS_UNSPEC, LWTUNNEL_IP_OPTS_GENEVE, LWTUNNEL_IP_OPTS_VXLAN, + LWTUNNEL_IP_OPTS_ERSPAN, __LWTUNNEL_IP_OPTS_MAX, }; @@ -75,6 +76,17 @@ enum { #define LWTUNNEL_IP_OPT_VXLAN_MAX (__LWTUNNEL_IP_OPT_VXLAN_MAX - 1) +enum { + LWTUNNEL_IP_OPT_ERSPAN_UNSPEC, + LWTUNNEL_IP_OPT_ERSPAN_VER, + LWTUNNEL_IP_OPT_ERSPAN_INDEX, + LWTUNNEL_IP_OPT_ERSPAN_DIR, + LWTUNNEL_IP_OPT_ERSPAN_HWID, + __LWTUNNEL_IP_OPT_ERSPAN_MAX, +}; + +#define LWTUNNEL_IP_OPT_ERSPAN_MAX (__LWTUNNEL_IP_OPT_ERSPAN_MAX - 1) + enum { LWT_BPF_PROG_UNSPEC, LWT_BPF_PROG_FD, -- cgit v1.2.3 From e876df1fe0ad1b191284ee6ed2db7960bd322d00 Mon Sep 17 00:00:00 2001 From: Ajay Joshi Date: Sun, 27 Oct 2019 23:05:46 +0900 Subject: block: add zone open, close and finish ioctl support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce three new ioctl commands BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE to allow applications to control the condition of zones on a zoned block device through the execution of the REQ_OP_ZONE_OPEN, REQ_OP_ZONE_CLOSE and REQ_OP_ZONE_FINISH operations. Contains contributions from Matias Bjorling, Hans Holmberg, Dmitry Fomichev, Keith Busch, Damien Le Moal and Christoph Hellwig. Reviewed-by: Javier González Reviewed-by: Christoph Hellwig Signed-off-by: Ajay Joshi Signed-off-by: Matias Bjorling Signed-off-by: Hans Holmberg Signed-off-by: Dmitry Fomichev Signed-off-by: Keith Busch Signed-off-by: Damien Le Moal Signed-off-by: Jens Axboe --- include/uapi/linux/blkzoned.h | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h index 498eec813494..0cdef67135f0 100644 --- a/include/uapi/linux/blkzoned.h +++ b/include/uapi/linux/blkzoned.h @@ -120,9 +120,11 @@ struct blk_zone_report { }; /** - * struct blk_zone_range - BLKRESETZONE ioctl request - * @sector: starting sector of the first zone to issue reset write pointer - * @nr_sectors: Total number of sectors of 1 or more zones to reset + * struct blk_zone_range - BLKRESETZONE/BLKOPENZONE/ + * BLKCLOSEZONE/BLKFINISHZONE ioctl + * requests + * @sector: Starting sector of the first zone to operate on. + * @nr_sectors: Total number of sectors of all zones to operate on. */ struct blk_zone_range { __u64 sector; @@ -139,10 +141,19 @@ struct blk_zone_range { * sector range. The sector range must be zone aligned. * @BLKGETZONESZ: Get the device zone size in number of 512 B sectors. * @BLKGETNRZONES: Get the total number of zones of the device. + * @BLKOPENZONE: Open the zones in the specified sector range. + * The 512 B sector range must be zone aligned. + * @BLKCLOSEZONE: Close the zones in the specified sector range. + * The 512 B sector range must be zone aligned. + * @BLKFINISHZONE: Mark the zones as full in the specified sector range. + * The 512 B sector range must be zone aligned. */ #define BLKREPORTZONE _IOWR(0x12, 130, struct blk_zone_report) #define BLKRESETZONE _IOW(0x12, 131, struct blk_zone_range) #define BLKGETZONESZ _IOR(0x12, 132, __u32) #define BLKGETNRZONES _IOR(0x12, 133, __u32) +#define BLKOPENZONE _IOW(0x12, 134, struct blk_zone_range) +#define BLKCLOSEZONE _IOW(0x12, 135, struct blk_zone_range) +#define BLKFINISHZONE _IOW(0x12, 136, struct blk_zone_range) #endif /* _UAPI_BLKZONED_H */ -- cgit v1.2.3 From 2665abfd757fb35a241c6f0b1ebf620e3ffb36fb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 5 Nov 2019 12:40:47 -0700 Subject: io_uring: add support for linked SQE timeouts While we have support for generic timeouts, we don't have a way to tie a timeout to a specific SQE. The generic timeouts simply trigger wakeups on the CQ ring. This adds support for IORING_OP_LINK_TIMEOUT. This command is only valid as a link to a previous command. The timeout specific can be either relative or absolute, following the same rules as IORING_OP_TIMEOUT. If the timeout triggers before the dependent command completes, it will attempt to cancel that command. Likewise, if the dependent command completes before the timeout triggers, it will cancel the timeout. Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 6877cf8894db..f1a118b01d18 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -72,6 +72,7 @@ struct io_uring_sqe { #define IORING_OP_TIMEOUT_REMOVE 12 #define IORING_OP_ACCEPT 13 #define IORING_OP_ASYNC_CANCEL 14 +#define IORING_OP_LINK_TIMEOUT 15 /* * sqe->fsync_flags -- cgit v1.2.3 From 1d7bb1d50fb4dc141c7431cc21fdd24ffcc83c76 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 6 Nov 2019 11:31:17 -0700 Subject: io_uring: add support for backlogged CQ ring Currently we drop completion events, if the CQ ring is full. That's fine for requests with bounded completion times, but it may make it harder or impossible to use io_uring with networked IO where request completion times are generally unbounded. Or with POLL, for example, which is also unbounded. After this patch, we never overflow the ring, we simply store requests in a backlog for later flushing. This flushing is done automatically by the kernel. To prevent the backlog from growing indefinitely, if the backlog is non-empty, we apply back pressure on IO submissions. Any attempt to submit new IO with a non-empty backlog will get an -EBUSY return from the kernel. This is a signal to the application that it has backlogged CQ events, and that it must reap those before being allowed to submit more IO. Note that if we do return -EBUSY, we will have filled whatever backlogged events into the CQ ring first, if there's room. This means the application can safely reap events WITHOUT entering the kernel and waiting for them, they are already available in the CQ ring. Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index f1a118b01d18..2a1569211d87 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -155,6 +155,7 @@ struct io_uring_params { * io_uring_params->features flags */ #define IORING_FEAT_SINGLE_MMAP (1U << 0) +#define IORING_FEAT_NODROP (1U << 1) /* * io_uring_register(2) opcodes and arguments -- cgit v1.2.3 From 14f34e36b36ceede9877ca422a62fcac17b52023 Mon Sep 17 00:00:00 2001 From: Gurumoorthi Gnanasambandhan Date: Thu, 31 Oct 2019 23:46:40 +0200 Subject: cfg80211: VLAN offload support for set_key and set_sta_vlan This provides an alternative mechanism for AP VLAN support where a single netdev is used with VLAN tagged frames instead of separate netdevs for each VLAN without tagged frames from the WLAN driver. By setting NL80211_EXT_FEATURE_VLAN_OFFLOAD flag the driver indicates support for a single netdev with VLAN tagged frames. Separate VLAN-specific netdevs can be added using RTM_NEWLINK/IFLA_VLAN_ID similarly to Ethernet. NL80211_CMD_NEW_KEY (for group keys), NL80211_CMD_NEW_STATION, and NL80211_CMD_SET_STATION will optionally specify vlan_id using NL80211_ATTR_VLAN_ID. Signed-off-by: Gurumoorthi Gnanasambandhan Signed-off-by: Jouni Malinen Link: https://lore.kernel.org/r/20191031214640.5012-1-jouni@codeaurora.org Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 64135ab3a7ac..341e0e8cae46 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -248,6 +248,22 @@ * %NL80211_ATTR_SAE_PASSWORD. */ +/** + * DOC: VLAN offload support for setting group keys and binding STAs to VLANs + * + * By setting @NL80211_EXT_FEATURE_VLAN_OFFLOAD flag drivers can indicate they + * support offloading VLAN functionality in a manner where the driver exposes a + * single netdev that uses VLAN tagged frames and separate VLAN-specific netdevs + * can then be added using RTM_NEWLINK/IFLA_VLAN_ID similarly to the Ethernet + * case. Frames received from stations that are not assigned to any VLAN are + * delivered on the main netdev and frames to such stations can be sent through + * that main netdev. + * + * %NL80211_CMD_NEW_KEY (for group keys), %NL80211_CMD_NEW_STATION, and + * %NL80211_CMD_SET_STATION will optionally specify vlan_id using + * %NL80211_ATTR_VLAN_ID. + */ + /** * enum nl80211_commands - supported nl80211 commands * @@ -2381,6 +2397,9 @@ enum nl80211_commands { * the allowed channel bandwidth configurations. (u8 attribute) * Defined by IEEE P802.11ay/D4.0 section 9.4.2.251, Table 13. * + * @NL80211_ATTR_VLAN_ID: VLAN ID (1..4094) for the station and VLAN group key + * (u16). + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -2843,6 +2862,8 @@ enum nl80211_attrs { NL80211_ATTR_WIPHY_EDMG_CHANNELS, NL80211_ATTR_WIPHY_EDMG_BW_CONFIG, + NL80211_ATTR_VLAN_ID, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -5492,6 +5513,10 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_SAE_OFFLOAD: Device wants to do SAE authentication in * station mode (SAE password is passed as part of the connect command). * + * @NL80211_EXT_FEATURE_VLAN_OFFLOAD: The driver supports a single netdev + * with VLAN tagged frames and separate VLAN-specific netdevs added using + * vconfig similarly to the Ethernet case. + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -5537,6 +5562,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_EXT_KEY_ID, NL80211_EXT_FEATURE_STA_TX_PWR, NL80211_EXT_FEATURE_SAE_OFFLOAD, + NL80211_EXT_FEATURE_VLAN_OFFLOAD, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, -- cgit v1.2.3 From 134bdac397661a5841d9f27f508190c68b26232b Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Fri, 8 Nov 2019 12:05:10 +0700 Subject: tipc: add new AEAD key structure for user API The new structure 'tipc_aead_key' is added to the 'tipc.h' for user to be able to transfer a key to TIPC in kernel. Netlink will be used for this purpose in the later commits. Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: Tuong Lien Signed-off-by: David S. Miller --- include/uapi/linux/tipc.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h index 76421b878767..add01db1daef 100644 --- a/include/uapi/linux/tipc.h +++ b/include/uapi/linux/tipc.h @@ -233,6 +233,27 @@ struct tipc_sioc_nodeid_req { char node_id[TIPC_NODEID_LEN]; }; +/* + * TIPC Crypto, AEAD + */ +#define TIPC_AEAD_ALG_NAME (32) + +struct tipc_aead_key { + char alg_name[TIPC_AEAD_ALG_NAME]; + unsigned int keylen; /* in bytes */ + char key[]; +}; + +#define TIPC_AEAD_KEYLEN_MIN (16 + 4) +#define TIPC_AEAD_KEYLEN_MAX (32 + 4) +#define TIPC_AEAD_KEY_SIZE_MAX (sizeof(struct tipc_aead_key) + \ + TIPC_AEAD_KEYLEN_MAX) + +static inline int tipc_aead_key_size(struct tipc_aead_key *key) +{ + return sizeof(*key) + key->keylen; +} + /* The macros and functions below are deprecated: */ -- cgit v1.2.3 From e1f32190cf7ddd55778b460e7d44af3f76529698 Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Fri, 8 Nov 2019 12:05:12 +0700 Subject: tipc: add support for AEAD key setting via netlink This commit adds two netlink commands to TIPC in order for user to be able to set or remove AEAD keys: - TIPC_NL_KEY_SET - TIPC_NL_KEY_FLUSH When the 'KEY_SET' is given along with the key data, the key will be initiated and attached to TIPC crypto. On the other hand, the 'KEY_FLUSH' command will remove all existing keys if any. Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: Tuong Lien Signed-off-by: David S. Miller --- include/uapi/linux/tipc_netlink.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h index efb958fd167d..6c2194ab745b 100644 --- a/include/uapi/linux/tipc_netlink.h +++ b/include/uapi/linux/tipc_netlink.h @@ -63,6 +63,8 @@ enum { TIPC_NL_PEER_REMOVE, TIPC_NL_BEARER_ADD, TIPC_NL_UDP_GET_REMOTEIP, + TIPC_NL_KEY_SET, + TIPC_NL_KEY_FLUSH, __TIPC_NL_CMD_MAX, TIPC_NL_CMD_MAX = __TIPC_NL_CMD_MAX - 1 @@ -160,6 +162,8 @@ enum { TIPC_NLA_NODE_UNSPEC, TIPC_NLA_NODE_ADDR, /* u32 */ TIPC_NLA_NODE_UP, /* flag */ + TIPC_NLA_NODE_ID, /* data */ + TIPC_NLA_NODE_KEY, /* data */ __TIPC_NLA_NODE_MAX, TIPC_NLA_NODE_MAX = __TIPC_NLA_NODE_MAX - 1 -- cgit v1.2.3 From aef587be42925f92418083f08852d0011b2766ca Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 8 Nov 2019 13:20:32 +0800 Subject: sctp: add pf_expose per netns and sock and asoc As said in rfc7829, section 3, point 12: The SCTP stack SHOULD expose the PF state of its destination addresses to the ULP as well as provide the means to notify the ULP of state transitions of its destination addresses from active to PF, and vice versa. However, it is recommended that an SCTP stack implementing SCTP-PF also allows for the ULP to be kept ignorant of the PF state of its destinations and the associated state transitions, thus allowing for retention of the simpler state transition model of [RFC4960] in the ULP. Not only does it allow to expose the PF state to ULP, but also allow to ignore sctp-pf to ULP. So this patch is to add pf_expose per netns, sock and asoc. And in sctp_assoc_control_transport(), ulp_notify will be set to false if asoc->expose is not 'enabled' in next patch. It also allows a user to change pf_expose per netns by sysctl, and pf_expose per sock and asoc will be initialized with it. Note that pf_expose also works for SCTP_GET_PEER_ADDR_INFO sockopt, to not allow a user to query the state of a sctp-pf peer address when pf_expose is 'disabled', as said in section 7.3. v1->v2: - Fix a build warning noticed by Nathan Chancellor. v2->v3: - set pf_expose to UNUSED by default to keep compatible with old applications. v3->v4: - add a new entry for pf_expose on ip-sysctl.txt, as Marcelo suggested. - change this patch to 1/5, and move sctp_assoc_control_transport change into 2/5, as Marcelo suggested. - use SCTP_PF_EXPOSE_UNSET instead of SCTP_PF_EXPOSE_UNUSED, and set SCTP_PF_EXPOSE_UNSET to 0 in enum, as Marcelo suggested. Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/uapi/linux/sctp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 6bce7f9837a9..765f41a080b4 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -933,6 +933,7 @@ struct sctp_paddrinfo { enum sctp_spinfo_state { SCTP_INACTIVE, SCTP_PF, +#define SCTP_POTENTIALLY_FAILED SCTP_PF SCTP_ACTIVE, SCTP_UNCONFIRMED, SCTP_UNKNOWN = 0xffff /* Value used for transport state unknown */ -- cgit v1.2.3 From 768e15182dcb809e39c338290dda10c4e271d133 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 8 Nov 2019 13:20:33 +0800 Subject: sctp: add SCTP_ADDR_POTENTIALLY_FAILED notification SCTP Quick failover draft section 5.1, point 5 has been removed from rfc7829. Instead, "the sender SHOULD (i) notify the Upper Layer Protocol (ULP) about this state transition", as said in section 3.2, point 8. So this patch is to add SCTP_ADDR_POTENTIALLY_FAILED, defined in section 7.1, "which is reported if the affected address becomes PF". Also remove transport cwnd's update when moving from PF back to ACTIVE , which is no longer in rfc7829 either. Note that ulp_notify will be set to false if asoc->expose is not 'enabled', according to last patch. v2->v3: - define SCTP_ADDR_PF SCTP_ADDR_POTENTIALLY_FAILED. v3->v4: - initialize spc_state with SCTP_ADDR_AVAILABLE, as Marcelo suggested. - check asoc->pf_expose in sctp_assoc_control_transport(), as Marcelo suggested. Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/uapi/linux/sctp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 765f41a080b4..d99b428ac34e 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -410,6 +410,8 @@ enum sctp_spc_state { SCTP_ADDR_ADDED, SCTP_ADDR_MADE_PRIM, SCTP_ADDR_CONFIRMED, + SCTP_ADDR_POTENTIALLY_FAILED, +#define SCTP_ADDR_PF SCTP_ADDR_POTENTIALLY_FAILED }; -- cgit v1.2.3 From 8d2a6935d842f12c25611b165eace778adb09a53 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 8 Nov 2019 13:20:34 +0800 Subject: sctp: add SCTP_EXPOSE_POTENTIALLY_FAILED_STATE sockopt This is a sockopt defined in section 7.3 of rfc7829: "Exposing the Potentially Failed Path State", by which users can change pf_expose per sock and asoc. The new sockopt SCTP_EXPOSE_POTENTIALLY_FAILED_STATE is also known as SCTP_EXPOSE_PF_STATE for short. v2->v3: - return -EINVAL if params.assoc_value > SCTP_PF_EXPOSE_MAX. - define SCTP_EXPOSE_PF_STATE SCTP_EXPOSE_POTENTIALLY_FAILED_STATE. v3->v4: - improve changelog. Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/uapi/linux/sctp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index d99b428ac34e..a190e4a7f546 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -137,6 +137,8 @@ typedef __s32 sctp_assoc_t; #define SCTP_ASCONF_SUPPORTED 128 #define SCTP_AUTH_SUPPORTED 129 #define SCTP_ECN_SUPPORTED 130 +#define SCTP_EXPOSE_POTENTIALLY_FAILED_STATE 131 +#define SCTP_EXPOSE_PF_STATE SCTP_EXPOSE_POTENTIALLY_FAILED_STATE /* PR-SCTP policies */ #define SCTP_PR_SCTP_NONE 0x0000 -- cgit v1.2.3 From d467ac0a38551a5904878b1f5a2fe20a040c0e11 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 8 Nov 2019 13:20:36 +0800 Subject: sctp: add SCTP_PEER_ADDR_THLDS_V2 sockopt Section 7.2 of rfc7829: "Peer Address Thresholds (SCTP_PEER_ADDR_THLDS) Socket Option" extends 'struct sctp_paddrthlds' with 'spt_pathcpthld' added to allow a user to change ps_retrans per sock/asoc/transport, as other 2 paddrthlds: pf_retrans, pathmaxrxt. Note: to not break the user's program, here to support pf_retrans dump and setting by adding a new sockopt SCTP_PEER_ADDR_THLDS_V2, and a new structure sctp_paddrthlds_v2 instead of extending sctp_paddrthlds. Also, when setting ps_retrans, the value is not allowed to be greater than pf_retrans. v1->v2: - use SCTP_PEER_ADDR_THLDS_V2 to set/get pf_retrans instead, as Marcelo and David Laight suggested. Signed-off-by: Xin Long Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/uapi/linux/sctp.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index a190e4a7f546..28ad40d9acba 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -105,6 +105,7 @@ typedef __s32 sctp_assoc_t; #define SCTP_DEFAULT_SNDINFO 34 #define SCTP_AUTH_DEACTIVATE_KEY 35 #define SCTP_REUSE_PORT 36 +#define SCTP_PEER_ADDR_THLDS_V2 37 /* Internal Socket Options. Some of the sctp library functions are * implemented using these socket options. @@ -1087,6 +1088,15 @@ struct sctp_paddrthlds { __u16 spt_pathpfthld; }; +/* Use a new structure with spt_pathcpthld for back compatibility */ +struct sctp_paddrthlds_v2 { + sctp_assoc_t spt_assoc_id; + struct sockaddr_storage spt_address; + __u16 spt_pathmaxrxt; + __u16 spt_pathpfthld; + __u16 spt_pathcpthld; +}; + /* * Socket Option for Getting the Association/Stream-Specific PR-SCTP Status */ -- cgit v1.2.3 From 8bb69f3b2918788435cbd5834c66682642c09fba Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 12 Nov 2019 00:29:55 +0100 Subject: netfilter: nf_tables: add flowtable offload control plane This patch adds the NFTA_FLOWTABLE_FLAGS attribute that allows users to specify the NF_FLOWTABLE_HW_OFFLOAD flag. This patch also adds a new setup interface for the flowtable type to perform the flowtable offload block callback configuration. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 81fed16fe2b2..bb9b049310df 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1518,6 +1518,7 @@ enum nft_object_attributes { * @NFTA_FLOWTABLE_HOOK: netfilter hook configuration(NLA_U32) * @NFTA_FLOWTABLE_USE: number of references to this flow table (NLA_U32) * @NFTA_FLOWTABLE_HANDLE: object handle (NLA_U64) + * @NFTA_FLOWTABLE_FLAGS: flags (NLA_U32) */ enum nft_flowtable_attributes { NFTA_FLOWTABLE_UNSPEC, @@ -1527,6 +1528,7 @@ enum nft_flowtable_attributes { NFTA_FLOWTABLE_USE, NFTA_FLOWTABLE_HANDLE, NFTA_FLOWTABLE_PAD, + NFTA_FLOWTABLE_FLAGS, __NFTA_FLOWTABLE_MAX }; #define NFTA_FLOWTABLE_MAX (__NFTA_FLOWTABLE_MAX - 1) -- cgit v1.2.3 From 3ad2522c64cff1f5aebb987b00683268f0cc7c29 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 29 Oct 2019 13:41:38 -0700 Subject: statx: define STATX_ATTR_VERITY Add a statx attribute bit STATX_ATTR_VERITY which will be set if the file has fs-verity enabled. This is the statx() equivalent of FS_VERITY_FL which is returned by FS_IOC_GETFLAGS. This is useful because it allows applications to check whether a file is a verity file without opening it. Opening a verity file can be expensive because the fsverity_info is set up on open, which involves parsing metadata and optionally verifying a cryptographic signature. This is analogous to how various other bits are exposed through both FS_IOC_GETFLAGS and statx(), e.g. the encrypt bit. Reviewed-by: Andreas Dilger Acked-by: Darrick J. Wong Signed-off-by: Eric Biggers --- include/uapi/linux/stat.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h index 7b35e98d3c58..ad80a5c885d5 100644 --- a/include/uapi/linux/stat.h +++ b/include/uapi/linux/stat.h @@ -167,8 +167,8 @@ struct statx { #define STATX_ATTR_APPEND 0x00000020 /* [I] File is append-only */ #define STATX_ATTR_NODUMP 0x00000040 /* [I] File is not to be dumped */ #define STATX_ATTR_ENCRYPTED 0x00000800 /* [I] File requires key to decrypt in fs */ - #define STATX_ATTR_AUTOMOUNT 0x00001000 /* Dir: Automount trigger */ +#define STATX_ATTR_VERITY 0x00100000 /* [I] Verity protected file */ #endif /* _UAPI_LINUX_STAT_H */ -- cgit v1.2.3 From bd1903b7c4596ba6f7677d0dfefd05ba5876707d Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Wed, 13 Nov 2019 23:04:49 +0800 Subject: net: openvswitch: add hash info to upcall When using the kernel datapath, the upcall don't include skb hash info relatived. That will introduce some problem, because the hash of skb is important in kernel stack. For example, VXLAN module uses it to select UDP src port. The tx queue selection may also use the hash in stack. Hash is computed in different ways. Hash is random for a TCP socket, and hash may be computed in hardware, or software stack. Recalculation hash is not easy. Hash of TCP socket is computed: tcp_v4_connect -> sk_set_txhash (is random) __tcp_transmit_skb -> skb_set_hash_from_sk There will be one upcall, without information of skb hash, to ovs-vswitchd, for the first packet of a TCP session. The rest packets will be processed in Open vSwitch modules, hash kept. If this tcp session is forward to VXLAN module, then the UDP src port of first tcp packet is different from rest packets. TCP packets may come from the host or dockers, to Open vSwitch. To fix it, we store the hash info to upcall, and restore hash when packets sent back. +---------------+ +-------------------------+ | Docker/VMs | | ovs-vswitchd | +----+----------+ +-+--------------------+--+ | ^ | | | | | | upcall v restore packet hash (not recalculate) | +-+--------------------+--+ | tap netdev | | vxlan module +---------------> +--> Open vSwitch ko +--> or internal type | | +-------------------------+ Reported-at: https://mail.openvswitch.org/pipermail/ovs-dev/2019-October/364062.html Signed-off-by: Tonghao Zhang Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 1887a451c388..a87b44cd5590 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -173,6 +173,7 @@ enum ovs_packet_cmd { * @OVS_PACKET_ATTR_LEN: Packet size before truncation. * %OVS_PACKET_ATTR_USERSPACE action specify the Maximum received fragment * size. + * @OVS_PACKET_ATTR_HASH: Packet hash info (e.g. hash, sw_hash and l4_hash in skb). * * These attributes follow the &struct ovs_header within the Generic Netlink * payload for %OVS_PACKET_* commands. @@ -190,7 +191,8 @@ enum ovs_packet_attr { OVS_PACKET_ATTR_PROBE, /* Packet operation is a feature probe, error logging should be suppressed. */ OVS_PACKET_ATTR_MRU, /* Maximum received IP fragment size. */ - OVS_PACKET_ATTR_LEN, /* Packet size before truncation. */ + OVS_PACKET_ATTR_LEN, /* Packet size before truncation. */ + OVS_PACKET_ATTR_HASH, /* Packet hash. */ __OVS_PACKET_ATTR_MAX }; -- cgit v1.2.3 From fec56f5890d93fc2ed74166c397dc186b1c25951 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Nov 2019 10:57:04 -0800 Subject: bpf: Introduce BPF trampoline Introduce BPF trampoline concept to allow kernel code to call into BPF programs with practically zero overhead. The trampoline generation logic is architecture dependent. It's converting native calling convention into BPF calling convention. BPF ISA is 64-bit (even on 32-bit architectures). The registers R1 to R5 are used to pass arguments into BPF functions. The main BPF program accepts only single argument "ctx" in R1. Whereas CPU native calling convention is different. x86-64 is passing first 6 arguments in registers and the rest on the stack. x86-32 is passing first 3 arguments in registers. sparc64 is passing first 6 in registers. And so on. The trampolines between BPF and kernel already exist. BPF_CALL_x macros in include/linux/filter.h statically compile trampolines from BPF into kernel helpers. They convert up to five u64 arguments into kernel C pointers and integers. On 64-bit architectures this BPF_to_kernel trampolines are nops. On 32-bit architecture they're meaningful. The opposite job kernel_to_BPF trampolines is done by CAST_TO_U64 macros and __bpf_trace_##call() shim functions in include/trace/bpf_probe.h. They convert kernel function arguments into array of u64s that BPF program consumes via R1=ctx pointer. This patch set is doing the same job as __bpf_trace_##call() static trampolines, but dynamically for any kernel function. There are ~22k global kernel functions that are attachable via nop at function entry. The function arguments and types are described in BTF. The job of btf_distill_func_proto() function is to extract useful information from BTF into "function model" that architecture dependent trampoline generators will use to generate assembly code to cast kernel function arguments into array of u64s. For example the kernel function eth_type_trans has two pointers. They will be casted to u64 and stored into stack of generated trampoline. The pointer to that stack space will be passed into BPF program in R1. On x86-64 such generated trampoline will consume 16 bytes of stack and two stores of %rdi and %rsi into stack. The verifier will make sure that only two u64 are accessed read-only by BPF program. The verifier will also recognize the precise type of the pointers being accessed and will not allow typecasting of the pointer to a different type within BPF program. The tracing use case in the datacenter demonstrated that certain key kernel functions have (like tcp_retransmit_skb) have 2 or more kprobes that are always active. Other functions have both kprobe and kretprobe. So it is essential to keep both kernel code and BPF programs executing at maximum speed. Hence generated BPF trampoline is re-generated every time new program is attached or detached to maintain maximum performance. To avoid the high cost of retpoline the attached BPF programs are called directly. __bpf_prog_enter/exit() are used to support per-program execution stats. In the future this logic will be optimized further by adding support for bpf_stats_enabled_key inside generated assembly code. Introduction of preemptible and sleepable BPF programs will completely remove the need to call to __bpf_prog_enter/exit(). Detach of a BPF program from the trampoline should not fail. To avoid memory allocation in detach path the half of the page is used as a reserve and flipped after each attach/detach. 2k bytes is enough to call 40+ BPF programs directly which is enough for BPF tracing use cases. This limit can be increased in the future. BPF_TRACE_FENTRY programs have access to raw kernel function arguments while BPF_TRACE_FEXIT programs have access to kernel return value as well. Often kprobe BPF program remembers function arguments in a map while kretprobe fetches arguments from a map and analyzes them together with return value. BPF_TRACE_FEXIT accelerates this typical use case. Recursion prevention for kprobe BPF programs is done via per-cpu bpf_prog_active counter. In practice that turned out to be a mistake. It caused programs to randomly skip execution. The tracing tools missed results they were looking for. Hence BPF trampoline doesn't provide builtin recursion prevention. It's a job of BPF program itself and will be addressed in the follow up patches. BPF trampoline is intended to be used beyond tracing and fentry/fexit use cases in the future. For example to remove retpoline cost from XDP programs. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20191114185720.1641606-5-ast@kernel.org --- include/uapi/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index df6809a76404..69c200e6e696 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -201,6 +201,8 @@ enum bpf_attach_type { BPF_CGROUP_GETSOCKOPT, BPF_CGROUP_SETSOCKOPT, BPF_TRACE_RAW_TP, + BPF_TRACE_FENTRY, + BPF_TRACE_FEXIT, __MAX_BPF_ATTACH_TYPE }; -- cgit v1.2.3 From 5b92a28aae4dd0f88778d540ecfdcdaec5a41723 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Nov 2019 10:57:17 -0800 Subject: bpf: Support attaching tracing BPF program to other BPF programs Allow FENTRY/FEXIT BPF programs to attach to other BPF programs of any type including their subprograms. This feature allows snooping on input and output packets in XDP, TC programs including their return values. In order to do that the verifier needs to track types not only of vmlinux, but types of other BPF programs as well. The verifier also needs to translate uapi/linux/bpf.h types used by networking programs into kernel internal BTF types used by FENTRY/FEXIT BPF programs. In some cases LLVM optimizations can remove arguments from BPF subprograms without adjusting BTF info that LLVM backend knows. When BTF info disagrees with actual types that the verifiers sees the BPF trampoline has to fallback to conservative and treat all arguments as u64. The FENTRY/FEXIT program can still attach to such subprograms, but it won't be able to recognize pointer types like 'struct sk_buff *' and it won't be able to pass them to bpf_skb_output() for dumping packets to user space. The FENTRY/FEXIT program would need to use bpf_probe_read_kernel() instead. The BPF_PROG_LOAD command is extended with attach_prog_fd field. When it's set to zero the attach_btf_id is one vmlinux BTF type ids. When attach_prog_fd points to previously loaded BPF program the attach_btf_id is BTF type id of main function or one of its subprograms. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20191114185720.1641606-18-ast@kernel.org --- include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 69c200e6e696..4842a134b202 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -425,6 +425,7 @@ union bpf_attr { __aligned_u64 line_info; /* line info */ __u32 line_info_cnt; /* number of bpf_line_info records */ __u32 attach_btf_id; /* in-kernel BTF type id to attach to */ + __u32 attach_prog_fd; /* 0 to attach to vmlinux */ }; struct { /* anonymous struct used by BPF_OBJ_* commands */ -- cgit v1.2.3 From 49cb2fc42ce4b7a656ee605e30c302efaa39c1a7 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Fri, 15 Nov 2019 13:36:20 +0100 Subject: fork: extend clone3() to support setting a PID The main motivation to add set_tid to clone3() is CRIU. To restore a process with the same PID/TID CRIU currently uses /proc/sys/kernel/ns_last_pid. It writes the desired (PID - 1) to ns_last_pid and then (quickly) does a clone(). This works most of the time, but it is racy. It is also slow as it requires multiple syscalls. Extending clone3() to support *set_tid makes it possible restore a process using CRIU without accessing /proc/sys/kernel/ns_last_pid and race free (as long as the desired PID/TID is available). This clone3() extension places the same restrictions (CAP_SYS_ADMIN) on clone3() with *set_tid as they are currently in place for ns_last_pid. The original version of this change was using a single value for set_tid. At the 2019 LPC, after presenting set_tid, it was, however, decided to change set_tid to an array to enable setting the PID of a process in multiple PID namespaces at the same time. If a process is created in a PID namespace it is possible to influence the PID inside and outside of the PID namespace. Details also in the corresponding selftest. To create a process with the following PIDs: PID NS level Requested PID 0 (host) 31496 1 42 2 1 For that example the two newly introduced parameters to struct clone_args (set_tid and set_tid_size) would need to be: set_tid[0] = 1; set_tid[1] = 42; set_tid[2] = 31496; set_tid_size = 3; If only the PIDs of the two innermost nested PID namespaces should be defined it would look like this: set_tid[0] = 1; set_tid[1] = 42; set_tid_size = 2; The PID of the newly created process would then be the next available free PID in the PID namespace level 0 (host) and 42 in the PID namespace at level 1 and the PID of the process in the innermost PID namespace would be 1. The set_tid array is used to specify the PID of a process starting from the innermost nested PID namespaces up to set_tid_size PID namespaces. set_tid_size cannot be larger then the current PID namespace level. Signed-off-by: Adrian Reber Reviewed-by: Christian Brauner Reviewed-by: Oleg Nesterov Reviewed-by: Dmitry Safonov <0x7f454c46@gmail.com> Acked-by: Andrei Vagin Link: https://lore.kernel.org/r/20191115123621.142252-1-areber@redhat.com Signed-off-by: Christian Brauner --- include/uapi/linux/sched.h | 53 ++++++++++++++++++++++++++++++---------------- 1 file changed, 35 insertions(+), 18 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 1d500ed03c63..a0b1c224c72b 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -39,24 +39,38 @@ #ifndef __ASSEMBLY__ /** * struct clone_args - arguments for the clone3 syscall - * @flags: Flags for the new process as listed above. - * All flags are valid except for CSIGNAL and - * CLONE_DETACHED. - * @pidfd: If CLONE_PIDFD is set, a pidfd will be - * returned in this argument. - * @child_tid: If CLONE_CHILD_SETTID is set, the TID of the - * child process will be returned in the child's - * memory. - * @parent_tid: If CLONE_PARENT_SETTID is set, the TID of - * the child process will be returned in the - * parent's memory. - * @exit_signal: The exit_signal the parent process will be - * sent when the child exits. - * @stack: Specify the location of the stack for the - * child process. - * @stack_size: The size of the stack for the child process. - * @tls: If CLONE_SETTLS is set, the tls descriptor - * is set to tls. + * @flags: Flags for the new process as listed above. + * All flags are valid except for CSIGNAL and + * CLONE_DETACHED. + * @pidfd: If CLONE_PIDFD is set, a pidfd will be + * returned in this argument. + * @child_tid: If CLONE_CHILD_SETTID is set, the TID of the + * child process will be returned in the child's + * memory. + * @parent_tid: If CLONE_PARENT_SETTID is set, the TID of + * the child process will be returned in the + * parent's memory. + * @exit_signal: The exit_signal the parent process will be + * sent when the child exits. + * @stack: Specify the location of the stack for the + * child process. + * @stack_size: The size of the stack for the child process. + * @tls: If CLONE_SETTLS is set, the tls descriptor + * is set to tls. + * @set_tid: Pointer to an array of type *pid_t. The size + * of the array is defined using @set_tid_size. + * This array is used to select PIDs/TIDs for + * newly created processes. The first element in + * this defines the PID in the most nested PID + * namespace. Each additional element in the array + * defines the PID in the parent PID namespace of + * the original PID namespace. If the array has + * less entries than the number of currently + * nested PID namespaces only the PIDs in the + * corresponding namespaces are set. + * @set_tid_size: This defines the size of the array referenced + * in @set_tid. This cannot be larger than the + * kernel's limit of nested PID namespaces. * * The structure is versioned by size and thus extensible. * New struct members must go at the end of the struct and @@ -71,10 +85,13 @@ struct clone_args { __aligned_u64 stack; __aligned_u64 stack_size; __aligned_u64 tls; + __aligned_u64 set_tid; + __aligned_u64 set_tid_size; }; #endif #define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */ +#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */ /* * Scheduling policies -- cgit v1.2.3 From fc9702273e2edb90400a34b3be76f7b08fa3344b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 17 Nov 2019 09:28:04 -0800 Subject: bpf: Add mmap() support for BPF_MAP_TYPE_ARRAY Add ability to memory-map contents of BPF array map. This is extremely useful for working with BPF global data from userspace programs. It allows to avoid typical bpf_map_{lookup,update}_elem operations, improving both performance and usability. There had to be special considerations for map freezing, to avoid having writable memory view into a frozen map. To solve this issue, map freezing and mmap-ing is happening under mutex now: - if map is already frozen, no writable mapping is allowed; - if map has writable memory mappings active (accounted in map->writecnt), map freezing will keep failing with -EBUSY; - once number of writable memory mappings drops to zero, map freezing can be performed again. Only non-per-CPU plain arrays are supported right now. Maps with spinlocks can't be memory mapped either. For BPF_F_MMAPABLE array, memory allocation has to be done through vmalloc() to be mmap()'able. We also need to make sure that array data memory is page-sized and page-aligned, so we over-allocate memory in such a way that struct bpf_array is at the end of a single page of memory with array->value being aligned with the start of the second page. On deallocation we need to accomodate this memory arrangement to free vmalloc()'ed memory correctly. One important consideration regarding how memory-mapping subsystem functions. Memory-mapping subsystem provides few optional callbacks, among them open() and close(). close() is called for each memory region that is unmapped, so that users can decrease their reference counters and free up resources, if necessary. open() is *almost* symmetrical: it's called for each memory region that is being mapped, **except** the very first one. So bpf_map_mmap does initial refcnt bump, while open() will do any extra ones after that. Thus number of close() calls is equal to number of open() calls plus one more. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Song Liu Acked-by: John Fastabend Acked-by: Johannes Weiner Link: https://lore.kernel.org/bpf/20191117172806.2195367-4-andriin@fb.com --- include/uapi/linux/bpf.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4842a134b202..dbbcf0b02970 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -348,6 +348,9 @@ enum bpf_attach_type { /* Clone map from listener for newly accepted socket */ #define BPF_F_CLONE (1U << 9) +/* Enable memory-mapping BPF map */ +#define BPF_F_MMAPABLE (1U << 10) + /* flags for BPF_PROG_QUERY */ #define BPF_F_QUERY_EFFECTIVE (1U << 0) -- cgit v1.2.3 From b9b1a53e180e2ab4cea42c95bf344e4bf7f524e7 Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Thu, 10 Oct 2019 15:59:58 +0800 Subject: btrfs: use enum for extent type defines Use enum to replace macro definitions of extent types. Signed-off-by: Chengguang Xu Reviewed-by: David Sterba Signed-off-by: David Sterba --- include/uapi/linux/btrfs_tree.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index b65c7ee75bc7..44136de2f5d7 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -737,10 +737,12 @@ struct btrfs_balance_item { __le64 unused[4]; } __attribute__ ((__packed__)); -#define BTRFS_FILE_EXTENT_INLINE 0 -#define BTRFS_FILE_EXTENT_REG 1 -#define BTRFS_FILE_EXTENT_PREALLOC 2 -#define BTRFS_FILE_EXTENT_TYPES 2 +enum { + BTRFS_FILE_EXTENT_INLINE = 0, + BTRFS_FILE_EXTENT_REG = 1, + BTRFS_FILE_EXTENT_PREALLOC = 2, + BTRFS_NR_FILE_EXTENT_TYPES = 3, +}; struct btrfs_file_extent_item { /* -- cgit v1.2.3 From 3951e7f050ac6a38bbc859fc3cd6093890c31d1c Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 7 Oct 2019 11:11:01 +0200 Subject: btrfs: add xxhash64 to checksumming algorithms Add xxhash64 to the list of possible checksumming algorithms used by BTRFS. Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- include/uapi/linux/btrfs_tree.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index 44136de2f5d7..962312939a8f 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -302,6 +302,7 @@ /* csum types */ enum btrfs_csum_type { BTRFS_CSUM_TYPE_CRC32 = 0, + BTRFS_CSUM_TYPE_XXHASH = 1, }; /* -- cgit v1.2.3 From 3831bf0094abed51e71cbeca8b6edf8b88c2644b Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 7 Oct 2019 11:11:02 +0200 Subject: btrfs: add sha256 to checksumming algorithm Add sha256 to the list of possible checksumming algorithms used by BTRFS. Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- include/uapi/linux/btrfs_tree.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index 962312939a8f..e2823c9b2061 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -303,6 +303,7 @@ enum btrfs_csum_type { BTRFS_CSUM_TYPE_CRC32 = 0, BTRFS_CSUM_TYPE_XXHASH = 1, + BTRFS_CSUM_TYPE_SHA256 = 2, }; /* -- cgit v1.2.3 From 352ae07b599a03b08a2149d9c2ff9c3479591af2 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 7 Oct 2019 11:11:02 +0200 Subject: btrfs: add blake2b to checksumming algorithms Add blake2b (with 256 bit digest) to the list of possible checksumming algorithms used by BTRFS. Signed-off-by: David Sterba --- include/uapi/linux/btrfs_tree.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index e2823c9b2061..5160be1d7332 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -304,6 +304,7 @@ enum btrfs_csum_type { BTRFS_CSUM_TYPE_CRC32 = 0, BTRFS_CSUM_TYPE_XXHASH = 1, BTRFS_CSUM_TYPE_SHA256 = 2, + BTRFS_CSUM_TYPE_BLAKE2 = 3, }; /* -- cgit v1.2.3 From 47e6f7423b9196ad6832d26cae52b7015f81ee7f Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 2 Mar 2018 22:56:53 +0100 Subject: btrfs: add support for 3-copy replication (raid1c3) Add new block group profile to store 3 copies in a simliar way that current RAID1 does. The profile attributes and constraints are defined in the raid table and used by the same code that already handles the 2-copy RAID1. The minimum number of devices is 3, the maximum number of devices/chunks that can be lost/damaged is 2. Like RAID6 but with 33% space utilization. Signed-off-by: David Sterba --- include/uapi/linux/btrfs.h | 3 ++- include/uapi/linux/btrfs_tree.h | 6 +++++- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 3ee0678c0a83..ba22f91a3f5b 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -831,7 +831,8 @@ enum btrfs_err_code { BTRFS_ERROR_DEV_TGT_REPLACE, BTRFS_ERROR_DEV_MISSING_NOT_FOUND, BTRFS_ERROR_DEV_ONLY_WRITABLE, - BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS + BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS, + BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET, }; #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index 5160be1d7332..52b2964b0311 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -841,6 +841,7 @@ struct btrfs_dev_replace_item { #define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6) #define BTRFS_BLOCK_GROUP_RAID5 (1ULL << 7) #define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8) +#define BTRFS_BLOCK_GROUP_RAID1C3 (1ULL << 9) #define BTRFS_BLOCK_GROUP_RESERVED (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \ BTRFS_SPACE_INFO_GLOBAL_RSV) @@ -852,6 +853,7 @@ enum btrfs_raid_types { BTRFS_RAID_SINGLE, BTRFS_RAID_RAID5, BTRFS_RAID_RAID6, + BTRFS_RAID_RAID1C3, BTRFS_NR_RAID_TYPES }; @@ -861,6 +863,7 @@ enum btrfs_raid_types { #define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ BTRFS_BLOCK_GROUP_RAID1 | \ + BTRFS_BLOCK_GROUP_RAID1C3 | \ BTRFS_BLOCK_GROUP_RAID5 | \ BTRFS_BLOCK_GROUP_RAID6 | \ BTRFS_BLOCK_GROUP_DUP | \ @@ -868,7 +871,8 @@ enum btrfs_raid_types { #define BTRFS_BLOCK_GROUP_RAID56_MASK (BTRFS_BLOCK_GROUP_RAID5 | \ BTRFS_BLOCK_GROUP_RAID6) -#define BTRFS_BLOCK_GROUP_RAID1_MASK (BTRFS_BLOCK_GROUP_RAID1) +#define BTRFS_BLOCK_GROUP_RAID1_MASK (BTRFS_BLOCK_GROUP_RAID1 | \ + BTRFS_BLOCK_GROUP_RAID1C3) /* * We need a bit for restriper to be able to tell when chunks of type -- cgit v1.2.3 From 8d6fac0087e538173f34ca7431ed9b58581acf28 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 2 Mar 2018 22:56:53 +0100 Subject: btrfs: add support for 4-copy replication (raid1c4) Add new block group profile to store 4 copies in a simliar way that current RAID1 does. The profile attributes and constraints are defined in the raid table and used by the same code that already handles the 2- and 3-copy RAID1. The minimum number of devices is 4, the maximum number of devices/chunks that can be lost/damaged is 3. There is no comparable traditional RAID level, the profile is added for future needs to accompany triple-parity and beyond. Signed-off-by: David Sterba --- include/uapi/linux/btrfs.h | 1 + include/uapi/linux/btrfs_tree.h | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index ba22f91a3f5b..a2b761275bba 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -833,6 +833,7 @@ enum btrfs_err_code { BTRFS_ERROR_DEV_ONLY_WRITABLE, BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS, BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET, + BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET, }; #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index 52b2964b0311..8e322e2c7e78 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -842,6 +842,7 @@ struct btrfs_dev_replace_item { #define BTRFS_BLOCK_GROUP_RAID5 (1ULL << 7) #define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8) #define BTRFS_BLOCK_GROUP_RAID1C3 (1ULL << 9) +#define BTRFS_BLOCK_GROUP_RAID1C4 (1ULL << 10) #define BTRFS_BLOCK_GROUP_RESERVED (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \ BTRFS_SPACE_INFO_GLOBAL_RSV) @@ -854,6 +855,7 @@ enum btrfs_raid_types { BTRFS_RAID_RAID5, BTRFS_RAID_RAID6, BTRFS_RAID_RAID1C3, + BTRFS_RAID_RAID1C4, BTRFS_NR_RAID_TYPES }; @@ -864,6 +866,7 @@ enum btrfs_raid_types { #define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ BTRFS_BLOCK_GROUP_RAID1 | \ BTRFS_BLOCK_GROUP_RAID1C3 | \ + BTRFS_BLOCK_GROUP_RAID1C4 | \ BTRFS_BLOCK_GROUP_RAID5 | \ BTRFS_BLOCK_GROUP_RAID6 | \ BTRFS_BLOCK_GROUP_DUP | \ @@ -872,7 +875,8 @@ enum btrfs_raid_types { BTRFS_BLOCK_GROUP_RAID6) #define BTRFS_BLOCK_GROUP_RAID1_MASK (BTRFS_BLOCK_GROUP_RAID1 | \ - BTRFS_BLOCK_GROUP_RAID1C3) + BTRFS_BLOCK_GROUP_RAID1C3 | \ + BTRFS_BLOCK_GROUP_RAID1C4) /* * We need a bit for restriper to be able to tell when chunks of type -- cgit v1.2.3 From cfbb825c76198c9095428c5f9362fbf6ae06f417 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 10 Jul 2018 18:15:05 +0200 Subject: btrfs: add incompat for raid1 with 3, 4 copies The new raid1c3 and raid1c4 profiles are backward incompatible and the name shall be 'raid1c34', the status can be found in the global supported features in /sys/fs/btrfs/features or in the per-filesystem directory. Signed-off-by: David Sterba --- include/uapi/linux/btrfs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index a2b761275bba..7a8bc8b920f5 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -270,6 +270,7 @@ struct btrfs_ioctl_fs_info_args { #define BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA (1ULL << 8) #define BTRFS_FEATURE_INCOMPAT_NO_HOLES (1ULL << 9) #define BTRFS_FEATURE_INCOMPAT_METADATA_UUID (1ULL << 10) +#define BTRFS_FEATURE_INCOMPAT_RAID1C34 (1ULL << 11) struct btrfs_ioctl_feature_flags { __u64 compat_flags; -- cgit v1.2.3 From cec2975f2b7058c42330a0f8164d94c6b7c8c446 Mon Sep 17 00:00:00 2001 From: Gautam Ramakrishnan Date: Wed, 20 Nov 2019 19:43:54 +0530 Subject: net: sched: pie: enable timestamp based delay calculation RFC 8033 suggests an alternative approach to calculate the queue delay in PIE by using a timestamp on every enqueued packet. This patch adds an implementation of that approach and sets it as the default method to calculate queue delay. The previous method (based on Little's law) to calculate queue delay is set as optional. Signed-off-by: Gautam Ramakrishnan Signed-off-by: Leslie Monis Signed-off-by: Mohit P. Tahiliani Acked-by: Dave Taht Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 5011259b8f67..9f1a72876212 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -950,19 +950,25 @@ enum { TCA_PIE_BETA, TCA_PIE_ECN, TCA_PIE_BYTEMODE, + TCA_PIE_DQ_RATE_ESTIMATOR, __TCA_PIE_MAX }; #define TCA_PIE_MAX (__TCA_PIE_MAX - 1) struct tc_pie_xstats { - __u64 prob; /* current probability */ - __u32 delay; /* current delay in ms */ - __u32 avg_dq_rate; /* current average dq_rate in bits/pie_time */ - __u32 packets_in; /* total number of packets enqueued */ - __u32 dropped; /* packets dropped due to pie_action */ - __u32 overlimit; /* dropped due to lack of space in queue */ - __u32 maxq; /* maximum queue size */ - __u32 ecn_mark; /* packets marked with ecn*/ + __u64 prob; /* current probability */ + __u32 delay; /* current delay in ms */ + __u32 avg_dq_rate; /* current average dq_rate in + * bits/pie_time + */ + __u32 dq_rate_estimating; /* is avg_dq_rate being calculated? */ + __u32 packets_in; /* total number of packets enqueued */ + __u32 dropped; /* packets dropped due to pie_action */ + __u32 overlimit; /* dropped due to lack of space + * in queue + */ + __u32 maxq; /* maximum queue size */ + __u32 ecn_mark; /* packets marked with ecn*/ }; /* CBS */ -- cgit v1.2.3 From 91e6015b082b08a74e5d9d326f651e5890a93519 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 20 Nov 2019 22:38:16 +0100 Subject: bpf: Emit audit messages upon successful prog load and unload Allow for audit messages to be emitted upon BPF program load and unload for having a timeline of events. The load itself is in syscall context, so additional info about the process initiating the BPF prog creation can be logged and later directly correlated to the unload event. The only info really needed from BPF side is the globally unique prog ID where then audit user space tooling can query / dump all info needed about the specific BPF program right upon load event and enrich the record, thus these changes needed here can be kept small and non-intrusive to the core. Raw example output: # auditctl -D # auditctl -a always,exit -F arch=x86_64 -S bpf # ausearch --start recent -m 1334 [...] ---- time->Wed Nov 20 12:45:51 2019 type=PROCTITLE msg=audit(1574271951.590:8974): proctitle="./test_verifier" type=SYSCALL msg=audit(1574271951.590:8974): arch=c000003e syscall=321 success=yes exit=14 a0=5 a1=7ffe2d923e80 a2=78 a3=0 items=0 ppid=742 pid=949 auid=0 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=pts0 ses=2 comm="test_verifier" exe="/root/bpf-next/tools/testing/selftests/bpf/test_verifier" subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key=(null) type=UNKNOWN[1334] msg=audit(1574271951.590:8974): auid=0 uid=0 gid=0 ses=2 subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 pid=949 comm="test_verifier" exe="/root/bpf-next/tools/testing/selftests/bpf/test_verifier" prog-id=3260 event=LOAD ---- time->Wed Nov 20 12:45:51 2019 type=UNKNOWN[1334] msg=audit(1574271951.590:8975): prog-id=3260 event=UNLOAD ---- [...] Signed-off-by: Daniel Borkmann Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20191120213816.8186-1-jolsa@kernel.org --- include/uapi/linux/audit.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index c89c6495983d..32a5db900f47 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -116,6 +116,7 @@ #define AUDIT_FANOTIFY 1331 /* Fanotify access decision */ #define AUDIT_TIME_INJOFFSET 1332 /* Timekeeping offset injected */ #define AUDIT_TIME_ADJNTPVAL 1333 /* NTP value adjustment */ +#define AUDIT_BPF 1334 /* BPF subsystem */ #define AUDIT_AVC 1400 /* SE Linux avc denial or grant */ #define AUDIT_SELINUX_ERR 1401 /* Internal SE Linux Errors */ -- cgit v1.2.3 From fca3f91cc38ad866c995fb099d961b31cd687849 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 21 Nov 2019 18:03:26 +0800 Subject: net: sched: add vxlan option support to act_tunnel_key This patch is to allow setting vxlan options using the act_tunnel_key action. Different from geneve options, only one option can be set. And also, geneve options and vxlan options can't be set at the same time. gbp is the only param for vxlan options: # ip link add name vxlan0 type vxlan dstport 0 external # tc qdisc add dev eth0 ingress # tc filter add dev eth0 protocol ip parent ffff: \ flower indev eth0 \ ip_proto udp \ action tunnel_key \ set src_ip 10.0.99.192 \ dst_ip 10.0.99.193 \ dst_port 6081 \ id 11 \ vxlan_opts 01020304 \ action mirred egress redirect dev vxlan0 v1->v2: - add .strict_start_type for enc_opts_policy as Jakub noticed. - use Duplicate instead of Wrong in err msg for extack as Jakub suggested. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/uapi/linux/tc_act/tc_tunnel_key.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/tc_act/tc_tunnel_key.h b/include/uapi/linux/tc_act/tc_tunnel_key.h index 41c8b462c177..f302c2a76953 100644 --- a/include/uapi/linux/tc_act/tc_tunnel_key.h +++ b/include/uapi/linux/tc_act/tc_tunnel_key.h @@ -50,6 +50,10 @@ enum { * TCA_TUNNEL_KEY_ENC_OPTS_ * attributes */ + TCA_TUNNEL_KEY_ENC_OPTS_VXLAN, /* Nested + * TCA_TUNNEL_KEY_ENC_OPTS_ + * attributes + */ __TCA_TUNNEL_KEY_ENC_OPTS_MAX, }; @@ -67,4 +71,13 @@ enum { #define TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX \ (__TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX - 1) +enum { + TCA_TUNNEL_KEY_ENC_OPT_VXLAN_UNSPEC, + TCA_TUNNEL_KEY_ENC_OPT_VXLAN_GBP, /* u32 */ + __TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX, +}; + +#define TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX \ + (__TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX - 1) + #endif -- cgit v1.2.3 From e20d4ff2acd7db2ffce64a6ddbdaeec43a8eec19 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 21 Nov 2019 18:03:27 +0800 Subject: net: sched: add erspan option support to act_tunnel_key This patch is to allow setting erspan options using the act_tunnel_key action. Different from geneve options, only one option can be set. And also, geneve options, vxlan options or erspan options can't be set at the same time. Options are expressed as ver:index:dir:hwid, when ver is set to 1, index will be applied while dir and hwid will be ignored, and when ver is set to 2, dir and hwid will be used while index will be ignored. # ip link add name erspan1 type erspan external # tc qdisc add dev eth0 ingress # tc filter add dev eth0 protocol ip parent ffff: \ flower indev eth0 \ ip_proto udp \ action tunnel_key \ set src_ip 10.0.99.192 \ dst_ip 10.0.99.193 \ dst_port 6081 \ id 11 \ erspan_opts 1:2:0:0 \ action mirred egress redirect dev erspan1 v1->v2: - do the validation when dst is not yet allocated as Jakub suggested. - use Duplicate instead of Wrong in err msg for extack. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/uapi/linux/tc_act/tc_tunnel_key.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/tc_act/tc_tunnel_key.h b/include/uapi/linux/tc_act/tc_tunnel_key.h index f302c2a76953..3f10dc4e7a4b 100644 --- a/include/uapi/linux/tc_act/tc_tunnel_key.h +++ b/include/uapi/linux/tc_act/tc_tunnel_key.h @@ -54,6 +54,10 @@ enum { * TCA_TUNNEL_KEY_ENC_OPTS_ * attributes */ + TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN, /* Nested + * TCA_TUNNEL_KEY_ENC_OPTS_ + * attributes + */ __TCA_TUNNEL_KEY_ENC_OPTS_MAX, }; @@ -80,4 +84,16 @@ enum { #define TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX \ (__TCA_TUNNEL_KEY_ENC_OPT_VXLAN_MAX - 1) +enum { + TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_UNSPEC, + TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_VER, /* u8 */ + TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_INDEX, /* be32 */ + TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_DIR, /* u8 */ + TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_HWID, /* u8 */ + __TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX, +}; + +#define TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX \ + (__TCA_TUNNEL_KEY_ENC_OPT_ERSPAN_MAX - 1) + #endif -- cgit v1.2.3 From d8f9dfae49ce4ffb772dc10dd6578dc815b34c12 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 21 Nov 2019 18:03:28 +0800 Subject: net: sched: allow flower to match vxlan options This patch is to allow matching gbp option in vxlan. The options can be described in the form GBP/GBP_MASK, where GBP is represented as a 32bit hexadecimal value. Different from geneve, only one option can be set. And also, geneve options and vxlan options can't be set at the same time. # ip link add name vxlan0 type vxlan dstport 0 external # tc qdisc add dev vxlan0 ingress # tc filter add dev vxlan0 protocol ip parent ffff: \ flower \ enc_src_ip 10.0.99.192 \ enc_dst_ip 10.0.99.193 \ enc_key_id 11 \ vxlan_opts 01020304/ffffffff \ ip_proto udp \ action mirred egress redirect dev eth0 v1->v2: - add .strict_start_type for enc_opts_policy as Jakub noticed. - use Duplicate instead of Wrong in err msg for extack as Jakub suggested. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index c6ad22f76ede..929825d710e2 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -571,6 +571,10 @@ enum { * TCA_FLOWER_KEY_ENC_OPT_GENEVE_ * attributes */ + TCA_FLOWER_KEY_ENC_OPTS_VXLAN, /* Nested + * TCA_FLOWER_KEY_ENC_OPT_VXLAN_ + * attributes + */ __TCA_FLOWER_KEY_ENC_OPTS_MAX, }; @@ -588,6 +592,15 @@ enum { #define TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX \ (__TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX - 1) +enum { + TCA_FLOWER_KEY_ENC_OPT_VXLAN_UNSPEC, + TCA_FLOWER_KEY_ENC_OPT_VXLAN_GBP, /* u32 */ + __TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX, +}; + +#define TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX \ + (__TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX - 1) + enum { TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT = (1 << 0), TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST = (1 << 1), -- cgit v1.2.3 From 79b1011cb33d166f531a1347a17e6602954e4eb1 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 21 Nov 2019 18:03:29 +0800 Subject: net: sched: allow flower to match erspan options This patch is to allow matching options in erspan. The options can be described in the form: VER:INDEX:DIR:HWID/VER:INDEX_MASK:DIR_MASK:HWID_MASK. When ver is set to 1, index will be applied while dir and hwid will be ignored, and when ver is set to 2, dir and hwid will be used while index will be ignored. Different from geneve, only one option can be set. And also, geneve options, vxlan options or erspan options can't be set at the same time. # ip link add name erspan1 type erspan external # tc qdisc add dev erspan1 ingress # tc filter add dev erspan1 protocol ip parent ffff: \ flower \ enc_src_ip 10.0.99.192 \ enc_dst_ip 10.0.99.193 \ enc_key_id 11 \ erspan_opts 1:12:0:0/1:ffff:0:0 \ ip_proto udp \ action mirred egress redirect dev eth0 v1->v2: - improve some err msgs of extack. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 929825d710e2..449a63971451 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -575,6 +575,10 @@ enum { * TCA_FLOWER_KEY_ENC_OPT_VXLAN_ * attributes */ + TCA_FLOWER_KEY_ENC_OPTS_ERSPAN, /* Nested + * TCA_FLOWER_KEY_ENC_OPT_ERSPAN_ + * attributes + */ __TCA_FLOWER_KEY_ENC_OPTS_MAX, }; @@ -601,6 +605,18 @@ enum { #define TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX \ (__TCA_FLOWER_KEY_ENC_OPT_VXLAN_MAX - 1) +enum { + TCA_FLOWER_KEY_ENC_OPT_ERSPAN_UNSPEC, + TCA_FLOWER_KEY_ENC_OPT_ERSPAN_VER, /* u8 */ + TCA_FLOWER_KEY_ENC_OPT_ERSPAN_INDEX, /* be32 */ + TCA_FLOWER_KEY_ENC_OPT_ERSPAN_DIR, /* u8 */ + TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID, /* u8 */ + __TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX, +}; + +#define TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX \ + (__TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX - 1) + enum { TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT = (1 << 0), TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST = (1 << 1), -- cgit v1.2.3 From 84bb46cd62283cc371769ec1f77ff7924099f584 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 23 Nov 2019 09:54:58 -0800 Subject: Revert "bpf: Emit audit messages upon successful prog load and unload" This commit reverts commit 91e6015b082b ("bpf: Emit audit messages upon successful prog load and unload") and its follow up commit 7599a896f2e4 ("audit: Move audit_log_task declaration under CONFIG_AUDITSYSCALL") as requested by Paul Moore. The change needs close review on linux-audit, tests etc. Signed-off-by: Jakub Kicinski --- include/uapi/linux/audit.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index 32a5db900f47..c89c6495983d 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -116,7 +116,6 @@ #define AUDIT_FANOTIFY 1331 /* Fanotify access decision */ #define AUDIT_TIME_INJOFFSET 1332 /* Timekeeping offset injected */ #define AUDIT_TIME_ADJNTPVAL 1333 /* NTP value adjustment */ -#define AUDIT_BPF 1334 /* BPF subsystem */ #define AUDIT_AVC 1400 /* SE Linux avc denial or grant */ #define AUDIT_SELINUX_ERR 1401 /* Internal SE Linux Errors */ -- cgit v1.2.3