From 8aa26c575fb343ebde810b30dad0cba7d8121efb Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 18 Aug 2020 10:17:33 +0200 Subject: netlink: make NLA_BINARY validation more flexible Add range validation for NLA_BINARY, allowing validation of any combination of combination minimum or maximum lengths, using the existing NLA_POLICY_RANGE()/NLA_POLICY_FULL_RANGE() macros, just like for integers where the value is checked. Also make NLA_POLICY_EXACT_LEN(), NLA_POLICY_EXACT_LEN_WARN() and NLA_POLICY_MIN_LEN() special cases of this, removing the old types NLA_EXACT_LEN and NLA_MIN_LEN. This allows us to save some code where both minimum and maximum lengths are requires, currently the policy only allows maximum (NLA_BINARY), minimum (NLA_MIN_LEN) or exact (NLA_EXACT_LEN), so a range of lengths cannot be accepted and must be checked by the code that consumes the attributes later. Also, this allows advertising the correct ranges in the policy export to userspace. Here, NLA_MIN_LEN and NLA_EXACT_LEN already were special cases of NLA_BINARY with min and min/max length respectively. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- include/net/netlink.h | 58 +++++++++++++++++++++++++++------------------------ 1 file changed, 31 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/include/net/netlink.h b/include/net/netlink.h index c0411f14fb53..fdd317f8fde4 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -181,8 +181,6 @@ enum { NLA_S64, NLA_BITFIELD32, NLA_REJECT, - NLA_EXACT_LEN, - NLA_MIN_LEN, __NLA_TYPE_MAX, }; @@ -199,11 +197,11 @@ struct netlink_range_validation_signed { enum nla_policy_validation { NLA_VALIDATE_NONE, NLA_VALIDATE_RANGE, + NLA_VALIDATE_RANGE_WARN_TOO_LONG, NLA_VALIDATE_MIN, NLA_VALIDATE_MAX, NLA_VALIDATE_RANGE_PTR, NLA_VALIDATE_FUNCTION, - NLA_VALIDATE_WARN_TOO_LONG, }; /** @@ -222,7 +220,7 @@ enum nla_policy_validation { * NLA_NUL_STRING Maximum length of string (excluding NUL) * NLA_FLAG Unused * NLA_BINARY Maximum length of attribute payload - * NLA_MIN_LEN Minimum length of attribute payload + * (but see also below with the validation type) * NLA_NESTED, * NLA_NESTED_ARRAY Length verification is done by checking len of * nested header (or empty); len field is used if @@ -237,11 +235,6 @@ enum nla_policy_validation { * just like "All other" * NLA_BITFIELD32 Unused * NLA_REJECT Unused - * NLA_EXACT_LEN Attribute should have exactly this length, otherwise - * it is rejected or warned about, the latter happening - * if and only if the `validation_type' is set to - * NLA_VALIDATE_WARN_TOO_LONG. - * NLA_MIN_LEN Minimum length of attribute payload * All other Minimum length of attribute payload * * Meaning of validation union: @@ -296,6 +289,11 @@ enum nla_policy_validation { * pointer to a struct netlink_range_validation_signed * that indicates the min/max values. * Use NLA_POLICY_FULL_RANGE_SIGNED(). + * + * NLA_BINARY If the validation type is like the ones for integers + * above, then the min/max length (not value like for + * integers) of the attribute is enforced. + * * All other Unused - but note that it's a union * * Meaning of `validate' field, use via NLA_POLICY_VALIDATE_FN: @@ -309,7 +307,7 @@ enum nla_policy_validation { * static const struct nla_policy my_policy[ATTR_MAX+1] = { * [ATTR_FOO] = { .type = NLA_U16 }, * [ATTR_BAR] = { .type = NLA_STRING, .len = BARSIZ }, - * [ATTR_BAZ] = { .type = NLA_EXACT_LEN, .len = sizeof(struct mystruct) }, + * [ATTR_BAZ] = NLA_POLICY_EXACT_LEN(sizeof(struct mystruct)), * [ATTR_GOO] = NLA_POLICY_BITFIELD32(myvalidflags), * }; */ @@ -335,9 +333,10 @@ struct nla_policy { * nesting validation starts here. * * Additionally, it means that NLA_UNSPEC is actually NLA_REJECT - * for any types >= this, so need to use NLA_MIN_LEN to get the - * previous pure { .len = xyz } behaviour. The advantage of this - * is that types not specified in the policy will be rejected. + * for any types >= this, so need to use NLA_POLICY_MIN_LEN() to + * get the previous pure { .len = xyz } behaviour. The advantage + * of this is that types not specified in the policy will be + * rejected. * * For completely new families it should be set to 1 so that the * validation is enforced for all attributes. For existing ones @@ -349,12 +348,6 @@ struct nla_policy { }; }; -#define NLA_POLICY_EXACT_LEN(_len) { .type = NLA_EXACT_LEN, .len = _len } -#define NLA_POLICY_EXACT_LEN_WARN(_len) \ - { .type = NLA_EXACT_LEN, .len = _len, \ - .validation_type = NLA_VALIDATE_WARN_TOO_LONG, } -#define NLA_POLICY_MIN_LEN(_len) { .type = NLA_MIN_LEN, .len = _len } - #define NLA_POLICY_ETH_ADDR NLA_POLICY_EXACT_LEN(ETH_ALEN) #define NLA_POLICY_ETH_ADDR_COMPAT NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN) @@ -370,19 +363,21 @@ struct nla_policy { { .type = NLA_BITFIELD32, .bitfield32_valid = valid } #define __NLA_ENSURE(condition) BUILD_BUG_ON_ZERO(!(condition)) -#define NLA_ENSURE_UINT_TYPE(tp) \ +#define NLA_ENSURE_UINT_OR_BINARY_TYPE(tp) \ (__NLA_ENSURE(tp == NLA_U8 || tp == NLA_U16 || \ tp == NLA_U32 || tp == NLA_U64 || \ - tp == NLA_MSECS) + tp) + tp == NLA_MSECS || \ + tp == NLA_BINARY) + tp) #define NLA_ENSURE_SINT_TYPE(tp) \ (__NLA_ENSURE(tp == NLA_S8 || tp == NLA_S16 || \ tp == NLA_S32 || tp == NLA_S64) + tp) -#define NLA_ENSURE_INT_TYPE(tp) \ +#define NLA_ENSURE_INT_OR_BINARY_TYPE(tp) \ (__NLA_ENSURE(tp == NLA_S8 || tp == NLA_U8 || \ tp == NLA_S16 || tp == NLA_U16 || \ tp == NLA_S32 || tp == NLA_U32 || \ tp == NLA_S64 || tp == NLA_U64 || \ - tp == NLA_MSECS) + tp) + tp == NLA_MSECS || \ + tp == NLA_BINARY) + tp) #define NLA_ENSURE_NO_VALIDATION_PTR(tp) \ (__NLA_ENSURE(tp != NLA_BITFIELD32 && \ tp != NLA_REJECT && \ @@ -390,14 +385,14 @@ struct nla_policy { tp != NLA_NESTED_ARRAY) + tp) #define NLA_POLICY_RANGE(tp, _min, _max) { \ - .type = NLA_ENSURE_INT_TYPE(tp), \ + .type = NLA_ENSURE_INT_OR_BINARY_TYPE(tp), \ .validation_type = NLA_VALIDATE_RANGE, \ .min = _min, \ .max = _max \ } #define NLA_POLICY_FULL_RANGE(tp, _range) { \ - .type = NLA_ENSURE_UINT_TYPE(tp), \ + .type = NLA_ENSURE_UINT_OR_BINARY_TYPE(tp), \ .validation_type = NLA_VALIDATE_RANGE_PTR, \ .range = _range, \ } @@ -409,13 +404,13 @@ struct nla_policy { } #define NLA_POLICY_MIN(tp, _min) { \ - .type = NLA_ENSURE_INT_TYPE(tp), \ + .type = NLA_ENSURE_INT_OR_BINARY_TYPE(tp), \ .validation_type = NLA_VALIDATE_MIN, \ .min = _min, \ } #define NLA_POLICY_MAX(tp, _max) { \ - .type = NLA_ENSURE_INT_TYPE(tp), \ + .type = NLA_ENSURE_INT_OR_BINARY_TYPE(tp), \ .validation_type = NLA_VALIDATE_MAX, \ .max = _max, \ } @@ -427,6 +422,15 @@ struct nla_policy { .len = __VA_ARGS__ + 0, \ } +#define NLA_POLICY_EXACT_LEN(_len) NLA_POLICY_RANGE(NLA_BINARY, _len, _len) +#define NLA_POLICY_EXACT_LEN_WARN(_len) { \ + .type = NLA_BINARY, \ + .validation_type = NLA_VALIDATE_RANGE_WARN_TOO_LONG, \ + .min = _len, \ + .max = _len \ +} +#define NLA_POLICY_MIN_LEN(_len) NLA_POLICY_MIN(NLA_BINARY, _len) + /** * struct nl_info - netlink source information * @nlh: Netlink message header of original request -- cgit v1.2.3 From b558b6c24068d87ffcd95dad3bf0d9bd2ac290e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= Date: Tue, 18 Aug 2020 18:07:09 -0700 Subject: net-tun: Add type safety to tun_xdp_to_ptr() and tun_ptr_to_xdp() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reduces likelihood of incorrect use. Test: builds Signed-off-by: Maciej Żenczykowski Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200819010710.3959310-1-zenczykowski@gmail.com --- include/linux/if_tun.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h index 5bda8cf457b6..6c37e1dbc5df 100644 --- a/include/linux/if_tun.h +++ b/include/linux/if_tun.h @@ -28,8 +28,8 @@ struct tun_xdp_hdr { struct socket *tun_get_socket(struct file *); struct ptr_ring *tun_get_tx_ring(struct file *file); bool tun_is_xdp_frame(void *ptr); -void *tun_xdp_to_ptr(void *ptr); -void *tun_ptr_to_xdp(void *ptr); +void *tun_xdp_to_ptr(struct xdp_frame *xdp); +struct xdp_frame *tun_ptr_to_xdp(void *ptr); void tun_ptr_free(void *ptr); #else #include @@ -48,11 +48,11 @@ static inline bool tun_is_xdp_frame(void *ptr) { return false; } -static inline void *tun_xdp_to_ptr(void *ptr) +static inline void *tun_xdp_to_ptr(struct xdp_frame *xdp) { return NULL; } -static inline void *tun_ptr_to_xdp(void *ptr) +static inline struct xdp_frame *tun_ptr_to_xdp(void *ptr) { return NULL; } -- cgit v1.2.3 From 596b5ef458f903d4362fb3c69967b6d8a23334bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= Date: Tue, 18 Aug 2020 18:07:10 -0700 Subject: net-tun: Eliminate two tun/xdp related function calls from vhost-net MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This provides a minor performance boost by virtue of inlining instead of cross module function calls. Test: builds Signed-off-by: Maciej Żenczykowski Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200819010710.3959310-2-zenczykowski@gmail.com --- include/linux/if_tun.h | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/if_tun.h b/include/linux/if_tun.h index 6c37e1dbc5df..2a7660843444 100644 --- a/include/linux/if_tun.h +++ b/include/linux/if_tun.h @@ -27,9 +27,18 @@ struct tun_xdp_hdr { #if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE) struct socket *tun_get_socket(struct file *); struct ptr_ring *tun_get_tx_ring(struct file *file); -bool tun_is_xdp_frame(void *ptr); -void *tun_xdp_to_ptr(struct xdp_frame *xdp); -struct xdp_frame *tun_ptr_to_xdp(void *ptr); +static inline bool tun_is_xdp_frame(void *ptr) +{ + return (unsigned long)ptr & TUN_XDP_FLAG; +} +static inline void *tun_xdp_to_ptr(struct xdp_frame *xdp) +{ + return (void *)((unsigned long)xdp | TUN_XDP_FLAG); +} +static inline struct xdp_frame *tun_ptr_to_xdp(void *ptr) +{ + return (void *)((unsigned long)ptr & ~TUN_XDP_FLAG); +} void tun_ptr_free(void *ptr); #else #include -- cgit v1.2.3 From bdfbb63c314a6fb7d27dddd62f5a3e4f062b92bb Mon Sep 17 00:00:00 2001 From: Kurt Kanzenbach Date: Tue, 18 Aug 2020 12:32:43 +0200 Subject: ptp: Add generic ptp v2 header parsing function Reason: A lot of the ptp drivers - which implement hardware time stamping - need specific fields such as the sequence id from the ptp v2 header. Currently all drivers implement that themselves. Introduce a generic function to retrieve a pointer to the start of the ptp v2 header. Suggested-by: Russell King Signed-off-by: Kurt Kanzenbach Reviewed-by: Richard Cochran Reviewed-by: Florian Fainelli Reviewed-by: Russell King Signed-off-by: David S. Miller --- include/linux/ptp_classify.h | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'include') diff --git a/include/linux/ptp_classify.h b/include/linux/ptp_classify.h index dd00fa41f7e7..0a9cc0eb0801 100644 --- a/include/linux/ptp_classify.h +++ b/include/linux/ptp_classify.h @@ -44,6 +44,30 @@ #define OFF_IHL 14 #define IPV4_HLEN(data) (((struct iphdr *)(data + OFF_IHL))->ihl << 2) +struct clock_identity { + u8 id[8]; +} __packed; + +struct port_identity { + struct clock_identity clock_identity; + __be16 port_number; +} __packed; + +struct ptp_header { + u8 tsmt; /* transportSpecific | messageType */ + u8 ver; /* reserved | versionPTP */ + __be16 message_length; + u8 domain_number; + u8 reserved1; + u8 flag_field[2]; + __be64 correction; + __be32 reserved2; + struct port_identity source_port_identity; + __be16 sequence_id; + u8 control; + u8 log_message_interval; +} __packed; + #if defined(CONFIG_NET_PTP_CLASSIFY) /** * ptp_classify_raw - classify a PTP packet @@ -57,6 +81,21 @@ */ unsigned int ptp_classify_raw(const struct sk_buff *skb); +/** + * ptp_parse_header - Get pointer to the PTP v2 header + * @skb: packet buffer + * @type: type of the packet (see ptp_classify_raw()) + * + * This function takes care of the VLAN, UDP, IPv4 and IPv6 headers. The length + * is checked. + * + * Note, internally skb_mac_header() is used. Make sure that the @skb is + * initialized accordingly. + * + * Return: Pointer to the ptp v2 header or NULL if not found + */ +struct ptp_header *ptp_parse_header(struct sk_buff *skb, unsigned int type); + void __init ptp_classifier_init(void); #else static inline void ptp_classifier_init(void) @@ -66,5 +105,10 @@ static inline unsigned int ptp_classify_raw(struct sk_buff *skb) { return PTP_CLASS_NONE; } +static inline struct ptp_header *ptp_parse_header(struct sk_buff *skb, + unsigned int type) +{ + return NULL; +} #endif #endif /* _PTP_CLASSIFY_H_ */ -- cgit v1.2.3 From 036c508ba95e573f53bd5bbb79b0c4d71003b319 Mon Sep 17 00:00:00 2001 From: Kurt Kanzenbach Date: Tue, 18 Aug 2020 12:32:44 +0200 Subject: ptp: Add generic ptp message type function The message type is located at different offsets within the ptp header depending on the ptp version (v1 or v2). Therefore, drivers which also deal with ptp v1 have some code for it. Extract this into a helper function for drivers to be used. Signed-off-by: Kurt Kanzenbach Reviewed-by: Richard Cochran Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/ptp_classify.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include') diff --git a/include/linux/ptp_classify.h b/include/linux/ptp_classify.h index 0a9cc0eb0801..cfc6d9152f69 100644 --- a/include/linux/ptp_classify.h +++ b/include/linux/ptp_classify.h @@ -96,6 +96,31 @@ unsigned int ptp_classify_raw(const struct sk_buff *skb); */ struct ptp_header *ptp_parse_header(struct sk_buff *skb, unsigned int type); +/** + * ptp_get_msgtype - Extract ptp message type from given header + * @hdr: ptp header + * @type: type of the packet (see ptp_classify_raw()) + * + * This function returns the message type for a given ptp header. It takes care + * of the different ptp header versions (v1 or v2). + * + * Return: The message type + */ +static inline u8 ptp_get_msgtype(const struct ptp_header *hdr, + unsigned int type) +{ + u8 msgtype; + + if (unlikely(type & PTP_CLASS_V1)) { + /* msg type is located at the control field for ptp v1 */ + msgtype = hdr->control; + } else { + msgtype = hdr->tsmt & 0x0f; + } + + return msgtype; +} + void __init ptp_classifier_init(void); #else static inline void ptp_classifier_init(void) -- cgit v1.2.3 From 17060fb5069f2c73a566015ce6e019d5685680b5 Mon Sep 17 00:00:00 2001 From: Kurt Kanzenbach Date: Tue, 18 Aug 2020 12:32:51 +0200 Subject: ptp: Remove unused macro The offset for the control field is not needed anymore. Remove it. Signed-off-by: Kurt Kanzenbach Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/ptp_classify.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/ptp_classify.h b/include/linux/ptp_classify.h index cfc6d9152f69..8437307cca8c 100644 --- a/include/linux/ptp_classify.h +++ b/include/linux/ptp_classify.h @@ -36,7 +36,6 @@ #define OFF_PTP_SOURCE_UUID 22 /* PTPv1 only */ #define OFF_PTP_SEQUENCE_ID 30 -#define OFF_PTP_CONTROL 32 /* PTPv1 only */ /* Below defines should actually be removed at some point in time. */ #define IP6_HLEN 40 -- cgit v1.2.3 From 005142b8a1f0f32d33fbe04b728464c1b7acfa0e Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 18 Aug 2020 21:27:56 -0700 Subject: bpf: Factor out bpf_link_by_id() helper. Refactor the code a bit to extract bpf_link_by_id() helper. It's similar to existing bpf_prog_by_id(). Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200819042759.51280-2-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 55f694b63164..a9b7185a6b37 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1358,6 +1358,7 @@ int btf_check_type_match(struct bpf_verifier_env *env, struct bpf_prog *prog, struct btf *btf, const struct btf_type *t); struct bpf_prog *bpf_prog_by_id(u32 id); +struct bpf_link *bpf_link_by_id(u32 id); const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id); #else /* !CONFIG_BPF_SYSCALL */ -- cgit v1.2.3 From 6b0a249a301e2af9adda84adbced3a2988248b95 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 21 Aug 2020 11:44:18 -0700 Subject: bpf: Implement link_query for bpf iterators This patch implemented bpf_link callback functions show_fdinfo and fill_link_info to support link_query interface. The general interface for show_fdinfo and fill_link_info will print/fill the target_name. Each targets can register show_fdinfo and fill_link_info callbacks to print/fill more target specific information. For example, the below is a fdinfo result for a bpf task iterator. $ cat /proc/1749/fdinfo/7 pos: 0 flags: 02000000 mnt_id: 14 link_type: iter link_id: 11 prog_tag: 990e1f8152f7e54f prog_id: 59 target_name: task Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200821184418.574122-1-yhs@fb.com --- include/linux/bpf.h | 6 ++++++ include/uapi/linux/bpf.h | 7 +++++++ 2 files changed, 13 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a9b7185a6b37..529e9b183eeb 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1218,12 +1218,18 @@ typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog, union bpf_iter_link_info *linfo, struct bpf_iter_aux_info *aux); typedef void (*bpf_iter_detach_target_t)(struct bpf_iter_aux_info *aux); +typedef void (*bpf_iter_show_fdinfo_t) (const struct bpf_iter_aux_info *aux, + struct seq_file *seq); +typedef int (*bpf_iter_fill_link_info_t)(const struct bpf_iter_aux_info *aux, + struct bpf_link_info *info); #define BPF_ITER_CTX_ARG_MAX 2 struct bpf_iter_reg { const char *target; bpf_iter_attach_target_t attach_target; bpf_iter_detach_target_t detach_target; + bpf_iter_show_fdinfo_t show_fdinfo; + bpf_iter_fill_link_info_t fill_link_info; u32 ctx_arg_info_size; struct bpf_ctx_arg_aux ctx_arg_info[BPF_ITER_CTX_ARG_MAX]; const struct bpf_iter_seq_info *seq_info; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0480f893facd..a1bbaff7a0af 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4071,6 +4071,13 @@ struct bpf_link_info { __u64 cgroup_id; __u32 attach_type; } cgroup; + struct { + __aligned_u64 target_name; /* in/out: target_name buffer ptr */ + __u32 target_name_len; /* in/out: target_name buffer len */ + union { + __u32 map_id; + } map; + } iter; struct { __u32 netns_ino; __u32 attach_type; -- cgit v1.2.3 From b76f22269028fb252727a696084c70494d80a52c Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 21 Aug 2020 11:44:19 -0700 Subject: bpf: Implement link_query callbacks in map element iterators For bpf_map_elem and bpf_sk_local_storage bpf iterators, additional map_id should be shown for fdinfo and userspace query. For example, the following is for a bpf_map_elem iterator. $ cat /proc/1753/fdinfo/9 pos: 0 flags: 02000000 mnt_id: 14 link_type: iter link_id: 34 prog_tag: 104be6d3fe45e6aa prog_id: 173 target_name: bpf_map_elem map_id: 127 Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200821184419.574240-1-yhs@fb.com --- include/linux/bpf.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 529e9b183eeb..30c144af894a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1256,6 +1256,10 @@ int bpf_iter_new_fd(struct bpf_link *link); bool bpf_link_is_iter(struct bpf_link *link); struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop); int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx); +void bpf_iter_map_show_fdinfo(const struct bpf_iter_aux_info *aux, + struct seq_file *seq); +int bpf_iter_map_fill_link_info(const struct bpf_iter_aux_info *aux, + struct bpf_link_info *info); int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); -- cgit v1.2.3 From 7b219da43f94a3b4d5a8aa4cc52b75b34f0301ec Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Fri, 21 Aug 2020 11:29:43 +0100 Subject: net: sk_msg: Simplify sk_psock initialization Initializing psock->sk_proto and other saved callbacks is only done in sk_psock_update_proto, after sk_psock_init has returned. The logic for this is difficult to follow, and needlessly complex. Instead, initialize psock->sk_proto whenever we allocate a new psock. Additionally, assert the following invariants: * The SK has no ULP: ULP does it's own finagling of sk->sk_prot * sk_user_data is unused: we need it to store sk_psock Protect our access to sk_user_data with sk_callback_lock, which is what other users like reuseport arrays, etc. do. The result is that an sk_psock is always fully initialized, and that psock->sk_proto is always the "original" struct proto. The latter allows us to use psock->sk_proto when initializing IPv6 TCP / UDP callbacks for sockmap. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200821102948.21918-2-lmb@cloudflare.com --- include/linux/skmsg.h | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 1e9ed840b9fc..3119928fc103 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -340,23 +340,6 @@ static inline void sk_psock_update_proto(struct sock *sk, struct sk_psock *psock, struct proto *ops) { - /* Initialize saved callbacks and original proto only once, since this - * function may be called multiple times for a psock, e.g. when - * psock->progs.msg_parser is updated. - * - * Since we've not installed the new proto, psock is not yet in use and - * we can initialize it without synchronization. - */ - if (!psock->sk_proto) { - struct proto *orig = READ_ONCE(sk->sk_prot); - - psock->saved_unhash = orig->unhash; - psock->saved_close = orig->close; - psock->saved_write_space = sk->sk_write_space; - - psock->sk_proto = orig; - } - /* Pairs with lockless read in sk_clone_lock() */ WRITE_ONCE(sk->sk_prot, ops); } -- cgit v1.2.3 From 13b79d3ffbb8add9e2a6d604db2b49f241b97303 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Fri, 21 Aug 2020 11:29:45 +0100 Subject: bpf: sockmap: Call sock_map_update_elem directly Don't go via map->ops to call sock_map_update_elem, since we know what function to call in bpf_map_update_value. Since we currently don't allow calling map_update_elem from BPF context, we can remove ops->map_update_elem and rename the function to sock_map_update_elem_sys. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200821102948.21918-4-lmb@cloudflare.com --- include/linux/bpf.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 30c144af894a..81f38e2fda78 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1648,6 +1648,7 @@ int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, struct bpf_prog *old, u32 which); int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog); int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype); +int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags); void sock_map_unhash(struct sock *sk); void sock_map_close(struct sock *sk, long timeout); #else @@ -1669,6 +1670,12 @@ static inline int sock_map_prog_detach(const union bpf_attr *attr, { return -EOPNOTSUPP; } + +static inline int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, + u64 flags) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_BPF_STREAM_PARSER */ #if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) -- cgit v1.2.3 From eee049c0ef5b5b433f36841801e34c21c9f82a23 Mon Sep 17 00:00:00 2001 From: Tom Parkin Date: Sat, 22 Aug 2020 15:59:08 +0100 Subject: l2tp: remove tunnel and session debug flags field The l2tp subsystem now uses standard kernel logging APIs for informational and warning messages, and tracepoints for debug information. Now that the tunnel and session debug flags are unused, remove the field from the core structures. Various system calls (in the case of l2tp_ppp) and netlink messages handle the getting and setting of debug flags. To avoid userspace breakage don't modify the API of these calls; simply ignore set requests, and send dummy data for get requests. Signed-off-by: Tom Parkin Signed-off-by: David S. Miller --- include/uapi/linux/if_pppol2tp.h | 2 +- include/uapi/linux/l2tp.h | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/if_pppol2tp.h b/include/uapi/linux/if_pppol2tp.h index 060b4d1f3129..a91044328bc9 100644 --- a/include/uapi/linux/if_pppol2tp.h +++ b/include/uapi/linux/if_pppol2tp.h @@ -75,7 +75,7 @@ struct pppol2tpv3in6_addr { }; /* Socket options: - * DEBUG - bitmask of debug message categories + * DEBUG - bitmask of debug message categories (not used) * SENDSEQ - 0 => don't send packets with sequence numbers * 1 => send packets with sequence numbers * RECVSEQ - 0 => receive packet sequence numbers are optional diff --git a/include/uapi/linux/l2tp.h b/include/uapi/linux/l2tp.h index 61158f5a1a5b..88a0d32b8c07 100644 --- a/include/uapi/linux/l2tp.h +++ b/include/uapi/linux/l2tp.h @@ -108,7 +108,7 @@ enum { L2TP_ATTR_VLAN_ID, /* u16 (not used) */ L2TP_ATTR_COOKIE, /* 0, 4 or 8 bytes */ L2TP_ATTR_PEER_COOKIE, /* 0, 4 or 8 bytes */ - L2TP_ATTR_DEBUG, /* u32, enum l2tp_debug_flags */ + L2TP_ATTR_DEBUG, /* u32, enum l2tp_debug_flags (not used) */ L2TP_ATTR_RECV_SEQ, /* u8 */ L2TP_ATTR_SEND_SEQ, /* u8 */ L2TP_ATTR_LNS_MODE, /* u8 */ @@ -177,7 +177,9 @@ enum l2tp_seqmode { }; /** - * enum l2tp_debug_flags - debug message categories for L2TP tunnels/sessions + * enum l2tp_debug_flags - debug message categories for L2TP tunnels/sessions. + * + * Unused. * * @L2TP_MSG_DEBUG: verbose debug (if compiled in) * @L2TP_MSG_CONTROL: userspace - kernel interface -- cgit v1.2.3 From 70a217f1976f75a6cfe8223e5669ad7b405daaad Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 20 Aug 2020 12:00:14 -0700 Subject: tcp: Use a struct to represent a saved_syn The TCP_SAVE_SYN has both the network header and tcp header. The total length of the saved syn packet is currently stored in the first 4 bytes (u32) of an array and the actual packet data is stored after that. A later patch will add a bpf helper that allows to get the tcp header alone from the saved syn without the network header. It will be more convenient to have a direct offset to a specific header instead of re-parsing it. This requires to separately store the network hdrlen. The total header length (i.e. network + tcp) is still needed for the current usage in getsockopt. Although this total length can be obtained by looking into the tcphdr and then get the (th->doff << 2), this patch chooses to directly store the tcp hdrlen in the second four bytes of this newly created "struct saved_syn". By using a new struct, it can give a readable name to each individual header length. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Reviewed-by: Eric Dumazet Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200820190014.2883694-1-kafai@fb.com --- include/linux/tcp.h | 7 ++++++- include/net/request_sock.h | 8 +++++++- 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 14b62d7df942..2088d5a079af 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -406,7 +406,7 @@ struct tcp_sock { * socket. Used to retransmit SYNACKs etc. */ struct request_sock __rcu *fastopen_rsk; - u32 *saved_syn; + struct saved_syn *saved_syn; }; enum tsq_enum { @@ -484,6 +484,11 @@ static inline void tcp_saved_syn_free(struct tcp_sock *tp) tp->saved_syn = NULL; } +static inline u32 tcp_saved_syn_len(const struct saved_syn *saved_syn) +{ + return saved_syn->network_hdrlen + saved_syn->tcp_hdrlen; +} + struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, const struct sk_buff *orig_skb); diff --git a/include/net/request_sock.h b/include/net/request_sock.h index b2eb8b4ba697..7d9ed99a77bd 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -41,6 +41,12 @@ struct request_sock_ops { int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req); +struct saved_syn { + u32 network_hdrlen; + u32 tcp_hdrlen; + u8 data[]; +}; + /* struct request_sock - mini sock to represent a connection request */ struct request_sock { @@ -60,7 +66,7 @@ struct request_sock { struct timer_list rsk_timer; const struct request_sock_ops *rsk_ops; struct sock *sk; - u32 *saved_syn; + struct saved_syn *saved_syn; u32 secid; u32 peer_secid; }; -- cgit v1.2.3 From 2b8ee4f05d4f6a6c427ad30dd6c1bb49eb2efd3b Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 20 Aug 2020 12:00:21 -0700 Subject: tcp: bpf: Add TCP_BPF_DELACK_MAX setsockopt This change is mostly from an internal patch and adapts it from sysctl config to the bpf_setsockopt setup. The bpf_prog can set the max delay ack by using bpf_setsockopt(TCP_BPF_DELACK_MAX). This max delay ack can be communicated to its peer through bpf header option. The receiving peer can then use this max delay ack and set a potentially lower rto by using bpf_setsockopt(TCP_BPF_RTO_MIN) which will be introduced in the next patch. Another later selftest patch will also use it like the above to show how to write and parse bpf tcp header option. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Reviewed-by: Eric Dumazet Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200820190021.2884000-1-kafai@fb.com --- include/net/inet_connection_sock.h | 1 + include/uapi/linux/bpf.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index aa8893c68c50..da7264a1ebfc 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -86,6 +86,7 @@ struct inet_connection_sock { struct timer_list icsk_retransmit_timer; struct timer_list icsk_delack_timer; __u32 icsk_rto; + __u32 icsk_delack_max; __u32 icsk_pmtu_cookie; const struct tcp_congestion_ops *icsk_ca_ops; const struct inet_connection_sock_af_ops *icsk_af_ops; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a1bbaff7a0af..7b905cb0213e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4257,6 +4257,7 @@ enum { enum { TCP_BPF_IW = 1001, /* Set TCP initial congestion window */ TCP_BPF_SNDCWND_CLAMP = 1002, /* Set sndcwnd_clamp */ + TCP_BPF_DELACK_MAX = 1003, /* Max delay ack in usecs */ }; struct bpf_perf_event_value { -- cgit v1.2.3 From ca584ba070864c606f3a54faaafe774726d5b4a1 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 20 Aug 2020 12:00:27 -0700 Subject: tcp: bpf: Add TCP_BPF_RTO_MIN for bpf_setsockopt This patch adds bpf_setsockopt(TCP_BPF_RTO_MIN) to allow bpf prog to set the min rto of a connection. It could be used together with the earlier patch which has added bpf_setsockopt(TCP_BPF_DELACK_MAX). A later selftest patch will communicate the max delay ack in a bpf tcp header option and then the receiving side can use bpf_setsockopt(TCP_BPF_RTO_MIN) to set a shorter rto. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Reviewed-by: Eric Dumazet Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200820190027.2884170-1-kafai@fb.com --- include/net/inet_connection_sock.h | 1 + include/net/tcp.h | 2 +- include/uapi/linux/bpf.h | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index da7264a1ebfc..c738abeb3265 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -86,6 +86,7 @@ struct inet_connection_sock { struct timer_list icsk_retransmit_timer; struct timer_list icsk_delack_timer; __u32 icsk_rto; + __u32 icsk_rto_min; __u32 icsk_delack_max; __u32 icsk_pmtu_cookie; const struct tcp_congestion_ops *icsk_ca_ops; diff --git a/include/net/tcp.h b/include/net/tcp.h index eab6c7510b5b..dda778c782fe 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -699,7 +699,7 @@ static inline void tcp_fast_path_check(struct sock *sk) static inline u32 tcp_rto_min(struct sock *sk) { const struct dst_entry *dst = __sk_dst_get(sk); - u32 rto_min = TCP_RTO_MIN; + u32 rto_min = inet_csk(sk)->icsk_rto_min; if (dst && dst_metric_locked(dst, RTAX_RTO_MIN)) rto_min = dst_metric_rtt(dst, RTAX_RTO_MIN); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7b905cb0213e..1ae20058b574 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4258,6 +4258,7 @@ enum { TCP_BPF_IW = 1001, /* Set TCP initial congestion window */ TCP_BPF_SNDCWND_CLAMP = 1002, /* Set sndcwnd_clamp */ TCP_BPF_DELACK_MAX = 1003, /* Max delay ack in usecs */ + TCP_BPF_RTO_MIN = 1004, /* Min delay ack in usecs */ }; struct bpf_perf_event_value { -- cgit v1.2.3 From 7656d68455891f7fc6689f95415fd59e7a1d629b Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 20 Aug 2020 12:00:33 -0700 Subject: tcp: Add saw_unknown to struct tcp_options_received In a later patch, the bpf prog only wants to be called to handle a header option if that particular header option cannot be handled by the kernel. This unknown option could be written by the peer's bpf-prog. It could also be a new standard option that the running kernel does not support it while a bpf-prog can handle it. This patch adds a "saw_unknown" bit to "struct tcp_options_received" and it uses an existing one byte hole to do that. "saw_unknown" will be set in tcp_parse_options() if it sees an option that the kernel cannot handle. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Reviewed-by: Eric Dumazet Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200820190033.2884430-1-kafai@fb.com --- include/linux/tcp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 2088d5a079af..29d166263ae7 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -92,6 +92,8 @@ struct tcp_options_received { smc_ok : 1, /* SMC seen on SYN packet */ snd_wscale : 4, /* Window scaling received from sender */ rcv_wscale : 4; /* Window scaling to send to receiver */ + u8 saw_unknown:1, /* Received unknown option */ + unused:7; u8 num_sacks; /* Number of SACK blocks */ u16 user_mss; /* mss requested by user in ioctl */ u16 mss_clamp; /* Maximal mss, negotiated at connection setup */ -- cgit v1.2.3 From 72be0fe6ba76282704cb84952bd5a1eb47910290 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 20 Aug 2020 12:00:39 -0700 Subject: bpf: tcp: Add bpf_skops_established() In tcp_init_transfer(), it currently calls the bpf prog to give it a chance to handle the just "ESTABLISHED" event (e.g. do setsockopt on the newly established sk). Right now, it is done by calling the general purpose tcp_call_bpf(). In the later patch, it also needs to pass the just-received skb which concludes the 3 way handshake. E.g. the SYNACK received at the active side. The bpf prog can then learn some specific header options written by the peer's bpf-prog and potentially do setsockopt on the newly established sk. Thus, instead of reusing the general purpose tcp_call_bpf(), a new function bpf_skops_established() is added to allow passing the "skb" to the bpf prog. The actual skb passing from bpf_skops_established() to the bpf prog will happen together in a later patch which has the necessary bpf pieces. A "skb" arg is also added to tcp_init_transfer() such that it can then be passed to bpf_skops_established(). Calling the new bpf_skops_established() instead of tcp_call_bpf() should be a noop in this patch. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200820190039.2884750-1-kafai@fb.com --- include/net/tcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index dda778c782fe..c186dbf731e1 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -394,7 +394,7 @@ void tcp_metrics_init(void); bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst); void tcp_close(struct sock *sk, long timeout); void tcp_init_sock(struct sock *sk); -void tcp_init_transfer(struct sock *sk, int bpf_op); +void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb); __poll_t tcp_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); int tcp_getsockopt(struct sock *sk, int level, int optname, -- cgit v1.2.3 From 00d211a4ea6f48e8e3b758813fe23ad28193d3bf Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 20 Aug 2020 12:00:46 -0700 Subject: bpf: tcp: Add bpf_skops_parse_hdr() The patch adds a function bpf_skops_parse_hdr(). It will call the bpf prog to parse the TCP header received at a tcp_sock that has at least reached the ESTABLISHED state. For the packets received during the 3WHS (SYN, SYNACK and ACK), the received skb will be available to the bpf prog during the callback in bpf_skops_established() introduced in the previous patch and in the bpf_skops_write_hdr_opt() that will be added in the next patch. Calling bpf prog to parse header is controlled by two new flags in tp->bpf_sock_ops_cb_flags: BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG and BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG. When BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG is set, the bpf prog will only be called when there is unknown option in the TCP header. When BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG is set, the bpf prog will be called on all received TCP header. This function is half implemented to highlight the changes in TCP stack. The actual codes preparing the bpf running context and invoking the bpf prog will be added in the later patch with other necessary bpf pieces. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/bpf/20200820190046.2885054-1-kafai@fb.com --- include/uapi/linux/bpf.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1ae20058b574..010ed2abcb66 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4173,8 +4173,10 @@ enum { BPF_SOCK_OPS_RETRANS_CB_FLAG = (1<<1), BPF_SOCK_OPS_STATE_CB_FLAG = (1<<2), BPF_SOCK_OPS_RTT_CB_FLAG = (1<<3), + BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG = (1<<4), + BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG = (1<<5), /* Mask of all currently supported cb flags */ - BPF_SOCK_OPS_ALL_CB_FLAGS = 0xF, + BPF_SOCK_OPS_ALL_CB_FLAGS = 0x3F, }; /* List of known BPF sock_ops operators. -- cgit v1.2.3 From 331fca4315efa3bbd258fbdf8209d59d253c0480 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 20 Aug 2020 12:00:52 -0700 Subject: bpf: tcp: Add bpf_skops_hdr_opt_len() and bpf_skops_write_hdr_opt() The bpf prog needs to parse the SYN header to learn what options have been sent by the peer's bpf-prog before writing its options into SYNACK. This patch adds a "syn_skb" arg to tcp_make_synack() and send_synack(). This syn_skb will eventually be made available (as read-only) to the bpf prog. This will be the only SYN packet available to the bpf prog during syncookie. For other regular cases, the bpf prog can also use the saved_syn. When writing options, the bpf prog will first be called to tell the kernel its required number of bytes. It is done by the new bpf_skops_hdr_opt_len(). The bpf prog will only be called when the new BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG is set in tp->bpf_sock_ops_cb_flags. When the bpf prog returns, the kernel will know how many bytes are needed and then update the "*remaining" arg accordingly. 4 byte alignment will be included in the "*remaining" before this function returns. The 4 byte aligned number of bytes will also be stored into the opts->bpf_opt_len. "bpf_opt_len" is a newly added member to the struct tcp_out_options. Then the new bpf_skops_write_hdr_opt() will call the bpf prog to write the header options. The bpf prog is only called if it has reserved spaces before (opts->bpf_opt_len > 0). The bpf prog is the last one getting a chance to reserve header space and writing the header option. These two functions are half implemented to highlight the changes in TCP stack. The actual codes preparing the bpf running context and invoking the bpf prog will be added in the later patch with other necessary bpf pieces. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/bpf/20200820190052.2885316-1-kafai@fb.com --- include/net/tcp.h | 6 ++++-- include/uapi/linux/bpf.h | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index c186dbf731e1..3e768a6b8264 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -455,7 +455,8 @@ enum tcp_synack_type { struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, struct request_sock *req, struct tcp_fastopen_cookie *foc, - enum tcp_synack_type synack_type); + enum tcp_synack_type synack_type, + struct sk_buff *syn_skb); int tcp_disconnect(struct sock *sk, int flags); void tcp_finish_connect(struct sock *sk, struct sk_buff *skb); @@ -2035,7 +2036,8 @@ struct tcp_request_sock_ops { int (*send_synack)(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, struct tcp_fastopen_cookie *foc, - enum tcp_synack_type synack_type); + enum tcp_synack_type synack_type, + struct sk_buff *syn_skb); }; extern const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 010ed2abcb66..18d0e128bc3c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4175,8 +4175,9 @@ enum { BPF_SOCK_OPS_RTT_CB_FLAG = (1<<3), BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG = (1<<4), BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG = (1<<5), + BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG = (1<<6), /* Mask of all currently supported cb flags */ - BPF_SOCK_OPS_ALL_CB_FLAGS = 0x3F, + BPF_SOCK_OPS_ALL_CB_FLAGS = 0x7F, }; /* List of known BPF sock_ops operators. -- cgit v1.2.3 From c9985d09e18965131958102f4b67fa1e742df335 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 20 Aug 2020 12:00:58 -0700 Subject: bpf: sock_ops: Change some members of sock_ops_kern from u32 to u8 A later patch needs to add a few pointers and a few u8 to sock_ops_kern. Hence, this patch saves some spaces by moving some of the existing members from u32 to u8 so that the later patch can still fit everything in a cacheline. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200820190058.2885640-1-kafai@fb.com --- include/linux/filter.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 0a355b005bf4..c427dfa5f908 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1236,13 +1236,13 @@ struct bpf_sock_addr_kern { struct bpf_sock_ops_kern { struct sock *sk; - u32 op; union { u32 args[4]; u32 reply; u32 replylong[4]; }; - u32 is_fullsock; + u8 op; + u8 is_fullsock; u64 temp; /* temp and everything after is not * initialized to 0 before calling * the BPF program. New fields that -- cgit v1.2.3 From 0813a841566f0962a5551be7749b43c45f0022a0 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 20 Aug 2020 12:01:04 -0700 Subject: bpf: tcp: Allow bpf prog to write and parse TCP header option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Note: The TCP changes here is mainly to implement the bpf pieces into the bpf_skops_*() functions introduced in the earlier patches. ] The earlier effort in BPF-TCP-CC allows the TCP Congestion Control algorithm to be written in BPF. It opens up opportunities to allow a faster turnaround time in testing/releasing new congestion control ideas to production environment. The same flexibility can be extended to writing TCP header option. It is not uncommon that people want to test new TCP header option to improve the TCP performance. Another use case is for data-center that has a more controlled environment and has more flexibility in putting header options for internal only use. For example, we want to test the idea in putting maximum delay ACK in TCP header option which is similar to a draft RFC proposal [1]. This patch introduces the necessary BPF API and use them in the TCP stack to allow BPF_PROG_TYPE_SOCK_OPS program to parse and write TCP header options. It currently supports most of the TCP packet except RST. Supported TCP header option: ─────────────────────────── This patch allows the bpf-prog to write any option kind. Different bpf-progs can write its own option by calling the new helper bpf_store_hdr_opt(). The helper will ensure there is no duplicated option in the header. By allowing bpf-prog to write any option kind, this gives a lot of flexibility to the bpf-prog. Different bpf-prog can write its own option kind. It could also allow the bpf-prog to support a recently standardized option on an older kernel. Sockops Callback Flags: ────────────────────── The bpf program will only be called to parse/write tcp header option if the following newly added callback flags are enabled in tp->bpf_sock_ops_cb_flags: BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG A few words on the PARSE CB flags. When the above PARSE CB flags are turned on, the bpf-prog will be called on packets received at a sk that has at least reached the ESTABLISHED state. The parsing of the SYN-SYNACK-ACK will be discussed in the "3 Way HandShake" section. The default is off for all of the above new CB flags, i.e. the bpf prog will not be called to parse or write bpf hdr option. There are details comment on these new cb flags in the UAPI bpf.h. sock_ops->skb_data and bpf_load_hdr_opt() ───────────────────────────────────────── sock_ops->skb_data and sock_ops->skb_data_end covers the whole TCP header and its options. They are read only. The new bpf_load_hdr_opt() helps to read a particular option "kind" from the skb_data. Please refer to the comment in UAPI bpf.h. It has details on what skb_data contains under different sock_ops->op. 3 Way HandShake ─────────────── The bpf-prog can learn if it is sending SYN or SYNACK by reading the sock_ops->skb_tcp_flags. * Passive side When writing SYNACK (i.e. sock_ops->op == BPF_SOCK_OPS_WRITE_HDR_OPT_CB), the received SYN skb will be available to the bpf prog. The bpf prog can use the SYN skb (which may carry the header option sent from the remote bpf prog) to decide what bpf header option should be written to the outgoing SYNACK skb. The SYN packet can be obtained by getsockopt(TCP_BPF_SYN*). More on this later. Also, the bpf prog can learn if it is in syncookie mode (by checking sock_ops->args[0] == BPF_WRITE_HDR_TCP_SYNACK_COOKIE). The bpf prog can store the received SYN pkt by using the existing bpf_setsockopt(TCP_SAVE_SYN). The example in a later patch does it. [ Note that the fullsock here is a listen sk, bpf_sk_storage is not very useful here since the listen sk will be shared by many concurrent connection requests. Extending bpf_sk_storage support to request_sock will add weight to the minisock and it is not necessary better than storing the whole ~100 bytes SYN pkt. ] When the connection is established, the bpf prog will be called in the existing PASSIVE_ESTABLISHED_CB callback. At that time, the bpf prog can get the header option from the saved syn and then apply the needed operation to the newly established socket. The later patch will use the max delay ack specified in the SYN header and set the RTO of this newly established connection as an example. The received ACK (that concludes the 3WHS) will also be available to the bpf prog during PASSIVE_ESTABLISHED_CB through the sock_ops->skb_data. It could be useful in syncookie scenario. More on this later. There is an existing getsockopt "TCP_SAVED_SYN" to return the whole saved syn pkt which includes the IP[46] header and the TCP header. A few "TCP_BPF_SYN*" getsockopt has been added to allow specifying where to start getting from, e.g. starting from TCP header, or from IP[46] header. The new getsockopt(TCP_BPF_SYN*) will also know where it can get the SYN's packet from: - (a) the just received syn (available when the bpf prog is writing SYNACK) and it is the only way to get SYN during syncookie mode. or - (b) the saved syn (available in PASSIVE_ESTABLISHED_CB and also other existing CB). The bpf prog does not need to know where the SYN pkt is coming from. The getsockopt(TCP_BPF_SYN*) will hide this details. Similarly, a flags "BPF_LOAD_HDR_OPT_TCP_SYN" is also added to bpf_load_hdr_opt() to read a particular header option from the SYN packet. * Fastopen Fastopen should work the same as the regular non fastopen case. This is a test in a later patch. * Syncookie For syncookie, the later example patch asks the active side's bpf prog to resend the header options in ACK. The server can use bpf_load_hdr_opt() to look at the options in this received ACK during PASSIVE_ESTABLISHED_CB. * Active side The bpf prog will get a chance to write the bpf header option in the SYN packet during WRITE_HDR_OPT_CB. The received SYNACK pkt will also be available to the bpf prog during the existing ACTIVE_ESTABLISHED_CB callback through the sock_ops->skb_data and bpf_load_hdr_opt(). * Turn off header CB flags after 3WHS If the bpf prog does not need to write/parse header options beyond the 3WHS, the bpf prog can clear the bpf_sock_ops_cb_flags to avoid being called for header options. Or the bpf-prog can select to leave the UNKNOWN_HDR_OPT_CB_FLAG on so that the kernel will only call it when there is option that the kernel cannot handle. [1]: draft-wang-tcpm-low-latency-opt-00 https://tools.ietf.org/html/draft-wang-tcpm-low-latency-opt-00 Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200820190104.2885895-1-kafai@fb.com --- include/linux/bpf-cgroup.h | 25 ++++ include/linux/filter.h | 4 + include/net/tcp.h | 49 ++++++++ include/uapi/linux/bpf.h | 300 ++++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 377 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 64f367044e25..2f98d2fce62e 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -279,6 +279,31 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_RECVMSG, NULL) +/* The SOCK_OPS"_SK" macro should be used when sock_ops->sk is not a + * fullsock and its parent fullsock cannot be traced by + * sk_to_full_sk(). + * + * e.g. sock_ops->sk is a request_sock and it is under syncookie mode. + * Its listener-sk is not attached to the rsk_listener. + * In this case, the caller holds the listener-sk (unlocked), + * set its sock_ops->sk to req_sk, and call this SOCK_OPS"_SK" with + * the listener-sk such that the cgroup-bpf-progs of the + * listener-sk will be run. + * + * Regardless of syncookie mode or not, + * calling bpf_setsockopt on listener-sk will not make sense anyway, + * so passing 'sock_ops->sk == req_sk' to the bpf prog is appropriate here. + */ +#define BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(sock_ops, sk) \ +({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled) \ + __ret = __cgroup_bpf_run_filter_sock_ops(sk, \ + sock_ops, \ + BPF_CGROUP_SOCK_OPS); \ + __ret; \ +}) + #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \ ({ \ int __ret = 0; \ diff --git a/include/linux/filter.h b/include/linux/filter.h index c427dfa5f908..995625950cc1 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1241,8 +1241,12 @@ struct bpf_sock_ops_kern { u32 reply; u32 replylong[4]; }; + struct sk_buff *syn_skb; + struct sk_buff *skb; + void *skb_data_end; u8 op; u8 is_fullsock; + u8 remaining_opt_len; u64 temp; /* temp and everything after is not * initialized to 0 before calling * the BPF program. New fields that diff --git a/include/net/tcp.h b/include/net/tcp.h index 3e768a6b8264..1f967b4e22f6 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2235,6 +2235,55 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, int len, int flags); #endif /* CONFIG_NET_SOCK_MSG */ +#ifdef CONFIG_CGROUP_BPF +/* Copy the listen sk's HDR_OPT_CB flags to its child. + * + * During 3-Way-HandShake, the synack is usually sent from + * the listen sk with the HDR_OPT_CB flags set so that + * bpf-prog will be called to write the BPF hdr option. + * + * In fastopen, the child sk is used to send synack instead + * of the listen sk. Thus, inheriting the HDR_OPT_CB flags + * from the listen sk gives the bpf-prog a chance to write + * BPF hdr option in the synack pkt during fastopen. + * + * Both fastopen and non-fastopen child will inherit the + * HDR_OPT_CB flags to keep the bpf-prog having a consistent + * behavior when deciding to clear this cb flags (or not) + * during the PASSIVE_ESTABLISHED_CB. + * + * In the future, other cb flags could be inherited here also. + */ +static inline void bpf_skops_init_child(const struct sock *sk, + struct sock *child) +{ + tcp_sk(child)->bpf_sock_ops_cb_flags = + tcp_sk(sk)->bpf_sock_ops_cb_flags & + (BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG | + BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG | + BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG); +} + +static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops, + struct sk_buff *skb, + unsigned int end_offset) +{ + skops->skb = skb; + skops->skb_data_end = skb->data + end_offset; +} +#else +static inline void bpf_skops_init_child(const struct sock *sk, + struct sock *child) +{ +} + +static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops, + struct sk_buff *skb, + unsigned int end_offset) +{ +} +#endif + /* Call BPF_SOCK_OPS program that returns an int. If the return value * is < 0, then the BPF op failed (for example if the loaded BPF * program does not support the chosen operation or there is no BPF diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 18d0e128bc3c..f67ec5d9e57d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3395,6 +3395,120 @@ union bpf_attr { * A non-negative value equal to or less than *size* on success, * or a negative error in case of failure. * + * long bpf_load_hdr_opt(struct bpf_sock_ops *skops, void *searchby_res, u32 len, u64 flags) + * Description + * Load header option. Support reading a particular TCP header + * option for bpf program (BPF_PROG_TYPE_SOCK_OPS). + * + * If *flags* is 0, it will search the option from the + * sock_ops->skb_data. The comment in "struct bpf_sock_ops" + * has details on what skb_data contains under different + * sock_ops->op. + * + * The first byte of the *searchby_res* specifies the + * kind that it wants to search. + * + * If the searching kind is an experimental kind + * (i.e. 253 or 254 according to RFC6994). It also + * needs to specify the "magic" which is either + * 2 bytes or 4 bytes. It then also needs to + * specify the size of the magic by using + * the 2nd byte which is "kind-length" of a TCP + * header option and the "kind-length" also + * includes the first 2 bytes "kind" and "kind-length" + * itself as a normal TCP header option also does. + * + * For example, to search experimental kind 254 with + * 2 byte magic 0xeB9F, the searchby_res should be + * [ 254, 4, 0xeB, 0x9F, 0, 0, .... 0 ]. + * + * To search for the standard window scale option (3), + * the searchby_res should be [ 3, 0, 0, .... 0 ]. + * Note, kind-length must be 0 for regular option. + * + * Searching for No-Op (0) and End-of-Option-List (1) are + * not supported. + * + * *len* must be at least 2 bytes which is the minimal size + * of a header option. + * + * Supported flags: + * * **BPF_LOAD_HDR_OPT_TCP_SYN** to search from the + * saved_syn packet or the just-received syn packet. + * + * Return + * >0 when found, the header option is copied to *searchby_res*. + * The return value is the total length copied. + * + * **-EINVAL** If param is invalid + * + * **-ENOMSG** The option is not found + * + * **-ENOENT** No syn packet available when + * **BPF_LOAD_HDR_OPT_TCP_SYN** is used + * + * **-ENOSPC** Not enough space. Only *len* number of + * bytes are copied. + * + * **-EFAULT** Cannot parse the header options in the packet + * + * **-EPERM** This helper cannot be used under the + * current sock_ops->op. + * + * long bpf_store_hdr_opt(struct bpf_sock_ops *skops, const void *from, u32 len, u64 flags) + * Description + * Store header option. The data will be copied + * from buffer *from* with length *len* to the TCP header. + * + * The buffer *from* should have the whole option that + * includes the kind, kind-length, and the actual + * option data. The *len* must be at least kind-length + * long. The kind-length does not have to be 4 byte + * aligned. The kernel will take care of the padding + * and setting the 4 bytes aligned value to th->doff. + * + * This helper will check for duplicated option + * by searching the same option in the outgoing skb. + * + * This helper can only be called during + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * + * Return + * 0 on success, or negative error in case of failure: + * + * **-EINVAL** If param is invalid + * + * **-ENOSPC** Not enough space in the header. + * Nothing has been written + * + * **-EEXIST** The option has already existed + * + * **-EFAULT** Cannot parse the existing header options + * + * **-EPERM** This helper cannot be used under the + * current sock_ops->op. + * + * long bpf_reserve_hdr_opt(struct bpf_sock_ops *skops, u32 len, u64 flags) + * Description + * Reserve *len* bytes for the bpf header option. The + * space will be used by bpf_store_hdr_opt() later in + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * + * If bpf_reserve_hdr_opt() is called multiple times, + * the total number of bytes will be reserved. + * + * This helper can only be called during + * BPF_SOCK_OPS_HDR_OPT_LEN_CB. + * + * Return + * 0 on success, or negative error in case of failure: + * + * **-EINVAL** if param is invalid + * + * **-ENOSPC** Not enough space in the header. + * + * **-EPERM** This helper cannot be used under the + * current sock_ops->op. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3539,6 +3653,9 @@ union bpf_attr { FN(skc_to_tcp_request_sock), \ FN(skc_to_udp6_sock), \ FN(get_task_stack), \ + FN(load_hdr_opt), \ + FN(store_hdr_opt), \ + FN(reserve_hdr_opt), /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -4165,6 +4282,36 @@ struct bpf_sock_ops { __u64 bytes_received; __u64 bytes_acked; __bpf_md_ptr(struct bpf_sock *, sk); + /* [skb_data, skb_data_end) covers the whole TCP header. + * + * BPF_SOCK_OPS_PARSE_HDR_OPT_CB: The packet received + * BPF_SOCK_OPS_HDR_OPT_LEN_CB: Not useful because the + * header has not been written. + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB: The header and options have + * been written so far. + * BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: The SYNACK that concludes + * the 3WHS. + * BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: The ACK that concludes + * the 3WHS. + * + * bpf_load_hdr_opt() can also be used to read a particular option. + */ + __bpf_md_ptr(void *, skb_data); + __bpf_md_ptr(void *, skb_data_end); + __u32 skb_len; /* The total length of a packet. + * It includes the header, options, + * and payload. + */ + __u32 skb_tcp_flags; /* tcp_flags of the header. It provides + * an easy way to check for tcp_flags + * without parsing skb_data. + * + * In particular, the skb_tcp_flags + * will still be available in + * BPF_SOCK_OPS_HDR_OPT_LEN even though + * the outgoing header has not + * been written yet. + */ }; /* Definitions for bpf_sock_ops_cb_flags */ @@ -4173,8 +4320,48 @@ enum { BPF_SOCK_OPS_RETRANS_CB_FLAG = (1<<1), BPF_SOCK_OPS_STATE_CB_FLAG = (1<<2), BPF_SOCK_OPS_RTT_CB_FLAG = (1<<3), - BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG = (1<<4), + /* Call bpf for all received TCP headers. The bpf prog will be + * called under sock_ops->op == BPF_SOCK_OPS_PARSE_HDR_OPT_CB + * + * Please refer to the comment in BPF_SOCK_OPS_PARSE_HDR_OPT_CB + * for the header option related helpers that will be useful + * to the bpf programs. + * + * It could be used at the client/active side (i.e. connect() side) + * when the server told it that the server was in syncookie + * mode and required the active side to resend the bpf-written + * options. The active side can keep writing the bpf-options until + * it received a valid packet from the server side to confirm + * the earlier packet (and options) has been received. The later + * example patch is using it like this at the active side when the + * server is in syncookie mode. + * + * The bpf prog will usually turn this off in the common cases. + */ + BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG = (1<<4), + /* Call bpf when kernel has received a header option that + * the kernel cannot handle. The bpf prog will be called under + * sock_ops->op == BPF_SOCK_OPS_PARSE_HDR_OPT_CB. + * + * Please refer to the comment in BPF_SOCK_OPS_PARSE_HDR_OPT_CB + * for the header option related helpers that will be useful + * to the bpf programs. + */ BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG = (1<<5), + /* Call bpf when the kernel is writing header options for the + * outgoing packet. The bpf prog will first be called + * to reserve space in a skb under + * sock_ops->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB. Then + * the bpf prog will be called to write the header option(s) + * under sock_ops->op == BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * + * Please refer to the comment in BPF_SOCK_OPS_HDR_OPT_LEN_CB + * and BPF_SOCK_OPS_WRITE_HDR_OPT_CB for the header option + * related helpers that will be useful to the bpf programs. + * + * The kernel gets its chance to reserve space and write + * options first before the BPF program does. + */ BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG = (1<<6), /* Mask of all currently supported cb flags */ BPF_SOCK_OPS_ALL_CB_FLAGS = 0x7F, @@ -4233,6 +4420,63 @@ enum { */ BPF_SOCK_OPS_RTT_CB, /* Called on every RTT. */ + BPF_SOCK_OPS_PARSE_HDR_OPT_CB, /* Parse the header option. + * It will be called to handle + * the packets received at + * an already established + * connection. + * + * sock_ops->skb_data: + * Referring to the received skb. + * It covers the TCP header only. + * + * bpf_load_hdr_opt() can also + * be used to search for a + * particular option. + */ + BPF_SOCK_OPS_HDR_OPT_LEN_CB, /* Reserve space for writing the + * header option later in + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * Arg1: bool want_cookie. (in + * writing SYNACK only) + * + * sock_ops->skb_data: + * Not available because no header has + * been written yet. + * + * sock_ops->skb_tcp_flags: + * The tcp_flags of the + * outgoing skb. (e.g. SYN, ACK, FIN). + * + * bpf_reserve_hdr_opt() should + * be used to reserve space. + */ + BPF_SOCK_OPS_WRITE_HDR_OPT_CB, /* Write the header options + * Arg1: bool want_cookie. (in + * writing SYNACK only) + * + * sock_ops->skb_data: + * Referring to the outgoing skb. + * It covers the TCP header + * that has already been written + * by the kernel and the + * earlier bpf-progs. + * + * sock_ops->skb_tcp_flags: + * The tcp_flags of the outgoing + * skb. (e.g. SYN, ACK, FIN). + * + * bpf_store_hdr_opt() should + * be used to write the + * option. + * + * bpf_load_hdr_opt() can also + * be used to search for a + * particular option that + * has already been written + * by the kernel or the + * earlier bpf-progs. + */ }; /* List of TCP states. There is a build check in net/ipv4/tcp.c to detect @@ -4262,6 +4506,60 @@ enum { TCP_BPF_SNDCWND_CLAMP = 1002, /* Set sndcwnd_clamp */ TCP_BPF_DELACK_MAX = 1003, /* Max delay ack in usecs */ TCP_BPF_RTO_MIN = 1004, /* Min delay ack in usecs */ + /* Copy the SYN pkt to optval + * + * BPF_PROG_TYPE_SOCK_OPS only. It is similar to the + * bpf_getsockopt(TCP_SAVED_SYN) but it does not limit + * to only getting from the saved_syn. It can either get the + * syn packet from: + * + * 1. the just-received SYN packet (only available when writing the + * SYNACK). It will be useful when it is not necessary to + * save the SYN packet for latter use. It is also the only way + * to get the SYN during syncookie mode because the syn + * packet cannot be saved during syncookie. + * + * OR + * + * 2. the earlier saved syn which was done by + * bpf_setsockopt(TCP_SAVE_SYN). + * + * The bpf_getsockopt(TCP_BPF_SYN*) option will hide where the + * SYN packet is obtained. + * + * If the bpf-prog does not need the IP[46] header, the + * bpf-prog can avoid parsing the IP header by using + * TCP_BPF_SYN. Otherwise, the bpf-prog can get both + * IP[46] and TCP header by using TCP_BPF_SYN_IP. + * + * >0: Total number of bytes copied + * -ENOSPC: Not enough space in optval. Only optlen number of + * bytes is copied. + * -ENOENT: The SYN skb is not available now and the earlier SYN pkt + * is not saved by setsockopt(TCP_SAVE_SYN). + */ + TCP_BPF_SYN = 1005, /* Copy the TCP header */ + TCP_BPF_SYN_IP = 1006, /* Copy the IP[46] and TCP header */ +}; + +enum { + BPF_LOAD_HDR_OPT_TCP_SYN = (1ULL << 0), +}; + +/* args[0] value during BPF_SOCK_OPS_HDR_OPT_LEN_CB and + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + */ +enum { + BPF_WRITE_HDR_TCP_CURRENT_MSS = 1, /* Kernel is finding the + * total option spaces + * required for an established + * sk in order to calculate the + * MSS. No skb is actually + * sent. + */ + BPF_WRITE_HDR_TCP_SYNACK_COOKIE = 2, /* Kernel is in syncookie mode + * when sending a SYN. + */ }; struct bpf_perf_event_value { -- cgit v1.2.3 From 267cf9fa43d1c9d525d5d818a8651f2900e3aa9e Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 20 Aug 2020 12:01:23 -0700 Subject: tcp: bpf: Optionally store mac header in TCP_SAVE_SYN This patch is adapted from Eric's patch in an earlier discussion [1]. The TCP_SAVE_SYN currently only stores the network header and tcp header. This patch allows it to optionally store the mac header also if the setsockopt's optval is 2. It requires one more bit for the "save_syn" bit field in tcp_sock. This patch achieves this by moving the syn_smc bit next to the is_mptcp. The syn_smc is currently used with the TCP experimental option. Since syn_smc is only used when CONFIG_SMC is enabled, this patch also puts the "IS_ENABLED(CONFIG_SMC)" around it like the is_mptcp did with "IS_ENABLED(CONFIG_MPTCP)". The mac_hdrlen is also stored in the "struct saved_syn" to allow a quick offset from the bpf prog if it chooses to start getting from the network header or the tcp header. [1]: https://lore.kernel.org/netdev/CANn89iLJNWh6bkH7DNhy_kmcAexuUCccqERqe7z2QsvPhGrYPQ@mail.gmail.com/ Suggested-by: Eric Dumazet Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/bpf/20200820190123.2886935-1-kafai@fb.com --- include/linux/tcp.h | 13 ++++++++----- include/net/request_sock.h | 1 + include/uapi/linux/bpf.h | 1 + 3 files changed, 10 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 29d166263ae7..56ff2952edaf 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -239,14 +239,13 @@ struct tcp_sock { repair : 1, frto : 1;/* F-RTO (RFC5682) activated in CA_Loss */ u8 repair_queue; - u8 syn_data:1, /* SYN includes data */ + u8 save_syn:2, /* Save headers of SYN packet */ + syn_data:1, /* SYN includes data */ syn_fastopen:1, /* SYN includes Fast Open option */ syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */ syn_fastopen_ch:1, /* Active TFO re-enabling probe */ syn_data_acked:1,/* data in SYN is acked by SYN-ACK */ - save_syn:1, /* Save headers of SYN packet */ - is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */ - syn_smc:1; /* SYN includes SMC */ + is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */ u32 tlp_high_seq; /* snd_nxt at the time of TLP */ u32 tcp_tx_delay; /* delay (in usec) added to TX packets */ @@ -393,6 +392,9 @@ struct tcp_sock { #if IS_ENABLED(CONFIG_MPTCP) bool is_mptcp; #endif +#if IS_ENABLED(CONFIG_SMC) + bool syn_smc; /* SYN includes SMC */ +#endif #ifdef CONFIG_TCP_MD5SIG /* TCP AF-Specific parts; only used by MD5 Signature support so far */ @@ -488,7 +490,8 @@ static inline void tcp_saved_syn_free(struct tcp_sock *tp) static inline u32 tcp_saved_syn_len(const struct saved_syn *saved_syn) { - return saved_syn->network_hdrlen + saved_syn->tcp_hdrlen; + return saved_syn->mac_hdrlen + saved_syn->network_hdrlen + + saved_syn->tcp_hdrlen; } struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, diff --git a/include/net/request_sock.h b/include/net/request_sock.h index 7d9ed99a77bd..29e41ff3ec93 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -42,6 +42,7 @@ struct request_sock_ops { int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req); struct saved_syn { + u32 mac_hdrlen; u32 network_hdrlen; u32 tcp_hdrlen; u8 data[]; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f67ec5d9e57d..544b89a64918 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4540,6 +4540,7 @@ enum { */ TCP_BPF_SYN = 1005, /* Copy the TCP header */ TCP_BPF_SYN_IP = 1006, /* Copy the IP[46] and TCP header */ + TCP_BPF_SYN_MAC = 1007, /* Copy the MAC, IP[46], and TCP header */ }; enum { -- cgit v1.2.3 From 583bbf0624dfd8fc45f1049be1d4980be59451ff Mon Sep 17 00:00:00 2001 From: Luke Hsiao Date: Fri, 21 Aug 2020 21:41:04 -0700 Subject: io_uring: allow tcp ancillary data for __sys_recvmsg_sock() For TCP tx zero-copy, the kernel notifies the process of completions by queuing completion notifications on the socket error queue. This patch allows reading these notifications via recvmsg to support TCP tx zero-copy. Ancillary data was originally disallowed due to privilege escalation via io_uring's offloading of sendmsg() onto a kernel thread with kernel credentials (https://crbug.com/project-zero/1975). So, we must ensure that the socket type is one where the ancillary data types that are delivered on recvmsg are plain data (no file descriptors or values that are translated based on the identity of the calling process). This was tested by using io_uring to call recvmsg on the MSG_ERRQUEUE with tx zero-copy enabled. Before this patch, we received -EINVALID from this specific code path. After this patch, we could read tcp tx zero-copy completion notifications from the MSG_ERRQUEUE. Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: Arjun Roy Acked-by: Eric Dumazet Reviewed-by: Jann Horn Reviewed-by: Jens Axboe Signed-off-by: Luke Hsiao Signed-off-by: David S. Miller --- include/linux/net.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/net.h b/include/linux/net.h index d48ff1180879..7657c6432a69 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -41,6 +41,8 @@ struct net; #define SOCK_PASSCRED 3 #define SOCK_PASSSEC 4 +#define PROTO_CMSG_DATA_ONLY 0x0001 + #ifndef ARCH_HAS_SOCKET_TYPES /** * enum sock_type - Socket types @@ -135,6 +137,7 @@ typedef int (*sk_read_actor_t)(read_descriptor_t *, struct sk_buff *, struct proto_ops { int family; + unsigned int flags; struct module *owner; int (*release) (struct socket *sock); int (*bind) (struct socket *sock, -- cgit v1.2.3 From 755f982bb1ff469a181df3eaf8dd5d769267ab8e Mon Sep 17 00:00:00 2001 From: Igor Russkikh Date: Sun, 23 Aug 2020 14:19:26 +0300 Subject: qed/qede: make devlink survive recovery Devlink instance lifecycle was linked to qed_dev object, that caused devlink to be recreated on each recovery. Changing it by making higher level driver (qede) responsible for its life. This way devlink now survives recoveries. qede now stores devlink structure pointer as a part of its device object, devlink private data contains a linkage structure, qed_devlink. Signed-off-by: Igor Russkikh Signed-off-by: Alexander Lobakin Signed-off-by: Michal Kalderon Signed-off-by: David S. Miller --- include/linux/qed/qed_if.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index cd6a5c7e56eb..d8368e1770df 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -21,6 +21,7 @@ #include #include #include +#include enum dcbx_protocol_type { DCBX_PROTOCOL_ISCSI, @@ -779,6 +780,10 @@ enum qed_nvm_flash_cmd { QED_NVM_FLASH_CMD_NVM_MAX, }; +struct qed_devlink { + struct qed_dev *cdev; +}; + struct qed_common_cb_ops { void (*arfs_filter_op)(void *dev, void *fltr, u8 fw_rc); void (*link_update)(void *dev, struct qed_link_output *link); @@ -1137,6 +1142,10 @@ struct qed_common_ops { * */ int (*set_grc_config)(struct qed_dev *cdev, u32 cfg_id, u32 val); + + struct devlink* (*devlink_register)(struct qed_dev *cdev); + + void (*devlink_unregister)(struct devlink *devlink); }; #define MASK_FIELD(_name, _value) \ -- cgit v1.2.3 From 9524067b9a91dce2a096a0de7727c217495e3d2e Mon Sep 17 00:00:00 2001 From: Igor Russkikh Date: Sun, 23 Aug 2020 14:19:29 +0300 Subject: qed: health reporter init deinit seq Here we declare health reporter ops (empty for now) and register these in qed probe and remove callbacks. This way we get devlink attached to all kind of qed* PCI device entities: networking or storage offload entity. Signed-off-by: Igor Russkikh Signed-off-by: Alexander Lobakin Signed-off-by: Michal Kalderon Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/qed/qed_if.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index d8368e1770df..30fe06fe06a0 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -782,6 +782,7 @@ enum qed_nvm_flash_cmd { struct qed_devlink { struct qed_dev *cdev; + struct devlink_health_reporter *fw_reporter; }; struct qed_common_cb_ops { -- cgit v1.2.3 From 4f5a8db27eb926616500f827d9b81dc40595b0ef Mon Sep 17 00:00:00 2001 From: Igor Russkikh Date: Sun, 23 Aug 2020 14:19:30 +0300 Subject: qed: use devlink logic to report errors Use devlink_health_report to push error indications. We implement this in qede via callback function to make it possible to reuse the same for other drivers sitting on top of qed in future. Signed-off-by: Igor Russkikh Signed-off-by: Alexander Lobakin Signed-off-by: Michal Kalderon Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/qed/qed_if.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index 30fe06fe06a0..a75533de9186 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -906,6 +906,9 @@ struct qed_common_ops { int (*dbg_all_data_size) (struct qed_dev *cdev); + int (*report_fatal_error)(struct devlink *devlink, + enum qed_hw_err_type err_type); + /** * @brief can_link_change - can the instance change the link or not * -- cgit v1.2.3 From c5c642c55e2fd43fcf42262fd1e87271d413fb42 Mon Sep 17 00:00:00 2001 From: Igor Russkikh Date: Sun, 23 Aug 2020 14:19:33 +0300 Subject: qed: align adjacent indent Remove extra indent on some of adjacent declarations. Signed-off-by: Igor Russkikh Signed-off-by: Alexander Lobakin Signed-off-by: Michal Kalderon Signed-off-by: David S. Miller --- include/linux/qed/qed_if.h | 69 ++++++++++++++++++++++------------------------ 1 file changed, 33 insertions(+), 36 deletions(-) (limited to 'include') diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index a75533de9186..56fa55841d39 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -850,10 +850,9 @@ struct qed_common_ops { struct qed_dev* (*probe)(struct pci_dev *dev, struct qed_probe_params *params); - void (*remove)(struct qed_dev *cdev); + void (*remove)(struct qed_dev *cdev); - int (*set_power_state)(struct qed_dev *cdev, - pci_power_t state); + int (*set_power_state)(struct qed_dev *cdev, pci_power_t state); void (*set_name) (struct qed_dev *cdev, char name[]); @@ -861,50 +860,48 @@ struct qed_common_ops { * PF params required for the call before slowpath_start is * documented within the qed_pf_params structure definition. */ - void (*update_pf_params)(struct qed_dev *cdev, - struct qed_pf_params *params); - int (*slowpath_start)(struct qed_dev *cdev, - struct qed_slowpath_params *params); + void (*update_pf_params)(struct qed_dev *cdev, + struct qed_pf_params *params); - int (*slowpath_stop)(struct qed_dev *cdev); + int (*slowpath_start)(struct qed_dev *cdev, + struct qed_slowpath_params *params); + + int (*slowpath_stop)(struct qed_dev *cdev); /* Requests to use `cnt' interrupts for fastpath. * upon success, returns number of interrupts allocated for fastpath. */ - int (*set_fp_int)(struct qed_dev *cdev, - u16 cnt); + int (*set_fp_int)(struct qed_dev *cdev, u16 cnt); /* Fills `info' with pointers required for utilizing interrupts */ - int (*get_fp_int)(struct qed_dev *cdev, - struct qed_int_info *info); - - u32 (*sb_init)(struct qed_dev *cdev, - struct qed_sb_info *sb_info, - void *sb_virt_addr, - dma_addr_t sb_phy_addr, - u16 sb_id, - enum qed_sb_type type); - - u32 (*sb_release)(struct qed_dev *cdev, - struct qed_sb_info *sb_info, - u16 sb_id, - enum qed_sb_type type); - - void (*simd_handler_config)(struct qed_dev *cdev, - void *token, - int index, - void (*handler)(void *)); - - void (*simd_handler_clean)(struct qed_dev *cdev, - int index); - int (*dbg_grc)(struct qed_dev *cdev, - void *buffer, u32 *num_dumped_bytes); + int (*get_fp_int)(struct qed_dev *cdev, struct qed_int_info *info); + + u32 (*sb_init)(struct qed_dev *cdev, + struct qed_sb_info *sb_info, + void *sb_virt_addr, + dma_addr_t sb_phy_addr, + u16 sb_id, + enum qed_sb_type type); + + u32 (*sb_release)(struct qed_dev *cdev, + struct qed_sb_info *sb_info, + u16 sb_id, + enum qed_sb_type type); + + void (*simd_handler_config)(struct qed_dev *cdev, + void *token, + int index, + void (*handler)(void *)); + + void (*simd_handler_clean)(struct qed_dev *cdev, int index); + + int (*dbg_grc)(struct qed_dev *cdev, void *buffer, u32 *num_dumped_bytes); int (*dbg_grc_size)(struct qed_dev *cdev); - int (*dbg_all_data) (struct qed_dev *cdev, void *buffer); + int (*dbg_all_data)(struct qed_dev *cdev, void *buffer); - int (*dbg_all_data_size) (struct qed_dev *cdev); + int (*dbg_all_data_size)(struct qed_dev *cdev); int (*report_fatal_error)(struct devlink *devlink, enum qed_hw_err_type err_type); -- cgit v1.2.3 From 1f00d375af84fbcdb6dd6c79fd7c3d02d2390338 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Tue, 25 Aug 2020 20:29:13 +0200 Subject: bpf: Renames in preparation for bpf_local_storage A purely mechanical change to split the renaming from the actual generalization. Flags/consts: SK_STORAGE_CREATE_FLAG_MASK BPF_LOCAL_STORAGE_CREATE_FLAG_MASK BPF_SK_STORAGE_CACHE_SIZE BPF_LOCAL_STORAGE_CACHE_SIZE MAX_VALUE_SIZE BPF_LOCAL_STORAGE_MAX_VALUE_SIZE Structs: bucket bpf_local_storage_map_bucket bpf_sk_storage_map bpf_local_storage_map bpf_sk_storage_data bpf_local_storage_data bpf_sk_storage_elem bpf_local_storage_elem bpf_sk_storage bpf_local_storage The "sk" member in bpf_local_storage is also updated to "owner" in preparation for changing the type to void * in a subsequent patch. Functions: selem_linked_to_sk selem_linked_to_storage selem_alloc bpf_selem_alloc __selem_unlink_sk bpf_selem_unlink_storage_nolock __selem_link_sk bpf_selem_link_storage_nolock selem_unlink_sk __bpf_selem_unlink_storage sk_storage_update bpf_local_storage_update __sk_storage_lookup bpf_local_storage_lookup bpf_sk_storage_map_free bpf_local_storage_map_free bpf_sk_storage_map_alloc bpf_local_storage_map_alloc bpf_sk_storage_map_alloc_check bpf_local_storage_map_alloc_check bpf_sk_storage_map_check_btf bpf_local_storage_map_check_btf Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200825182919.1118197-2-kpsingh@chromium.org --- include/net/sock.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 064637d1ddf6..18423cc9cde8 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -246,7 +246,7 @@ struct sock_common { /* public: */ }; -struct bpf_sk_storage; +struct bpf_local_storage; /** * struct sock - network layer representation of sockets @@ -517,7 +517,7 @@ struct sock { void (*sk_destruct)(struct sock *sk); struct sock_reuseport __rcu *sk_reuseport_cb; #ifdef CONFIG_BPF_SYSCALL - struct bpf_sk_storage __rcu *sk_bpf_storage; + struct bpf_local_storage __rcu *sk_bpf_storage; #endif struct rcu_head sk_rcu; }; -- cgit v1.2.3 From 4cc9ce4e739961a7b9e6b2f3b27a72124d356373 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Tue, 25 Aug 2020 20:29:14 +0200 Subject: bpf: Generalize caching for sk_storage. Provide the a ability to define local storage caches on a per-object type basis. The caches and caching indices for different objects should not be inter-mixed as suggested in: https://lore.kernel.org/bpf/20200630193441.kdwnkestulg5erii@kafai-mbp.dhcp.thefacebook.com/ "Caching a sk-storage at idx=0 of a sk should not stop an inode-storage to be cached at the same idx of a inode." Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200825182919.1118197-3-kpsingh@chromium.org --- include/net/bpf_sk_storage.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include') diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h index 5036c94c0503..950c5aaba15e 100644 --- a/include/net/bpf_sk_storage.h +++ b/include/net/bpf_sk_storage.h @@ -3,6 +3,9 @@ #ifndef _BPF_SK_STORAGE_H #define _BPF_SK_STORAGE_H +#include +#include + struct sock; void bpf_sk_storage_free(struct sock *sk); @@ -15,6 +18,22 @@ struct sk_buff; struct nlattr; struct sock; +#define BPF_LOCAL_STORAGE_CACHE_SIZE 16 + +struct bpf_local_storage_cache { + spinlock_t idx_lock; + u64 idx_usage_counts[BPF_LOCAL_STORAGE_CACHE_SIZE]; +}; + +#define DEFINE_BPF_STORAGE_CACHE(name) \ +static struct bpf_local_storage_cache name = { \ + .idx_lock = __SPIN_LOCK_UNLOCKED(name.idx_lock), \ +} + +u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache); +void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache, + u16 idx); + #ifdef CONFIG_BPF_SYSCALL int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk); struct bpf_sk_storage_diag * -- cgit v1.2.3 From f836a56e84ffc9f1a1cd73f77e10404ca46a4616 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Tue, 25 Aug 2020 20:29:15 +0200 Subject: bpf: Generalize bpf_sk_storage Refactor the functionality in bpf_sk_storage.c so that concept of storage linked to kernel objects can be extended to other objects like inode, task_struct etc. Each new local storage will still be a separate map and provide its own set of helpers. This allows for future object specific extensions and still share a lot of the underlying implementation. This includes the changes suggested by Martin in: https://lore.kernel.org/bpf/20200725013047.4006241-1-kafai@fb.com/ adding new map operations to support bpf_local_storage maps: * storages for different kernel objects to optionally have different memory charging strategy (map_local_storage_charge, map_local_storage_uncharge) * Functionality to extract the storage pointer from a pointer to the owning object (map_owner_storage_ptr) Co-developed-by: Martin KaFai Lau Signed-off-by: Martin KaFai Lau Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200825182919.1118197-4-kpsingh@chromium.org --- include/linux/bpf.h | 8 +++++++ include/net/bpf_sk_storage.h | 52 ++++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/bpf.h | 8 +++++-- 3 files changed, 66 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 81f38e2fda78..8c443b93ac11 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -34,6 +34,8 @@ struct btf_type; struct exception_table_entry; struct seq_operations; struct bpf_iter_aux_info; +struct bpf_local_storage; +struct bpf_local_storage_map; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; @@ -104,6 +106,12 @@ struct bpf_map_ops { __poll_t (*map_poll)(struct bpf_map *map, struct file *filp, struct poll_table_struct *pts); + /* Functions called by bpf_local_storage maps */ + int (*map_local_storage_charge)(struct bpf_local_storage_map *smap, + void *owner, u32 size); + void (*map_local_storage_uncharge)(struct bpf_local_storage_map *smap, + void *owner, u32 size); + struct bpf_local_storage __rcu ** (*map_owner_storage_ptr)(void *owner); /* BTF name and id of struct allocated by map_alloc */ const char * const map_btf_name; int *map_btf_id; diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h index 950c5aaba15e..9e631b5466e3 100644 --- a/include/net/bpf_sk_storage.h +++ b/include/net/bpf_sk_storage.h @@ -3,8 +3,15 @@ #ifndef _BPF_SK_STORAGE_H #define _BPF_SK_STORAGE_H +#include +#include +#include #include #include +#include +#include +#include +#include struct sock; @@ -13,6 +20,7 @@ void bpf_sk_storage_free(struct sock *sk); extern const struct bpf_func_proto bpf_sk_storage_get_proto; extern const struct bpf_func_proto bpf_sk_storage_delete_proto; +struct bpf_local_storage_elem; struct bpf_sk_storage_diag; struct sk_buff; struct nlattr; @@ -34,6 +42,50 @@ u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache); void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache, u16 idx); +/* Helper functions for bpf_local_storage */ +int bpf_local_storage_map_alloc_check(union bpf_attr *attr); + +struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr); + +struct bpf_local_storage_data * +bpf_local_storage_lookup(struct bpf_local_storage *local_storage, + struct bpf_local_storage_map *smap, + bool cacheit_lockit); + +void bpf_local_storage_map_free(struct bpf_local_storage_map *smap); + +int bpf_local_storage_map_check_btf(const struct bpf_map *map, + const struct btf *btf, + const struct btf_type *key_type, + const struct btf_type *value_type); + +void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, + struct bpf_local_storage_elem *selem); + +bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, + struct bpf_local_storage_elem *selem, + bool uncharge_omem); + +void bpf_selem_unlink(struct bpf_local_storage_elem *selem); + +void bpf_selem_link_map(struct bpf_local_storage_map *smap, + struct bpf_local_storage_elem *selem); + +void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem); + +struct bpf_local_storage_elem * +bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value, + bool charge_mem); + +int +bpf_local_storage_alloc(void *owner, + struct bpf_local_storage_map *smap, + struct bpf_local_storage_elem *first_selem); + +struct bpf_local_storage_data * +bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, + void *value, u64 map_flags); + #ifdef CONFIG_BPF_SYSCALL int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk); struct bpf_sk_storage_diag * diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 544b89a64918..2cbd137eed86 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3765,9 +3765,13 @@ enum { BPF_F_SYSCTL_BASE_NAME = (1ULL << 0), }; -/* BPF_FUNC_sk_storage_get flags */ +/* BPF_FUNC__storage_get flags */ enum { - BPF_SK_STORAGE_GET_F_CREATE = (1ULL << 0), + BPF_LOCAL_STORAGE_GET_F_CREATE = (1ULL << 0), + /* BPF_SK_STORAGE_GET_F_CREATE is only kept for backward compatibility + * and BPF_LOCAL_STORAGE_GET_F_CREATE must be used instead. + */ + BPF_SK_STORAGE_GET_F_CREATE = BPF_LOCAL_STORAGE_GET_F_CREATE, }; /* BPF_FUNC_read_branch_records flags. */ -- cgit v1.2.3 From 450af8d0f6be2e7dd2a528a3fb054bb726bf1747 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Tue, 25 Aug 2020 20:29:16 +0200 Subject: bpf: Split bpf_local_storage to bpf_sk_storage A purely mechanical change: bpf_sk_storage.c = bpf_sk_storage.c + bpf_local_storage.c bpf_sk_storage.h = bpf_sk_storage.h + bpf_local_storage.h Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200825182919.1118197-5-kpsingh@chromium.org --- include/linux/bpf_local_storage.h | 163 ++++++++++++++++++++++++++++++++++++++ include/net/bpf_sk_storage.h | 61 +------------- 2 files changed, 164 insertions(+), 60 deletions(-) create mode 100644 include/linux/bpf_local_storage.h (limited to 'include') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h new file mode 100644 index 000000000000..b2c9463f36a1 --- /dev/null +++ b/include/linux/bpf_local_storage.h @@ -0,0 +1,163 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2019 Facebook + * Copyright 2020 Google LLC. + */ + +#ifndef _BPF_LOCAL_STORAGE_H +#define _BPF_LOCAL_STORAGE_H + +#include +#include +#include +#include +#include +#include + +#define BPF_LOCAL_STORAGE_CACHE_SIZE 16 + +struct bpf_local_storage_map_bucket { + struct hlist_head list; + raw_spinlock_t lock; +}; + +/* Thp map is not the primary owner of a bpf_local_storage_elem. + * Instead, the container object (eg. sk->sk_bpf_storage) is. + * + * The map (bpf_local_storage_map) is for two purposes + * 1. Define the size of the "local storage". It is + * the map's value_size. + * + * 2. Maintain a list to keep track of all elems such + * that they can be cleaned up during the map destruction. + * + * When a bpf local storage is being looked up for a + * particular object, the "bpf_map" pointer is actually used + * as the "key" to search in the list of elem in + * the respective bpf_local_storage owned by the object. + * + * e.g. sk->sk_bpf_storage is the mini-map with the "bpf_map" pointer + * as the searching key. + */ +struct bpf_local_storage_map { + struct bpf_map map; + /* Lookup elem does not require accessing the map. + * + * Updating/Deleting requires a bucket lock to + * link/unlink the elem from the map. Having + * multiple buckets to improve contention. + */ + struct bpf_local_storage_map_bucket *buckets; + u32 bucket_log; + u16 elem_size; + u16 cache_idx; +}; + +struct bpf_local_storage_data { + /* smap is used as the searching key when looking up + * from the object's bpf_local_storage. + * + * Put it in the same cacheline as the data to minimize + * the number of cachelines access during the cache hit case. + */ + struct bpf_local_storage_map __rcu *smap; + u8 data[] __aligned(8); +}; + +/* Linked to bpf_local_storage and bpf_local_storage_map */ +struct bpf_local_storage_elem { + struct hlist_node map_node; /* Linked to bpf_local_storage_map */ + struct hlist_node snode; /* Linked to bpf_local_storage */ + struct bpf_local_storage __rcu *local_storage; + struct rcu_head rcu; + /* 8 bytes hole */ + /* The data is stored in aother cacheline to minimize + * the number of cachelines access during a cache hit. + */ + struct bpf_local_storage_data sdata ____cacheline_aligned; +}; + +struct bpf_local_storage { + struct bpf_local_storage_data __rcu *cache[BPF_LOCAL_STORAGE_CACHE_SIZE]; + struct hlist_head list; /* List of bpf_local_storage_elem */ + void *owner; /* The object that owns the above "list" of + * bpf_local_storage_elem. + */ + struct rcu_head rcu; + raw_spinlock_t lock; /* Protect adding/removing from the "list" */ +}; + +/* U16_MAX is much more than enough for sk local storage + * considering a tcp_sock is ~2k. + */ +#define BPF_LOCAL_STORAGE_MAX_VALUE_SIZE \ + min_t(u32, \ + (KMALLOC_MAX_SIZE - MAX_BPF_STACK - \ + sizeof(struct bpf_local_storage_elem)), \ + (U16_MAX - sizeof(struct bpf_local_storage_elem))) + +#define SELEM(_SDATA) \ + container_of((_SDATA), struct bpf_local_storage_elem, sdata) +#define SDATA(_SELEM) (&(_SELEM)->sdata) + +#define BPF_LOCAL_STORAGE_CACHE_SIZE 16 + +struct bpf_local_storage_cache { + spinlock_t idx_lock; + u64 idx_usage_counts[BPF_LOCAL_STORAGE_CACHE_SIZE]; +}; + +#define DEFINE_BPF_STORAGE_CACHE(name) \ +static struct bpf_local_storage_cache name = { \ + .idx_lock = __SPIN_LOCK_UNLOCKED(name.idx_lock), \ +} + +u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache); +void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache, + u16 idx); + +/* Helper functions for bpf_local_storage */ +int bpf_local_storage_map_alloc_check(union bpf_attr *attr); + +struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr); + +struct bpf_local_storage_data * +bpf_local_storage_lookup(struct bpf_local_storage *local_storage, + struct bpf_local_storage_map *smap, + bool cacheit_lockit); + +void bpf_local_storage_map_free(struct bpf_local_storage_map *smap); + +int bpf_local_storage_map_check_btf(const struct bpf_map *map, + const struct btf *btf, + const struct btf_type *key_type, + const struct btf_type *value_type); + +void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, + struct bpf_local_storage_elem *selem); + +bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, + struct bpf_local_storage_elem *selem, + bool uncharge_omem); + +void bpf_selem_unlink(struct bpf_local_storage_elem *selem); + +void bpf_selem_link_map(struct bpf_local_storage_map *smap, + struct bpf_local_storage_elem *selem); + +void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem); + +struct bpf_local_storage_elem * +bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value, + bool charge_mem); + +int +bpf_local_storage_alloc(void *owner, + struct bpf_local_storage_map *smap, + struct bpf_local_storage_elem *first_selem); + +struct bpf_local_storage_data * +bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, + void *value, u64 map_flags); + +#endif /* _BPF_LOCAL_STORAGE_H */ diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h index 9e631b5466e3..3c516dd07caf 100644 --- a/include/net/bpf_sk_storage.h +++ b/include/net/bpf_sk_storage.h @@ -12,6 +12,7 @@ #include #include #include +#include struct sock; @@ -26,66 +27,6 @@ struct sk_buff; struct nlattr; struct sock; -#define BPF_LOCAL_STORAGE_CACHE_SIZE 16 - -struct bpf_local_storage_cache { - spinlock_t idx_lock; - u64 idx_usage_counts[BPF_LOCAL_STORAGE_CACHE_SIZE]; -}; - -#define DEFINE_BPF_STORAGE_CACHE(name) \ -static struct bpf_local_storage_cache name = { \ - .idx_lock = __SPIN_LOCK_UNLOCKED(name.idx_lock), \ -} - -u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache); -void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache, - u16 idx); - -/* Helper functions for bpf_local_storage */ -int bpf_local_storage_map_alloc_check(union bpf_attr *attr); - -struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr); - -struct bpf_local_storage_data * -bpf_local_storage_lookup(struct bpf_local_storage *local_storage, - struct bpf_local_storage_map *smap, - bool cacheit_lockit); - -void bpf_local_storage_map_free(struct bpf_local_storage_map *smap); - -int bpf_local_storage_map_check_btf(const struct bpf_map *map, - const struct btf *btf, - const struct btf_type *key_type, - const struct btf_type *value_type); - -void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, - struct bpf_local_storage_elem *selem); - -bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, - struct bpf_local_storage_elem *selem, - bool uncharge_omem); - -void bpf_selem_unlink(struct bpf_local_storage_elem *selem); - -void bpf_selem_link_map(struct bpf_local_storage_map *smap, - struct bpf_local_storage_elem *selem); - -void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem); - -struct bpf_local_storage_elem * -bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value, - bool charge_mem); - -int -bpf_local_storage_alloc(void *owner, - struct bpf_local_storage_map *smap, - struct bpf_local_storage_elem *first_selem); - -struct bpf_local_storage_data * -bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, - void *value, u64 map_flags); - #ifdef CONFIG_BPF_SYSCALL int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk); struct bpf_sk_storage_diag * -- cgit v1.2.3 From 8ea636848aca35b9f97c5b5dee30225cf2dd0fe6 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Tue, 25 Aug 2020 20:29:17 +0200 Subject: bpf: Implement bpf_local_storage for inodes Similar to bpf_local_storage for sockets, add local storage for inodes. The life-cycle of storage is managed with the life-cycle of the inode. i.e. the storage is destroyed along with the owning inode. The BPF LSM allocates an __rcu pointer to the bpf_local_storage in the security blob which are now stackable and can co-exist with other LSMs. Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200825182919.1118197-6-kpsingh@chromium.org --- include/linux/bpf_lsm.h | 29 +++++++++++++++++++++++++++++ include/linux/bpf_types.h | 3 +++ include/uapi/linux/bpf.h | 40 +++++++++++++++++++++++++++++++++++++++- 3 files changed, 71 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h index af74712af585..aaacb6aafc87 100644 --- a/include/linux/bpf_lsm.h +++ b/include/linux/bpf_lsm.h @@ -17,9 +17,28 @@ #include #undef LSM_HOOK +struct bpf_storage_blob { + struct bpf_local_storage __rcu *storage; +}; + +extern struct lsm_blob_sizes bpf_lsm_blob_sizes; + int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog); +static inline struct bpf_storage_blob *bpf_inode( + const struct inode *inode) +{ + if (unlikely(!inode->i_security)) + return NULL; + + return inode->i_security + bpf_lsm_blob_sizes.lbs_inode; +} + +extern const struct bpf_func_proto bpf_inode_storage_get_proto; +extern const struct bpf_func_proto bpf_inode_storage_delete_proto; +void bpf_inode_storage_free(struct inode *inode); + #else /* !CONFIG_BPF_LSM */ static inline int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, @@ -28,6 +47,16 @@ static inline int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, return -EOPNOTSUPP; } +static inline struct bpf_storage_blob *bpf_inode( + const struct inode *inode) +{ + return NULL; +} + +static inline void bpf_inode_storage_free(struct inode *inode) +{ +} + #endif /* CONFIG_BPF_LSM */ #endif /* _LINUX_BPF_LSM_H */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index a52a5688418e..2e6f568377f1 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -107,6 +107,9 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_SK_STORAGE, sk_storage_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops) #endif +#ifdef CONFIG_BPF_LSM +BPF_MAP_TYPE(BPF_MAP_TYPE_INODE_STORAGE, inode_storage_map_ops) +#endif BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) #if defined(CONFIG_XDP_SOCKETS) BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2cbd137eed86..b6bfcd085a76 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -155,6 +155,7 @@ enum bpf_map_type { BPF_MAP_TYPE_DEVMAP_HASH, BPF_MAP_TYPE_STRUCT_OPS, BPF_MAP_TYPE_RINGBUF, + BPF_MAP_TYPE_INODE_STORAGE, }; /* Note that tracing related programs such as @@ -3509,6 +3510,41 @@ union bpf_attr { * * **-EPERM** This helper cannot be used under the * current sock_ops->op. + * void *bpf_inode_storage_get(struct bpf_map *map, void *inode, void *value, u64 flags) + * Description + * Get a bpf_local_storage from an *inode*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *inode* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *inode*) except this + * helper enforces the key must be an inode and the map must also + * be a **BPF_MAP_TYPE_INODE_STORAGE**. + * + * Underneath, the value is stored locally at *inode* instead of + * the *map*. The *map* is used as the bpf-local-storage + * "type". The bpf-local-storage "type" (i.e. the *map*) is + * searched against all bpf_local_storage residing at *inode*. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * Return + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + * + * int bpf_inode_storage_delete(struct bpf_map *map, void *inode) + * Description + * Delete a bpf_local_storage from an *inode*. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3655,7 +3691,9 @@ union bpf_attr { FN(get_task_stack), \ FN(load_hdr_opt), \ FN(store_hdr_opt), \ - FN(reserve_hdr_opt), + FN(reserve_hdr_opt), \ + FN(inode_storage_get), \ + FN(inode_storage_delete), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 30897832d8b97e93833fb52c0a02951db3692ed2 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Tue, 25 Aug 2020 20:29:18 +0200 Subject: bpf: Allow local storage to be used from LSM programs Adds support for both bpf_{sk, inode}_storage_{get, delete} to be used in LSM programs. These helpers are not used for tracing programs (currently) as their usage is tied to the life-cycle of the object and should only be used where the owning object won't be freed (when the owning object is passed as an argument to the LSM hook). Thus, they are safer to use in LSM hooks than tracing. Usage of local storage in tracing programs will probably follow a per function based whitelist approach. Since the UAPI helper signature for bpf_sk_storage expect a bpf_sock, it, leads to a compilation warning for LSM programs, it's also updated to accept a void * pointer instead. Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200825182919.1118197-7-kpsingh@chromium.org --- include/net/bpf_sk_storage.h | 2 ++ include/uapi/linux/bpf.h | 7 +++++-- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h index 3c516dd07caf..119f4c9c3a9c 100644 --- a/include/net/bpf_sk_storage.h +++ b/include/net/bpf_sk_storage.h @@ -20,6 +20,8 @@ void bpf_sk_storage_free(struct sock *sk); extern const struct bpf_func_proto bpf_sk_storage_get_proto; extern const struct bpf_func_proto bpf_sk_storage_delete_proto; +extern const struct bpf_func_proto sk_storage_get_btf_proto; +extern const struct bpf_func_proto sk_storage_delete_btf_proto; struct bpf_local_storage_elem; struct bpf_sk_storage_diag; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b6bfcd085a76..0e1cdf806fe1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2808,7 +2808,7 @@ union bpf_attr { * * **-ERANGE** if resulting value was out of range. * - * void *bpf_sk_storage_get(struct bpf_map *map, struct bpf_sock *sk, void *value, u64 flags) + * void *bpf_sk_storage_get(struct bpf_map *map, void *sk, void *value, u64 flags) * Description * Get a bpf-local-storage from a *sk*. * @@ -2824,6 +2824,9 @@ union bpf_attr { * "type". The bpf-local-storage "type" (i.e. the *map*) is * searched against all bpf-local-storages residing at *sk*. * + * *sk* is a kernel **struct sock** pointer for LSM program. + * *sk* is a **struct bpf_sock** pointer for other program types. + * * An optional *flags* (**BPF_SK_STORAGE_GET_F_CREATE**) can be * used such that a new bpf-local-storage will be * created if one does not exist. *value* can be used @@ -2836,7 +2839,7 @@ union bpf_attr { * **NULL** if not found or there was an error in adding * a new bpf-local-storage. * - * long bpf_sk_storage_delete(struct bpf_map *map, struct bpf_sock *sk) + * long bpf_sk_storage_delete(struct bpf_map *map, void *sk) * Description * Delete a bpf-local-storage from a *sk*. * Return -- cgit v1.2.3 From 6298399bfc101f8e8cf35a916f26aa32bdf04278 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 25 Aug 2020 21:21:13 +0200 Subject: bpf: Move btf_resolve_size into __btf_resolve_size Moving btf_resolve_size into __btf_resolve_size and keeping btf_resolve_size public with just first 3 arguments, because the rest of the arguments are not used by outside callers. Following changes are adding more arguments, which are not useful to outside callers. They will be added to the __btf_resolve_size function. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200825192124.710397-4-jolsa@kernel.org --- include/linux/btf.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/btf.h b/include/linux/btf.h index 8b81fbb4497c..a9af5e7a7ece 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -64,8 +64,7 @@ const struct btf_type *btf_type_resolve_func_ptr(const struct btf *btf, u32 id, u32 *res_id); const struct btf_type * btf_resolve_size(const struct btf *btf, const struct btf_type *type, - u32 *type_size, const struct btf_type **elem_type, - u32 *total_nelems); + u32 *type_size); #define for_each_member(i, struct_type, member) \ for (i = 0, member = btf_type_member(struct_type); \ -- cgit v1.2.3 From faaf4a790d93794b46d67e2fd69b8e5c8cae2d41 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 25 Aug 2020 21:21:18 +0200 Subject: bpf: Add btf_struct_ids_match function Adding btf_struct_ids_match function to check if given address provided by BTF object + offset is also address of another nested BTF object. This allows to pass an argument to helper, which is defined via parent BTF object + offset, like for bpf_d_path (added in following changes): SEC("fentry/filp_close") int BPF_PROG(prog_close, struct file *file, void *id) { ... ret = bpf_d_path(&file->f_path, ... The first bpf_d_path argument is hold by verifier as BTF file object plus offset of f_path member. The btf_struct_ids_match function will walk the struct file object and check if there's nested struct path object on the given offset. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200825192124.710397-9-jolsa@kernel.org --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8c443b93ac11..540f5e6c3788 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1358,6 +1358,8 @@ int btf_struct_access(struct bpf_verifier_log *log, const struct btf_type *t, int off, int size, enum bpf_access_type atype, u32 *next_btf_id); +bool btf_struct_ids_match(struct bpf_verifier_log *log, + int off, u32 id, u32 need_type_id); int btf_resolve_helper_id(struct bpf_verifier_log *log, const struct bpf_func_proto *fn, int); -- cgit v1.2.3 From eae2e83e62633a2659e3bc690facba1c2fc9c45b Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 25 Aug 2020 21:21:19 +0200 Subject: bpf: Add BTF_SET_START/END macros Adding support to define sorted set of BTF ID values. Following defines sorted set of BTF ID values: BTF_SET_START(btf_allowlist_d_path) BTF_ID(func, vfs_truncate) BTF_ID(func, vfs_fallocate) BTF_ID(func, dentry_open) BTF_ID(func, vfs_getattr) BTF_ID(func, filp_close) BTF_SET_END(btf_allowlist_d_path) It defines following 'struct btf_id_set' variable to access values and count: struct btf_id_set btf_allowlist_d_path; Adding 'allowed' callback to struct bpf_func_proto, to allow verifier the check on allowed callers. Adding btf_id_set_contains function, which will be used by allowed callbacks to verify the caller's BTF ID value is within allowed set. Also removing extra '\' in __BTF_ID_LIST macro. Added BTF_SET_START_GLOBAL macro for global sets. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200825192124.710397-10-jolsa@kernel.org --- include/linux/bpf.h | 4 ++++ include/linux/btf_ids.h | 51 ++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 54 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 540f5e6c3788..a6131d95e31e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -317,6 +317,7 @@ struct bpf_func_proto { * for this argument. */ int *ret_btf_id; /* return value btf_id */ + bool (*allowed)(const struct bpf_prog *prog); }; /* bpf_context is intentionally undefined structure. Pointer to bpf_context is @@ -1878,4 +1879,7 @@ enum bpf_text_poke_type { int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, void *addr1, void *addr2); +struct btf_id_set; +bool btf_id_set_contains(struct btf_id_set *set, u32 id); + #endif /* _LINUX_BPF_H */ diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index 4867d549e3c1..210b086188a3 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -3,6 +3,11 @@ #ifndef _LINUX_BTF_IDS_H #define _LINUX_BTF_IDS_H +struct btf_id_set { + u32 cnt; + u32 ids[]; +}; + #ifdef CONFIG_DEBUG_INFO_BTF #include /* for __PASTE */ @@ -62,7 +67,7 @@ asm( \ ".pushsection " BTF_IDS_SECTION ",\"a\"; \n" \ "." #scope " " #name "; \n" \ #name ":; \n" \ -".popsection; \n"); \ +".popsection; \n"); #define BTF_ID_LIST(name) \ __BTF_ID_LIST(name, local) \ @@ -88,12 +93,56 @@ asm( \ ".zero 4 \n" \ ".popsection; \n"); +/* + * The BTF_SET_START/END macros pair defines sorted list of + * BTF IDs plus its members count, with following layout: + * + * BTF_SET_START(list) + * BTF_ID(type1, name1) + * BTF_ID(type2, name2) + * BTF_SET_END(list) + * + * __BTF_ID__set__list: + * .zero 4 + * list: + * __BTF_ID__type1__name1__3: + * .zero 4 + * __BTF_ID__type2__name2__4: + * .zero 4 + * + */ +#define __BTF_SET_START(name, scope) \ +asm( \ +".pushsection " BTF_IDS_SECTION ",\"a\"; \n" \ +"." #scope " __BTF_ID__set__" #name "; \n" \ +"__BTF_ID__set__" #name ":; \n" \ +".zero 4 \n" \ +".popsection; \n"); + +#define BTF_SET_START(name) \ +__BTF_ID_LIST(name, local) \ +__BTF_SET_START(name, local) + +#define BTF_SET_START_GLOBAL(name) \ +__BTF_ID_LIST(name, globl) \ +__BTF_SET_START(name, globl) + +#define BTF_SET_END(name) \ +asm( \ +".pushsection " BTF_IDS_SECTION ",\"a\"; \n" \ +".size __BTF_ID__set__" #name ", .-" #name " \n" \ +".popsection; \n"); \ +extern struct btf_id_set name; + #else #define BTF_ID_LIST(name) static u32 name[5]; #define BTF_ID(prefix, name) #define BTF_ID_UNUSED #define BTF_ID_LIST_GLOBAL(name) u32 name[1]; +#define BTF_SET_START(name) static struct btf_id_set name = { 0 }; +#define BTF_SET_START_GLOBAL(name) static struct btf_id_set name = { 0 }; +#define BTF_SET_END(name) #endif /* CONFIG_DEBUG_INFO_BTF */ -- cgit v1.2.3 From 6e22ab9da79343532cd3cde39df25e5a5478c692 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 25 Aug 2020 21:21:20 +0200 Subject: bpf: Add d_path helper Adding d_path helper function that returns full path for given 'struct path' object, which needs to be the kernel BTF 'path' object. The path is returned in buffer provided 'buf' of size 'sz' and is zero terminated. bpf_d_path(&file->f_path, buf, size); The helper calls directly d_path function, so there's only limited set of function it can be called from. Adding just very modest set for the start. Updating also bpf.h tools uapi header and adding 'path' to bpf_helpers_doc.py script. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: KP Singh Link: https://lore.kernel.org/bpf/20200825192124.710397-11-jolsa@kernel.org --- include/uapi/linux/bpf.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0e1cdf806fe1..0388bc0200b0 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3513,6 +3513,7 @@ union bpf_attr { * * **-EPERM** This helper cannot be used under the * current sock_ops->op. + * * void *bpf_inode_storage_get(struct bpf_map *map, void *inode, void *value, u64 flags) * Description * Get a bpf_local_storage from an *inode*. @@ -3548,6 +3549,18 @@ union bpf_attr { * 0 on success. * * **-ENOENT** if the bpf_local_storage cannot be found. + * + * long bpf_d_path(struct path *path, char *buf, u32 sz) + * Description + * Return full path for given 'struct path' object, which + * needs to be the kernel BTF 'path' object. The path is + * returned in the provided buffer 'buf' of size 'sz' and + * is zero terminated. + * + * Return + * On success, the strictly positive length of the string, + * including the trailing NUL character. On error, a negative + * value. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3697,6 +3710,7 @@ union bpf_attr { FN(reserve_hdr_opt), \ FN(inode_storage_get), \ FN(inode_storage_delete), \ + FN(d_path), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 24da79902efc4a8443fae09e6c8e25b515bd8db2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 26 Aug 2020 06:50:16 -0700 Subject: inet: remove inet_sk_copy_descendant() This is no longer used, SCTP now uses a private helper. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/ipv6.h | 11 ----------- include/net/inet_sock.h | 7 ------- 2 files changed, 18 deletions(-) (limited to 'include') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index a44789d027cc..bac8f4fffbd6 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -345,17 +345,6 @@ static inline struct raw6_sock *raw6_sk(const struct sock *sk) return (struct raw6_sock *)sk; } -static inline void inet_sk_copy_descendant(struct sock *sk_to, - const struct sock *sk_from) -{ - int ancestor_size = sizeof(struct inet_sock); - - if (sk_from->sk_family == PF_INET6) - ancestor_size += sizeof(struct ipv6_pinfo); - - __inet_sk_copy_descendant(sk_to, sk_from, ancestor_size); -} - #define __ipv6_only_sock(sk) (sk->sk_ipv6only) #define ipv6_only_sock(sk) (__ipv6_only_sock(sk)) #define ipv6_sk_rxinfo(sk) ((sk)->sk_family == PF_INET6 && \ diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index a3702d1d4875..89163ef8cf4b 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -296,13 +296,6 @@ static inline void __inet_sk_copy_descendant(struct sock *sk_to, memcpy(inet_sk(sk_to) + 1, inet_sk(sk_from) + 1, sk_from->sk_prot->obj_size - ancestor_size); } -#if !(IS_ENABLED(CONFIG_IPV6)) -static inline void inet_sk_copy_descendant(struct sock *sk_to, - const struct sock *sk_from) -{ - __inet_sk_copy_descendant(sk_to, sk_from, sizeof(struct inet_sock)); -} -#endif int inet_sk_rebuild_header(struct sock *sk); -- cgit v1.2.3 From f468f21b7af0fa472ff8ff70f10b9b4995ef7eb3 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Wed, 26 Aug 2020 15:54:16 +0300 Subject: net: Take common prefetch code structure into a function Many device drivers use the same prefetch code structure to deal with small L1 cacheline size. Take this code into a function and call it from the drivers. Suggested-by: Jakub Kicinski Signed-off-by: Tariq Toukan Reviewed-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/netdevice.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b0e303f6603f..b8abe1d7aa0b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2193,6 +2193,22 @@ int netdev_get_num_tc(struct net_device *dev) return dev->num_tc; } +static inline void net_prefetch(void *p) +{ + prefetch(p); +#if L1_CACHE_BYTES < 128 + prefetch((u8 *)p + L1_CACHE_BYTES); +#endif +} + +static inline void net_prefetchw(void *p) +{ + prefetchw(p); +#if L1_CACHE_BYTES < 128 + prefetchw((u8 *)p + L1_CACHE_BYTES); +#endif +} + void netdev_unbind_sb_channel(struct net_device *dev, struct net_device *sb_dev); int netdev_bind_sb_channel_queue(struct net_device *dev, -- cgit v1.2.3 From 493a0ebd804c986e6bd207603c5e1ca748470d3d Mon Sep 17 00:00:00 2001 From: James Prestwood Date: Mon, 13 Apr 2020 09:20:53 -0700 Subject: nl80211: fix PORT_AUTHORIZED wording to reflect behavior The CMD_PORT_AUTHORIZED event was described as an event which indicated a successfully completed 4-way handshake. But the behavior was not as advertized. The only driver which uses this is brcmfmac, and this driver only sends the event after a successful 802.1X-FT roam. This prevents userspace applications from knowing if the 4-way completed on: 1. Normal 802.1X connects 2. Normal PSK connections 3. FT-PSK roams wpa_supplicant handles this incorrect behavior by just completing the connection after association, before the 4-way has completed. If the 4-way ends up failing it disconnects at that point. Since this behavior appears to be expected (wpa_s handles it this way) I have changed the wording in the API description to reflect the actual behavior. Signed-off-by: James Prestwood Link: https://lore.kernel.org/r/20200413162053.3711-1-prestwoj@gmail.com [fix spelling of 802.1X throughout ...] Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 631f3a997b3c..8cc2b825e4e4 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -647,13 +647,9 @@ * authentication/association or not receiving a response from the AP. * Non-zero %NL80211_ATTR_STATUS_CODE value is indicated in that case as * well to remain backwards compatible. - * When establishing a security association, drivers that support 4 way - * handshake offload should send %NL80211_CMD_PORT_AUTHORIZED event when - * the 4 way handshake is completed successfully. * @NL80211_CMD_ROAM: Notification indicating the card/driver roamed by itself. - * When a security association was established with the new AP (e.g. if - * the FT protocol was used for roaming or the driver completed the 4 way - * handshake), this event should be followed by an + * When a security association was established on an 802.1X network using + * fast transition, this event should be followed by an * %NL80211_CMD_PORT_AUTHORIZED event. * @NL80211_CMD_DISCONNECT: drop a given connection; also used to notify * userspace that a connection was dropped by the AP or due to other @@ -1067,13 +1063,11 @@ * @NL80211_CMD_DEL_PMK: For offloaded 4-Way handshake, delete the previously * configured PMK for the authenticator address identified by * %NL80211_ATTR_MAC. - * @NL80211_CMD_PORT_AUTHORIZED: An event that indicates that the 4 way - * handshake was completed successfully by the driver. The BSSID is - * specified with %NL80211_ATTR_MAC. Drivers that support 4 way handshake - * offload should send this event after indicating 802.11 association with - * %NL80211_CMD_CONNECT or %NL80211_CMD_ROAM. If the 4 way handshake failed - * %NL80211_CMD_DISCONNECT should be indicated instead. - * + * @NL80211_CMD_PORT_AUTHORIZED: An event that indicates an 802.1X FT roam was + * completed successfully. Drivers that support 4 way handshake offload + * should send this event after indicating 802.1X FT assocation with + * %NL80211_CMD_ROAM. If the 4 way handshake failed %NL80211_CMD_DISCONNECT + * should be indicated instead. * @NL80211_CMD_CONTROL_PORT_FRAME: Control Port (e.g. PAE) frame TX request * and RX notification. This command is used both as a request to transmit * a control port frame and as a notification that a control port frame -- cgit v1.2.3 From eb89a6a6b7a1af2d9c8d83ee44fa67700d6337e7 Mon Sep 17 00:00:00 2001 From: Miles Hu Date: Tue, 4 Aug 2020 10:16:29 +0200 Subject: nl80211: add support for setting fixed HE rate/gi/ltf This patch adds the nl80211 structs, definitions, policies and parsing code required to pass fixed HE rate, GI and LTF settings. Signed-off-by: Miles Hu Signed-off-by: John Crispin Link: https://lore.kernel.org/r/20200804081630.2013619-1-john@phrozen.org [fix comment] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 3 +++ include/uapi/linux/nl80211.h | 28 ++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index d9e6b9fbd95b..c9bce9bba511 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -678,7 +678,10 @@ struct cfg80211_bitrate_mask { u32 legacy; u8 ht_mcs[IEEE80211_HT_MCS_MASK_LEN]; u16 vht_mcs[NL80211_VHT_NSS_MAX]; + u16 he_mcs[NL80211_HE_NSS_MAX]; enum nl80211_txrate_gi gi; + enum nl80211_he_gi he_gi; + enum nl80211_he_ltf he_ltf; } control[NUM_NL80211_BANDS]; }; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 8cc2b825e4e4..1a4b922f489f 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3180,6 +3180,18 @@ enum nl80211_he_gi { NL80211_RATE_INFO_HE_GI_3_2, }; +/** + * enum nl80211_he_ltf - HE long training field + * @NL80211_RATE_INFO_HE_1xLTF: 3.2 usec + * @NL80211_RATE_INFO_HE_2xLTF: 6.4 usec + * @NL80211_RATE_INFO_HE_4xLTF: 12.8 usec + */ +enum nl80211_he_ltf { + NL80211_RATE_INFO_HE_1XLTF, + NL80211_RATE_INFO_HE_2XLTF, + NL80211_RATE_INFO_HE_4XLTF, +}; + /** * enum nl80211_he_ru_alloc - HE RU allocation values * @NL80211_RATE_INFO_HE_RU_ALLOC_26: 26-tone RU allocation @@ -4735,6 +4747,10 @@ enum nl80211_key_attributes { * @NL80211_TXRATE_VHT: VHT rates allowed for TX rate selection, * see &struct nl80211_txrate_vht * @NL80211_TXRATE_GI: configure GI, see &enum nl80211_txrate_gi + * @NL80211_TXRATE_HE: HE rates allowed for TX rate selection, + * see &struct nl80211_txrate_he + * @NL80211_TXRATE_HE_GI: configure HE GI, 0.8us, 1.6us and 3.2us. + * @NL80211_TXRATE_HE_LTF: configure HE LTF, 1XLTF, 2XLTF and 4XLTF. * @__NL80211_TXRATE_AFTER_LAST: internal * @NL80211_TXRATE_MAX: highest TX rate attribute */ @@ -4744,6 +4760,9 @@ enum nl80211_tx_rate_attributes { NL80211_TXRATE_HT, NL80211_TXRATE_VHT, NL80211_TXRATE_GI, + NL80211_TXRATE_HE, + NL80211_TXRATE_HE_GI, + NL80211_TXRATE_HE_LTF, /* keep last */ __NL80211_TXRATE_AFTER_LAST, @@ -4761,6 +4780,15 @@ struct nl80211_txrate_vht { __u16 mcs[NL80211_VHT_NSS_MAX]; }; +#define NL80211_HE_NSS_MAX 8 +/** + * struct nl80211_txrate_he - HE MCS/NSS txrate bitmap + * @mcs: MCS bitmap table for each NSS (array index 0 for 1 stream, etc.) + */ +struct nl80211_txrate_he { + __u16 mcs[NL80211_HE_NSS_MAX]; +}; + enum nl80211_txrate_gi { NL80211_TXRATE_DEFAULT_GI, NL80211_TXRATE_FORCE_SGI, -- cgit v1.2.3 From 00c207edfb2bff9cf03a8f21e57c9c752a1d9f16 Mon Sep 17 00:00:00 2001 From: John Crispin Date: Tue, 11 Aug 2020 10:01:03 +0200 Subject: nl80211: rename csa counter attributes countdown counters We want to reuse the attributes for other counters such as BSS color change. Rename them to more generic names. Signed-off-by: John Crispin Link: https://lore.kernel.org/r/20200811080107.3615705-1-john@phrozen.org Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 1a4b922f489f..ec96d5fe0e05 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2076,10 +2076,10 @@ enum nl80211_commands { * operation). * @NL80211_ATTR_CSA_IES: Nested set of attributes containing the IE information * for the time while performing a channel switch. - * @NL80211_ATTR_CSA_C_OFF_BEACON: An array of offsets (u16) to the channel - * switch counters in the beacons tail (%NL80211_ATTR_BEACON_TAIL). - * @NL80211_ATTR_CSA_C_OFF_PRESP: An array of offsets (u16) to the channel - * switch counters in the probe response (%NL80211_ATTR_PROBE_RESP). + * @NL80211_ATTR_CNTDWN_OFFS_BEACON: An array of offsets (u16) to the channel + * switch or color change counters in the beacons tail (%NL80211_ATTR_BEACON_TAIL). + * @NL80211_ATTR_CNTDWN_OFFS_PRESP: An array of offsets (u16) to the channel + * switch or color change counters in the probe response (%NL80211_ATTR_PROBE_RESP). * * @NL80211_ATTR_RXMGMT_FLAGS: flags for nl80211_send_mgmt(), u32. * As specified in the &enum nl80211_rxmgmt_flags. @@ -2815,8 +2815,8 @@ enum nl80211_attrs { NL80211_ATTR_CH_SWITCH_COUNT, NL80211_ATTR_CH_SWITCH_BLOCK_TX, NL80211_ATTR_CSA_IES, - NL80211_ATTR_CSA_C_OFF_BEACON, - NL80211_ATTR_CSA_C_OFF_PRESP, + NL80211_ATTR_CNTDWN_OFFS_BEACON, + NL80211_ATTR_CNTDWN_OFFS_PRESP, NL80211_ATTR_RXMGMT_FLAGS, @@ -3003,6 +3003,8 @@ enum nl80211_attrs { #define NL80211_ATTR_MESH_PARAMS NL80211_ATTR_MESH_CONFIG #define NL80211_ATTR_IFACE_SOCKET_OWNER NL80211_ATTR_SOCKET_OWNER #define NL80211_ATTR_SAE_DATA NL80211_ATTR_AUTH_DATA +#define NL80211_ATTR_CSA_C_OFF_BEACON NL80211_ATTR_CNTDWN_OFFS_BEACON +#define NL80211_ATTR_CSA_C_OFF_PRESP NL80211_ATTR_CNTDWN_OFFS_PRESP /* * Allow user space programs to use #ifdef on new attributes by defining them -- cgit v1.2.3 From 8552a434b6a05cc38006733afe6a239ad4d600a2 Mon Sep 17 00:00:00 2001 From: John Crispin Date: Tue, 11 Aug 2020 10:01:04 +0200 Subject: mac80211: rename csa counters to countdown counters We want to reuse the functions and structs for other counters such as BSS color change. Rename them to more generic names. Signed-off-by: John Crispin Link: https://lore.kernel.org/r/20200811080107.3615705-2-john@phrozen.org Signed-off-by: Johannes Berg --- include/net/mac80211.h | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 66e2bfd165e8..ec148b3e9c41 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -3736,7 +3736,7 @@ enum ieee80211_reconfig_type { * decremented, and when they reach 1 the driver must call * ieee80211_csa_finish(). Drivers which use ieee80211_beacon_get() * get the csa counter decremented by mac80211, but must check if it is - * 1 using ieee80211_csa_is_complete() after the beacon has been + * 1 using ieee80211_beacon_counter_is_complete() after the beacon has been * transmitted and then call ieee80211_csa_finish(). * If the CSA count starts as zero or 1, this function will not be called, * since there won't be any time to beacon before the switch anyway. @@ -4763,21 +4763,21 @@ void ieee80211_tx_status_8023(struct ieee80211_hw *hw, */ void ieee80211_report_low_ack(struct ieee80211_sta *sta, u32 num_packets); -#define IEEE80211_MAX_CSA_COUNTERS_NUM 2 +#define IEEE80211_MAX_CNTDWN_COUNTERS_NUM 2 /** * struct ieee80211_mutable_offsets - mutable beacon offsets * @tim_offset: position of TIM element * @tim_length: size of TIM element - * @csa_counter_offs: array of IEEE80211_MAX_CSA_COUNTERS_NUM offsets - * to CSA counters. This array can contain zero values which + * @cntdwn_counter_offs: array of IEEE80211_MAX_CNTDWN_COUNTERS_NUM offsets + * to countdown counters. This array can contain zero values which * should be ignored. */ struct ieee80211_mutable_offsets { u16 tim_offset; u16 tim_length; - u16 csa_counter_offs[IEEE80211_MAX_CSA_COUNTERS_NUM]; + u16 cntdwn_counter_offs[IEEE80211_MAX_CNTDWN_COUNTERS_NUM]; }; /** @@ -4846,31 +4846,31 @@ static inline struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw, } /** - * ieee80211_csa_update_counter - request mac80211 to decrement the csa counter + * ieee80211_beacon_update_cntdwn - request mac80211 to decrement the beacon countdown * @vif: &struct ieee80211_vif pointer from the add_interface callback. * - * The csa counter should be updated after each beacon transmission. + * The beacon counter should be updated after each beacon transmission. * This function is called implicitly when * ieee80211_beacon_get/ieee80211_beacon_get_tim are called, however if the * beacon frames are generated by the device, the driver should call this - * function after each beacon transmission to sync mac80211's csa counters. + * function after each beacon transmission to sync mac80211's beacon countdown. * - * Return: new csa counter value + * Return: new countdown value */ -u8 ieee80211_csa_update_counter(struct ieee80211_vif *vif); +u8 ieee80211_beacon_update_cntdwn(struct ieee80211_vif *vif); /** - * ieee80211_csa_set_counter - request mac80211 to set csa counter + * ieee80211_beacon_set_cntdwn - request mac80211 to set beacon countdown * @vif: &struct ieee80211_vif pointer from the add_interface callback. * @counter: the new value for the counter * - * The csa counter can be changed by the device, this API should be + * The beacon countdown can be changed by the device, this API should be * used by the device driver to update csa counter in mac80211. * - * It should never be used together with ieee80211_csa_update_counter(), + * It should never be used together with ieee80211_beacon_update_cntdwn(), * as it will cause a race condition around the counter value. */ -void ieee80211_csa_set_counter(struct ieee80211_vif *vif, u8 counter); +void ieee80211_beacon_set_cntdwn(struct ieee80211_vif *vif, u8 counter); /** * ieee80211_csa_finish - notify mac80211 about channel switch @@ -4883,13 +4883,12 @@ void ieee80211_csa_set_counter(struct ieee80211_vif *vif, u8 counter); void ieee80211_csa_finish(struct ieee80211_vif *vif); /** - * ieee80211_csa_is_complete - find out if counters reached 1 + * ieee80211_beacon_cntdwn_is_complete - find out if countdown reached 1 * @vif: &struct ieee80211_vif pointer from the add_interface callback. * - * This function returns whether the channel switch counters reached zero. + * This function returns whether the countdown reached zero. */ -bool ieee80211_csa_is_complete(struct ieee80211_vif *vif); - +bool ieee80211_beacon_cntdwn_is_complete(struct ieee80211_vif *vif); /** * ieee80211_proberesp_get - retrieve a Probe Response template -- cgit v1.2.3 From 2831a631022eed6e3f800f08892132c6edde652c Mon Sep 17 00:00:00 2001 From: Chung-Hsien Hsu Date: Mon, 17 Aug 2020 02:33:15 -0500 Subject: nl80211: support SAE authentication offload in AP mode Let drivers advertise support for AP-mode SAE authentication offload with a new NL80211_EXT_FEATURE_SAE_OFFLOAD_AP flag. Signed-off-by: Chung-Hsien Hsu Signed-off-by: Chi-Hsien Lin Link: https://lore.kernel.org/r/20200817073316.33402-4-stanley.hsu@cypress.com Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index ec96d5fe0e05..0584e0d349f0 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -252,9 +252,13 @@ * DOC: SAE authentication offload * * By setting @NL80211_EXT_FEATURE_SAE_OFFLOAD flag drivers can indicate they - * support offloading SAE authentication for WPA3-Personal networks. In - * %NL80211_CMD_CONNECT the password for SAE should be specified using - * %NL80211_ATTR_SAE_PASSWORD. + * support offloading SAE authentication for WPA3-Personal networks in station + * mode. Similarly @NL80211_EXT_FEATURE_SAE_OFFLOAD_AP flag can be set by + * drivers indicating the offload support in AP mode. + * + * The password for SAE should be specified using %NL80211_ATTR_SAE_PASSWORD in + * %NL80211_CMD_CONNECT and %NL80211_CMD_START_AP for station and AP mode + * respectively. */ /** @@ -5845,6 +5849,9 @@ enum nl80211_feature_flags { * handshake with PSK in AP mode (PSK is passed as part of the start AP * command). * + * @NL80211_EXT_FEATURE_SAE_OFFLOAD_AP: Device wants to do SAE authentication + * in AP mode (SAE password is passed as part of the start AP command). + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -5902,6 +5909,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211_TX_STATUS, NL80211_EXT_FEATURE_OPERATING_CHANNEL_VALIDATION, NL80211_EXT_FEATURE_4WAY_HANDSHAKE_AP_PSK, + NL80211_EXT_FEATURE_SAE_OFFLOAD_AP, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, -- cgit v1.2.3 From 2fa4e4b799e191530edbae4b96b85d4486e55053 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 27 Aug 2020 04:00:28 +0200 Subject: net: pcs: Move XPCS into new PCS subdirectory Create drivers/net/pcs and move the Synopsys DesignWare XPCS into the new directory. Move the header file into a subdirectory include/linux/pcs Start a naming convention of all PCS files use the prefix pcs-, and rename the XPCS files to fit. v2: Add include/linux/pcs v4: Fix include path in stmmac. Remove PCS_DEVICES to avoid new prompts Cc: Jose Abreu Reviewed-by: Florian Fainelli Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/mdio-xpcs.h | 41 ----------------------------------------- include/linux/pcs/pcs-xpcs.h | 41 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 41 deletions(-) delete mode 100644 include/linux/mdio-xpcs.h create mode 100644 include/linux/pcs/pcs-xpcs.h (limited to 'include') diff --git a/include/linux/mdio-xpcs.h b/include/linux/mdio-xpcs.h deleted file mode 100644 index 9a841aa5982d..000000000000 --- a/include/linux/mdio-xpcs.h +++ /dev/null @@ -1,41 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (c) 2020 Synopsys, Inc. and/or its affiliates. - * Synopsys DesignWare XPCS helpers - */ - -#ifndef __LINUX_MDIO_XPCS_H -#define __LINUX_MDIO_XPCS_H - -#include -#include - -struct mdio_xpcs_args { - __ETHTOOL_DECLARE_LINK_MODE_MASK(supported); - struct mii_bus *bus; - int addr; -}; - -struct mdio_xpcs_ops { - int (*validate)(struct mdio_xpcs_args *xpcs, - unsigned long *supported, - struct phylink_link_state *state); - int (*config)(struct mdio_xpcs_args *xpcs, - const struct phylink_link_state *state); - int (*get_state)(struct mdio_xpcs_args *xpcs, - struct phylink_link_state *state); - int (*link_up)(struct mdio_xpcs_args *xpcs, int speed, - phy_interface_t interface); - int (*probe)(struct mdio_xpcs_args *xpcs, phy_interface_t interface); -}; - -#if IS_ENABLED(CONFIG_MDIO_XPCS) -struct mdio_xpcs_ops *mdio_xpcs_get_ops(void); -#else -static inline struct mdio_xpcs_ops *mdio_xpcs_get_ops(void) -{ - return NULL; -} -#endif - -#endif /* __LINUX_MDIO_XPCS_H */ diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h new file mode 100644 index 000000000000..351c1c9aedc5 --- /dev/null +++ b/include/linux/pcs/pcs-xpcs.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2020 Synopsys, Inc. and/or its affiliates. + * Synopsys DesignWare XPCS helpers + */ + +#ifndef __LINUX_PCS_XPCS_H +#define __LINUX_PCS_XPCS_H + +#include +#include + +struct mdio_xpcs_args { + __ETHTOOL_DECLARE_LINK_MODE_MASK(supported); + struct mii_bus *bus; + int addr; +}; + +struct mdio_xpcs_ops { + int (*validate)(struct mdio_xpcs_args *xpcs, + unsigned long *supported, + struct phylink_link_state *state); + int (*config)(struct mdio_xpcs_args *xpcs, + const struct phylink_link_state *state); + int (*get_state)(struct mdio_xpcs_args *xpcs, + struct phylink_link_state *state); + int (*link_up)(struct mdio_xpcs_args *xpcs, int speed, + phy_interface_t interface); + int (*probe)(struct mdio_xpcs_args *xpcs, phy_interface_t interface); +}; + +#if IS_ENABLED(CONFIG_PCS_XPCS) +struct mdio_xpcs_ops *mdio_xpcs_get_ops(void); +#else +static inline struct mdio_xpcs_ops *mdio_xpcs_get_ops(void) +{ + return NULL; +} +#endif + +#endif /* __LINUX_PCS_XPCS_H */ -- cgit v1.2.3 From fcba68bd75bb1d42b3aec7f471d382a9e639a672 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 27 Aug 2020 04:00:29 +0200 Subject: net/phy/mdio-i2c: Move header file to include/linux/mdio In preparation for moving all MDIO drivers into drivers/net/mdio, move the mdio-i2c header file into include/linux/mdio so it can be used by both the MDIO driver and the SFP code which instantiates I2C MDIO busses. v2: Add include/linux/mdio Reviewed-by: Florian Fainelli Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/mdio/mdio-i2c.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 include/linux/mdio/mdio-i2c.h (limited to 'include') diff --git a/include/linux/mdio/mdio-i2c.h b/include/linux/mdio/mdio-i2c.h new file mode 100644 index 000000000000..b1d27f7cd23f --- /dev/null +++ b/include/linux/mdio/mdio-i2c.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * MDIO I2C bridge + * + * Copyright (C) 2015 Russell King + */ +#ifndef MDIO_I2C_H +#define MDIO_I2C_H + +struct device; +struct i2c_adapter; +struct mii_bus; + +struct mii_bus *mdio_i2c_alloc(struct device *parent, struct i2c_adapter *i2c); + +#endif -- cgit v1.2.3 From 232e15e1d7ddb191c28248cb681f4544c0ff1c54 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 27 Aug 2020 04:00:30 +0200 Subject: net: xgene: Move shared header file into include/linux This header file is currently included into the ethernet driver via a relative path into the PHY subsystem. This is bad practice, and causes issues for the upcoming move of the MDIO driver. Move the header file into include/linux to clean this up. v2: Move header to include/linux/mdio Reviewed-by: Florian Fainelli Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/mdio/mdio-xgene.h | 130 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 130 insertions(+) create mode 100644 include/linux/mdio/mdio-xgene.h (limited to 'include') diff --git a/include/linux/mdio/mdio-xgene.h b/include/linux/mdio/mdio-xgene.h new file mode 100644 index 000000000000..8af93ada8b64 --- /dev/null +++ b/include/linux/mdio/mdio-xgene.h @@ -0,0 +1,130 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* Applied Micro X-Gene SoC MDIO Driver + * + * Copyright (c) 2016, Applied Micro Circuits Corporation + * Author: Iyappan Subramanian + */ + +#ifndef __MDIO_XGENE_H__ +#define __MDIO_XGENE_H__ + +#define BLOCK_XG_MDIO_CSR_OFFSET 0x5000 +#define BLOCK_DIAG_CSR_OFFSET 0xd000 +#define XGENET_CONFIG_REG_ADDR 0x20 + +#define MAC_ADDR_REG_OFFSET 0x00 +#define MAC_COMMAND_REG_OFFSET 0x04 +#define MAC_WRITE_REG_OFFSET 0x08 +#define MAC_READ_REG_OFFSET 0x0c +#define MAC_COMMAND_DONE_REG_OFFSET 0x10 + +#define CLKEN_OFFSET 0x08 +#define SRST_OFFSET 0x00 + +#define MENET_CFG_MEM_RAM_SHUTDOWN_ADDR 0x70 +#define MENET_BLOCK_MEM_RDY_ADDR 0x74 + +#define MAC_CONFIG_1_ADDR 0x00 +#define MII_MGMT_COMMAND_ADDR 0x24 +#define MII_MGMT_ADDRESS_ADDR 0x28 +#define MII_MGMT_CONTROL_ADDR 0x2c +#define MII_MGMT_STATUS_ADDR 0x30 +#define MII_MGMT_INDICATORS_ADDR 0x34 +#define SOFT_RESET BIT(31) + +#define MII_MGMT_CONFIG_ADDR 0x20 +#define MII_MGMT_COMMAND_ADDR 0x24 +#define MII_MGMT_ADDRESS_ADDR 0x28 +#define MII_MGMT_CONTROL_ADDR 0x2c +#define MII_MGMT_STATUS_ADDR 0x30 +#define MII_MGMT_INDICATORS_ADDR 0x34 + +#define MIIM_COMMAND_ADDR 0x20 +#define MIIM_FIELD_ADDR 0x24 +#define MIIM_CONFIGURATION_ADDR 0x28 +#define MIIM_LINKFAILVECTOR_ADDR 0x2c +#define MIIM_INDICATOR_ADDR 0x30 +#define MIIMRD_FIELD_ADDR 0x34 + +#define MDIO_CSR_OFFSET 0x5000 + +#define REG_ADDR_POS 0 +#define REG_ADDR_LEN 5 +#define PHY_ADDR_POS 8 +#define PHY_ADDR_LEN 5 + +#define HSTMIIMWRDAT_POS 0 +#define HSTMIIMWRDAT_LEN 16 +#define HSTPHYADX_POS 23 +#define HSTPHYADX_LEN 5 +#define HSTREGADX_POS 18 +#define HSTREGADX_LEN 5 +#define HSTLDCMD BIT(3) +#define HSTMIIMCMD_POS 0 +#define HSTMIIMCMD_LEN 3 + +#define BUSY_MASK BIT(0) +#define READ_CYCLE_MASK BIT(0) + +enum xgene_enet_cmd { + XGENE_ENET_WR_CMD = BIT(31), + XGENE_ENET_RD_CMD = BIT(30) +}; + +enum { + MIIM_CMD_IDLE, + MIIM_CMD_LEGACY_WRITE, + MIIM_CMD_LEGACY_READ, +}; + +enum xgene_mdio_id { + XGENE_MDIO_RGMII = 1, + XGENE_MDIO_XFI +}; + +struct xgene_mdio_pdata { + struct clk *clk; + struct device *dev; + void __iomem *mac_csr_addr; + void __iomem *diag_csr_addr; + void __iomem *mdio_csr_addr; + struct mii_bus *mdio_bus; + int mdio_id; + spinlock_t mac_lock; /* mac lock */ +}; + +/* Set the specified value into a bit-field defined by its starting position + * and length within a single u64. + */ +static inline u64 xgene_enet_set_field_value(int pos, int len, u64 val) +{ + return (val & ((1ULL << len) - 1)) << pos; +} + +#define SET_VAL(field, val) \ + xgene_enet_set_field_value(field ## _POS, field ## _LEN, val) + +#define SET_BIT(field) \ + xgene_enet_set_field_value(field ## _POS, 1, 1) + +/* Get the value from a bit-field defined by its starting position + * and length within the specified u64. + */ +static inline u64 xgene_enet_get_field_value(int pos, int len, u64 src) +{ + return (src >> pos) & ((1ULL << len) - 1); +} + +#define GET_VAL(field, src) \ + xgene_enet_get_field_value(field ## _POS, field ## _LEN, src) + +#define GET_BIT(field, src) \ + xgene_enet_get_field_value(field ## _POS, 1, src) + +u32 xgene_mdio_rd_mac(struct xgene_mdio_pdata *pdata, u32 rd_addr); +void xgene_mdio_wr_mac(struct xgene_mdio_pdata *pdata, u32 wr_addr, u32 data); +int xgene_mdio_rgmii_read(struct mii_bus *bus, int phy_id, int reg); +int xgene_mdio_rgmii_write(struct mii_bus *bus, int phy_id, int reg, u16 data); +struct phy_device *xgene_enet_phy_register(struct mii_bus *bus, int phy_addr); + +#endif /* __MDIO_XGENE_H__ */ -- cgit v1.2.3 From 50aba46c234ea6ab3134cebb5ab27885f33a3e5d Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 27 Aug 2020 14:19:23 +0200 Subject: gtp: add notification mechanism Like all other network functions, let's notify gtp context on creation and deletion. Signed-off-by: Nicolas Dichtel Tested-by: Gabriel Ganne Acked-by: Harald Welte Signed-off-by: David S. Miller --- include/uapi/linux/gtp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/gtp.h b/include/uapi/linux/gtp.h index c7d66755d212..79f9191bbb24 100644 --- a/include/uapi/linux/gtp.h +++ b/include/uapi/linux/gtp.h @@ -2,6 +2,8 @@ #ifndef _UAPI_LINUX_GTP_H_ #define _UAPI_LINUX_GTP_H_ +#define GTP_GENL_MCGRP_NAME "gtp" + enum gtp_genl_cmds { GTP_CMD_NEWPDP, GTP_CMD_DELPDP, -- cgit v1.2.3 From b0c9eb37817943840a1a82dbc998c491609a0afd Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 27 Aug 2020 22:19:22 -0700 Subject: bpf: Make bpf_link_info.iter similar to bpf_iter_link_info bpf_link_info.iter is used by link_query to return bpf_iter_link_info to user space. Fields may be different, e.g., map_fd vs. map_id, so we cannot reuse the exact structure. But make them similar, e.g., struct bpf_link_info { /* common fields */ union { struct { ... } raw_tracepoint; struct { ... } tracing; ... struct { /* common fields for iter */ union { struct { __u32 map_id; } map; /* other structs for other targets */ }; }; }; }; so the structure is extensible the same way as bpf_iter_link_info. Fixes: 6b0a249a301e ("bpf: Implement link_query for bpf iterators") Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200828051922.758950-1-yhs@fb.com --- include/uapi/linux/bpf.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0388bc0200b0..ef7af384f5ee 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4251,8 +4251,10 @@ struct bpf_link_info { __aligned_u64 target_name; /* in/out: target_name buffer ptr */ __u32 target_name_len; /* in/out: target_name buffer len */ union { - __u32 map_id; - } map; + struct { + __u32 map_id; + } map; + }; } iter; struct { __u32 netns_ino; -- cgit v1.2.3 From f4d05259213ff1e91f767c91dcab455f68308fac Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 27 Aug 2020 18:18:06 -0700 Subject: bpf: Add map_meta_equal map ops Some properties of the inner map is used in the verification time. When an inner map is inserted to an outer map at runtime, bpf_map_meta_equal() is currently used to ensure those properties of the inserting inner map stays the same as the verification time. In particular, the current bpf_map_meta_equal() checks max_entries which turns out to be too restrictive for most of the maps which do not use max_entries during the verification time. It limits the use case that wants to replace a smaller inner map with a larger inner map. There are some maps do use max_entries during verification though. For example, the map_gen_lookup in array_map_ops uses the max_entries to generate the inline lookup code. To accommodate differences between maps, the map_meta_equal is added to bpf_map_ops. Each map-type can decide what to check when its map is used as an inner map during runtime. Also, some map types cannot be used as an inner map and they are currently black listed in bpf_map_meta_alloc() in map_in_map.c. It is not unusual that the new map types may not aware that such blacklist exists. This patch enforces an explicit opt-in and only allows a map to be used as an inner map if it has implemented the map_meta_equal ops. It is based on the discussion in [1]. All maps that support inner map has its map_meta_equal points to bpf_map_meta_equal in this patch. A later patch will relax the max_entries check for most maps. bpf_types.h counts 28 map types. This patch adds 23 ".map_meta_equal" by using coccinelle. -5 for BPF_MAP_TYPE_PROG_ARRAY BPF_MAP_TYPE_(PERCPU)_CGROUP_STORAGE BPF_MAP_TYPE_STRUCT_OPS BPF_MAP_TYPE_ARRAY_OF_MAPS BPF_MAP_TYPE_HASH_OF_MAPS The "if (inner_map->inner_map_meta)" check in bpf_map_meta_alloc() is moved such that the same error is returned. [1]: https://lore.kernel.org/bpf/20200522022342.899756-1-kafai@fb.com/ Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200828011806.1970400-1-kafai@fb.com --- include/linux/bpf.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a6131d95e31e..dbba82a80087 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -112,6 +112,19 @@ struct bpf_map_ops { void (*map_local_storage_uncharge)(struct bpf_local_storage_map *smap, void *owner, u32 size); struct bpf_local_storage __rcu ** (*map_owner_storage_ptr)(void *owner); + + /* map_meta_equal must be implemented for maps that can be + * used as an inner map. It is a runtime check to ensure + * an inner map can be inserted to an outer map. + * + * Some properties of the inner map has been used during the + * verification time. When inserting an inner map at the runtime, + * map_meta_equal has to ensure the inserting map has the same + * properties that the verifier has used earlier. + */ + bool (*map_meta_equal)(const struct bpf_map *meta0, + const struct bpf_map *meta1); + /* BTF name and id of struct allocated by map_alloc */ const char * const map_btf_name; int *map_btf_id; @@ -235,6 +248,9 @@ int map_check_no_btf(const struct bpf_map *map, const struct btf_type *key_type, const struct btf_type *value_type); +bool bpf_map_meta_equal(const struct bpf_map *meta0, + const struct bpf_map *meta1); + extern const struct bpf_map_ops bpf_map_offload_ops; /* function argument constraints */ -- cgit v1.2.3 From 316cdaa1158af17250397054f92bb339fbd8e282 Mon Sep 17 00:00:00 2001 From: Mahesh Bandewar Date: Wed, 26 Aug 2020 09:05:35 -0700 Subject: net: add option to not create fall-back tunnels in root-ns as well The sysctl that was added earlier by commit 79134e6ce2c ("net: do not create fallback tunnels for non-default namespaces") to create fall-back only in root-ns. This patch enhances that behavior to provide option not to create fallback tunnels in root-ns as well. Since modules that create fallback tunnels could be built-in and setting the sysctl value after booting is pointless, so added a kernel cmdline options to change this default. The default setting is preseved for backward compatibility. The kernel command line option of fb_tunnels=initns will set the sysctl value to 1 and will create fallback tunnels only in initns while kernel cmdline fb_tunnels=none will set the sysctl value to 2 and fallback tunnels are skipped in every netns. Signed-off-by: Mahesh Bandewar Cc: Eric Dumazet Cc: Maciej Zenczykowski Cc: Jian Yang Cc: Randy Dunlap Signed-off-by: David S. Miller --- include/linux/netdevice.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b8abe1d7aa0b..c0b512e6a02b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -640,10 +640,14 @@ struct netdev_queue { extern int sysctl_fb_tunnels_only_for_init_net; extern int sysctl_devconf_inherit_init_net; +/* + * sysctl_fb_tunnels_only_for_init_net == 0 : For all netns + * == 1 : For initns only + * == 2 : For none. + */ static inline bool net_has_fallback_tunnels(const struct net *net) { - return net == &init_net || - !IS_ENABLED(CONFIG_SYSCTL) || + return (net == &init_net && sysctl_fb_tunnels_only_for_init_net == 1) || !sysctl_fb_tunnels_only_for_init_net; } -- cgit v1.2.3 From 7a81575b806e5dab214025e6757362c62d946405 Mon Sep 17 00:00:00 2001 From: "Jose M. Guisado Gomez" Date: Thu, 20 Aug 2020 10:19:01 +0200 Subject: netfilter: nf_tables: add userdata attributes to nft_table Enables storing userdata for nft_table. Field udata points to user data and udlen store its length. Adds new attribute flag NFTA_TABLE_USERDATA Signed-off-by: Jose M. Guisado Gomez Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 ++ include/uapi/linux/netfilter/nf_tables.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index bf9491b77d16..97a7e147a59a 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1080,6 +1080,8 @@ struct nft_table { flags:8, genmask:2; char *name; + u16 udlen; + u8 *udata; }; void nft_register_chain_type(const struct nft_chain_type *); diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 42f351c1f5c5..aeb88cbd303e 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -172,6 +172,7 @@ enum nft_table_flags { * @NFTA_TABLE_NAME: name of the table (NLA_STRING) * @NFTA_TABLE_FLAGS: bitmask of enum nft_table_flags (NLA_U32) * @NFTA_TABLE_USE: number of chains in this table (NLA_U32) + * @NFTA_TABLE_USERDATA: user data (NLA_BINARY) */ enum nft_table_attributes { NFTA_TABLE_UNSPEC, @@ -180,6 +181,7 @@ enum nft_table_attributes { NFTA_TABLE_USE, NFTA_TABLE_HANDLE, NFTA_TABLE_PAD, + NFTA_TABLE_USERDATA, __NFTA_TABLE_MAX }; #define NFTA_TABLE_MAX (__NFTA_TABLE_MAX - 1) -- cgit v1.2.3 From 4afc41dfa5a716e9e7a90c22972583f337c0bcbf Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 26 Aug 2020 00:52:43 +0200 Subject: netfilter: conntrack: remove ignore stats This counter increments when nf_conntrack_in sees a packet that already has a conntrack attached or when the packet is marked as UNTRACKED. Neither is an error. The former is normal for loopback traffic. The second happens for certain ICMPv6 packets or when nftables/ip(6)tables rules are in place. In case someone needs to count UNTRACKED packets, or packets that are marked as untracked before conntrack_in this can be done with both nftables and ip(6)tables rules. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_conntrack_common.h | 1 - include/uapi/linux/netfilter/nfnetlink_conntrack.h | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h index 1db83c931d9c..96b90d7e361f 100644 --- a/include/linux/netfilter/nf_conntrack_common.h +++ b/include/linux/netfilter/nf_conntrack_common.h @@ -8,7 +8,6 @@ struct ip_conntrack_stat { unsigned int found; unsigned int invalid; - unsigned int ignore; unsigned int insert; unsigned int insert_failed; unsigned int drop; diff --git a/include/uapi/linux/netfilter/nfnetlink_conntrack.h b/include/uapi/linux/netfilter/nfnetlink_conntrack.h index 262881792671..3e471558da82 100644 --- a/include/uapi/linux/netfilter/nfnetlink_conntrack.h +++ b/include/uapi/linux/netfilter/nfnetlink_conntrack.h @@ -247,7 +247,7 @@ enum ctattr_stats_cpu { CTA_STATS_FOUND, CTA_STATS_NEW, /* no longer used */ CTA_STATS_INVALID, - CTA_STATS_IGNORE, + CTA_STATS_IGNORE, /* no longer used */ CTA_STATS_DELETE, /* no longer used */ CTA_STATS_DELETE_LIST, /* no longer used */ CTA_STATS_INSERT, -- cgit v1.2.3 From bc92470413f3af152db0d8f90ef3eb13f8cc417a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 26 Aug 2020 00:52:44 +0200 Subject: netfilter: conntrack: add clash resolution stat counter There is a misconception about what "insert_failed" means. We increment this even when a clash got resolved, so it might not indicate a problem. Add a dedicated counter for clash resolution and only increment insert_failed if a clash cannot be resolved. For the old /proc interface, export this in place of an older stat that got removed a while back. For ctnetlink, export this with a new attribute. Also correct an outdated comment that implies we add a duplicate tuple -- we only add the (unique) reply direction. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_conntrack_common.h | 1 + include/uapi/linux/netfilter/nfnetlink_conntrack.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h index 96b90d7e361f..0c7d8d1e945d 100644 --- a/include/linux/netfilter/nf_conntrack_common.h +++ b/include/linux/netfilter/nf_conntrack_common.h @@ -10,6 +10,7 @@ struct ip_conntrack_stat { unsigned int invalid; unsigned int insert; unsigned int insert_failed; + unsigned int clash_resolve; unsigned int drop; unsigned int early_drop; unsigned int error; diff --git a/include/uapi/linux/netfilter/nfnetlink_conntrack.h b/include/uapi/linux/netfilter/nfnetlink_conntrack.h index 3e471558da82..d8484be72fdc 100644 --- a/include/uapi/linux/netfilter/nfnetlink_conntrack.h +++ b/include/uapi/linux/netfilter/nfnetlink_conntrack.h @@ -256,6 +256,7 @@ enum ctattr_stats_cpu { CTA_STATS_EARLY_DROP, CTA_STATS_ERROR, CTA_STATS_SEARCH_RESTART, + CTA_STATS_CLASH_RESOLVE, __CTA_STATS_MAX, }; #define CTA_STATS_MAX (__CTA_STATS_MAX - 1) -- cgit v1.2.3 From 1e6c62a8821557720a9b2ea9617359b264f2f67c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 27 Aug 2020 15:01:11 -0700 Subject: bpf: Introduce sleepable BPF programs Introduce sleepable BPF programs that can request such property for themselves via BPF_F_SLEEPABLE flag at program load time. In such case they will be able to use helpers like bpf_copy_from_user() that might sleep. At present only fentry/fexit/fmod_ret and lsm programs can request to be sleepable and only when they are attached to kernel functions that are known to allow sleeping. The non-sleepable programs are relying on implicit rcu_read_lock() and migrate_disable() to protect life time of programs, maps that they use and per-cpu kernel structures used to pass info between bpf programs and the kernel. The sleepable programs cannot be enclosed into rcu_read_lock(). migrate_disable() maps to preempt_disable() in non-RT kernels, so the progs should not be enclosed in migrate_disable() as well. Therefore rcu_read_lock_trace is used to protect the life time of sleepable progs. There are many networking and tracing program types. In many cases the 'struct bpf_prog *' pointer itself is rcu protected within some other kernel data structure and the kernel code is using rcu_dereference() to load that program pointer and call BPF_PROG_RUN() on it. All these cases are not touched. Instead sleepable bpf programs are allowed with bpf trampoline only. The program pointers are hard-coded into generated assembly of bpf trampoline and synchronize_rcu_tasks_trace() is used to protect the life time of the program. The same trampoline can hold both sleepable and non-sleepable progs. When rcu_read_lock_trace is held it means that some sleepable bpf program is running from bpf trampoline. Those programs can use bpf arrays and preallocated hash/lru maps. These map types are waiting on programs to complete via synchronize_rcu_tasks_trace(); Updates to trampoline now has to do synchronize_rcu_tasks_trace() and synchronize_rcu_tasks() to wait for sleepable progs to finish and for trampoline assembly to finish. This is the first step of introducing sleepable progs. Eventually dynamically allocated hash maps can be allowed and networking program types can become sleepable too. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Reviewed-by: Josef Bacik Acked-by: Andrii Nakryiko Acked-by: KP Singh Link: https://lore.kernel.org/bpf/20200827220114.69225-3-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 3 +++ include/uapi/linux/bpf.h | 8 ++++++++ 2 files changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index dbba82a80087..4dd7e927621d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -539,6 +539,8 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, /* these two functions are called from generated trampoline */ u64 notrace __bpf_prog_enter(void); void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start); +void notrace __bpf_prog_enter_sleepable(void); +void notrace __bpf_prog_exit_sleepable(void); struct bpf_ksym { unsigned long start; @@ -734,6 +736,7 @@ struct bpf_prog_aux { bool offload_requested; bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */ bool func_proto_unreliable; + bool sleepable; enum bpf_tramp_prog_type trampoline_prog_type; struct bpf_trampoline *trampoline; struct hlist_node tramp_hlist; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ef7af384f5ee..6e8b706aeb05 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -346,6 +346,14 @@ enum bpf_link_type { /* The verifier internal test flag. Behavior is undefined */ #define BPF_F_TEST_STATE_FREQ (1U << 3) +/* If BPF_F_SLEEPABLE is used in BPF_PROG_LOAD command, the verifier will + * restrict map and helper usage for such programs. Sleepable BPF programs can + * only be attached to hooks where kernel execution context allows sleeping. + * Such programs are allowed to use helpers that may sleep like + * bpf_copy_from_user(). + */ +#define BPF_F_SLEEPABLE (1U << 4) + /* When BPF ldimm64's insn[0].src_reg != 0 then this can have * two extensions: * -- cgit v1.2.3 From 07be4c4a3e7a0db148e44b16c5190e753d1c8569 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 27 Aug 2020 15:01:12 -0700 Subject: bpf: Add bpf_copy_from_user() helper. Sleepable BPF programs can now use copy_from_user() to access user memory. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: KP Singh Link: https://lore.kernel.org/bpf/20200827220114.69225-4-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 8 ++++++++ 2 files changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4dd7e927621d..c6d9f2c444f4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1784,6 +1784,7 @@ extern const struct bpf_func_proto bpf_skc_to_tcp_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto; extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto; +extern const struct bpf_func_proto bpf_copy_from_user_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6e8b706aeb05..a613750d5515 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3569,6 +3569,13 @@ union bpf_attr { * On success, the strictly positive length of the string, * including the trailing NUL character. On error, a negative * value. + * + * long bpf_copy_from_user(void *dst, u32 size, const void *user_ptr) + * Description + * Read *size* bytes from user space address *user_ptr* and store + * the data in *dst*. This is a wrapper of copy_from_user(). + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3719,6 +3726,7 @@ union bpf_attr { FN(inode_storage_get), \ FN(inode_storage_delete), \ FN(d_path), \ + FN(copy_from_user), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 67407a406db337acdaabecd3747d160d89a929e4 Mon Sep 17 00:00:00 2001 From: Balazs Scheidler Date: Sat, 29 Aug 2020 08:19:15 +0200 Subject: netfilter: nft_socket: add wildcard support Add NFT_SOCKET_WILDCARD to match to wildcard socket listener. Signed-off-by: Balazs Scheidler Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index aeb88cbd303e..543dc697b796 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1010,10 +1010,12 @@ enum nft_socket_attributes { * * @NFT_SOCKET_TRANSPARENT: Value of the IP(V6)_TRANSPARENT socket option * @NFT_SOCKET_MARK: Value of the socket mark + * @NFT_SOCKET_WILDCARD: Whether the socket is zero-bound (e.g. 0.0.0.0 or ::0) */ enum nft_socket_keys { NFT_SOCKET_TRANSPARENT, NFT_SOCKET_MARK, + NFT_SOCKET_WILDCARD, __NFT_SOCKET_MAX }; #define NFT_SOCKET_MAX (__NFT_SOCKET_MAX - 1) -- cgit v1.2.3 From 9667305c6374df8672be46bc496f52f040999531 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 31 Aug 2020 08:51:55 -0700 Subject: bpf: Fix build without BPF_SYSCALL, but with BPF_JIT. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When CONFIG_BPF_SYSCALL is not set, but CONFIG_BPF_JIT=y the kernel build fails: In file included from ../kernel/bpf/trampoline.c:11: ../kernel/bpf/trampoline.c: In function ‘bpf_trampoline_update’: ../kernel/bpf/trampoline.c:220:39: error: ‘call_rcu_tasks_trace’ undeclared ../kernel/bpf/trampoline.c: In function ‘__bpf_prog_enter_sleepable’: ../kernel/bpf/trampoline.c:411:2: error: implicit declaration of function ‘rcu_read_lock_trace’ ../kernel/bpf/trampoline.c: In function ‘__bpf_prog_exit_sleepable’: ../kernel/bpf/trampoline.c:416:2: error: implicit declaration of function ‘rcu_read_unlock_trace’ This is due to: obj-$(CONFIG_BPF_JIT) += trampoline.o obj-$(CONFIG_BPF_JIT) += dispatcher.o There is a number of functions that arch/x86/net/bpf_jit_comp.c is using from these two files, but none of them will be used when only cBPF is on (which is the case for BPF_SYSCALL=n BPF_JIT=y). Add rcu_trace functions to rcupdate_trace.h. The JITed code won't execute them and BPF trampoline logic won't be used without BPF_SYSCALL. Fixes: 1e6c62a88215 ("bpf: Introduce sleepable BPF programs") Reported-by: kernel test robot Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Paul E. McKenney Link: https://lore.kernel.org/bpf/20200831155155.62754-1-alexei.starovoitov@gmail.com --- include/linux/rcupdate_trace.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/rcupdate_trace.h b/include/linux/rcupdate_trace.h index d9015aac78c6..aaaac8ac927c 100644 --- a/include/linux/rcupdate_trace.h +++ b/include/linux/rcupdate_trace.h @@ -82,7 +82,14 @@ static inline void rcu_read_unlock_trace(void) void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func); void synchronize_rcu_tasks_trace(void); void rcu_barrier_tasks_trace(void); - +#else +/* + * The BPF JIT forms these addresses even when it doesn't call these + * functions, so provide definitions that result in runtime errors. + */ +static inline void call_rcu_tasks_trace(struct rcu_head *rhp, rcu_callback_t func) { BUG(); } +static inline void rcu_read_lock_trace(void) { BUG(); } +static inline void rcu_read_unlock_trace(void) { BUG(); } #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ #endif /* __LINUX_RCUPDATE_TRACE_H */ -- cgit v1.2.3 From 1742b3d528690ae7773cf7bf2f01a90ee1de2fe0 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Fri, 28 Aug 2020 10:26:15 +0200 Subject: xsk: i40e: ice: ixgbe: mlx5: Pass buffer pool to driver instead of umem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the explicit umem reference passed to the driver in AF_XDP zero-copy mode with the buffer pool instead. This in preparation for extending the functionality of the zero-copy mode so that umems can be shared between queues on the same netdev and also between netdevs. In this commit, only an umem reference has been added to the buffer pool struct. But later commits will add other entities to it. These are going to be entities that are different between different queue ids and netdevs even though the umem is shared between them. Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/1598603189-32145-2-git-send-email-magnus.karlsson@intel.com --- include/linux/netdevice.h | 10 +++++----- include/net/xdp_sock_drv.h | 7 ++++--- include/net/xsk_buff_pool.h | 4 +++- 3 files changed, 12 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b0e303f6603f..d088dd866825 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -618,7 +618,7 @@ struct netdev_queue { /* Subordinate device that the queue has been assigned to */ struct net_device *sb_dev; #ifdef CONFIG_XDP_SOCKETS - struct xdp_umem *umem; + struct xsk_buff_pool *pool; #endif /* * write-mostly part @@ -751,7 +751,7 @@ struct netdev_rx_queue { struct net_device *dev; struct xdp_rxq_info xdp_rxq; #ifdef CONFIG_XDP_SOCKETS - struct xdp_umem *umem; + struct xsk_buff_pool *pool; #endif } ____cacheline_aligned_in_smp; @@ -879,7 +879,7 @@ enum bpf_netdev_command { /* BPF program for offload callbacks, invoked at program load time. */ BPF_OFFLOAD_MAP_ALLOC, BPF_OFFLOAD_MAP_FREE, - XDP_SETUP_XSK_UMEM, + XDP_SETUP_XSK_POOL, }; struct bpf_prog_offload_ops; @@ -913,9 +913,9 @@ struct netdev_bpf { struct { struct bpf_offloaded_map *offmap; }; - /* XDP_SETUP_XSK_UMEM */ + /* XDP_SETUP_XSK_POOL */ struct { - struct xdp_umem *umem; + struct xsk_buff_pool *pool; u16 queue_id; } xsk; }; diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index ccf848f7efa4..5dc8d3c0dcd4 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -14,7 +14,8 @@ void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries); bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc); void xsk_umem_consume_tx_done(struct xdp_umem *umem); -struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, u16 queue_id); +struct xsk_buff_pool *xdp_get_xsk_pool_from_qid(struct net_device *dev, + u16 queue_id); void xsk_set_rx_need_wakeup(struct xdp_umem *umem); void xsk_set_tx_need_wakeup(struct xdp_umem *umem); void xsk_clear_rx_need_wakeup(struct xdp_umem *umem); @@ -125,8 +126,8 @@ static inline void xsk_umem_consume_tx_done(struct xdp_umem *umem) { } -static inline struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, - u16 queue_id) +static inline struct xsk_buff_pool * +xdp_get_xsk_pool_from_qid(struct net_device *dev, u16 queue_id) { return NULL; } diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index 6842990e2712..f851b0a68324 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -13,6 +13,7 @@ struct xsk_buff_pool; struct xdp_rxq_info; struct xsk_queue; struct xdp_desc; +struct xdp_umem; struct device; struct page; @@ -42,13 +43,14 @@ struct xsk_buff_pool { u32 frame_len; bool dma_need_sync; bool unaligned; + struct xdp_umem *umem; void *addrs; struct device *dev; struct xdp_buff_xsk *free_heads[]; }; /* AF_XDP core. */ -struct xsk_buff_pool *xp_create(struct page **pages, u32 nr_pages, u32 chunks, +struct xsk_buff_pool *xp_create(struct xdp_umem *umem, u32 chunks, u32 chunk_size, u32 headroom, u64 size, bool unaligned); void xp_set_fq(struct xsk_buff_pool *pool, struct xsk_queue *fq); -- cgit v1.2.3 From c4655761d3cf62bf5f86650e79349c1bfa5c6285 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Fri, 28 Aug 2020 10:26:16 +0200 Subject: xsk: i40e: ice: ixgbe: mlx5: Rename xsk zero-copy driver interfaces MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename the AF_XDP zero-copy driver interface functions to better reflect what they do after the replacement of umems with buffer pools in the previous commit. Mostly it is about replacing the umem name from the function names with xsk_buff and also have them take the a buffer pool pointer instead of a umem. The various ring functions have also been renamed in the process so that they have the same naming convention as the internal functions in xsk_queue.h. This so that it will be clearer what they do and also for consistency. Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/1598603189-32145-3-git-send-email-magnus.karlsson@intel.com --- include/net/xdp_sock.h | 1 + include/net/xdp_sock_drv.h | 114 +++++++++++++++++++++++---------------------- 2 files changed, 60 insertions(+), 55 deletions(-) (limited to 'include') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index c9d87cc40c11..ccf6cb54f9a6 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -52,6 +52,7 @@ struct xdp_sock { struct net_device *dev; struct xdp_umem *umem; struct list_head flush_node; + struct xsk_buff_pool *pool; u16 queue_id; bool zc; enum { diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 5dc8d3c0dcd4..a7c7d2eff860 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -11,48 +11,50 @@ #ifdef CONFIG_XDP_SOCKETS -void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries); -bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc); -void xsk_umem_consume_tx_done(struct xdp_umem *umem); -struct xsk_buff_pool *xdp_get_xsk_pool_from_qid(struct net_device *dev, - u16 queue_id); -void xsk_set_rx_need_wakeup(struct xdp_umem *umem); -void xsk_set_tx_need_wakeup(struct xdp_umem *umem); -void xsk_clear_rx_need_wakeup(struct xdp_umem *umem); -void xsk_clear_tx_need_wakeup(struct xdp_umem *umem); -bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem); +void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries); +bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc); +void xsk_tx_release(struct xsk_buff_pool *pool); +struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev, + u16 queue_id); +void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool); +void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool); +void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool); +void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool); +bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool); -static inline u32 xsk_umem_get_headroom(struct xdp_umem *umem) +static inline u32 xsk_pool_get_headroom(struct xsk_buff_pool *pool) { - return XDP_PACKET_HEADROOM + umem->headroom; + return XDP_PACKET_HEADROOM + pool->headroom; } -static inline u32 xsk_umem_get_chunk_size(struct xdp_umem *umem) +static inline u32 xsk_pool_get_chunk_size(struct xsk_buff_pool *pool) { - return umem->chunk_size; + return pool->chunk_size; } -static inline u32 xsk_umem_get_rx_frame_size(struct xdp_umem *umem) +static inline u32 xsk_pool_get_rx_frame_size(struct xsk_buff_pool *pool) { - return xsk_umem_get_chunk_size(umem) - xsk_umem_get_headroom(umem); + return xsk_pool_get_chunk_size(pool) - xsk_pool_get_headroom(pool); } -static inline void xsk_buff_set_rxq_info(struct xdp_umem *umem, +static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq) { - xp_set_rxq_info(umem->pool, rxq); + xp_set_rxq_info(pool, rxq); } -static inline void xsk_buff_dma_unmap(struct xdp_umem *umem, +static inline void xsk_pool_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs) { - xp_dma_unmap(umem->pool, attrs); + xp_dma_unmap(pool, attrs); } -static inline int xsk_buff_dma_map(struct xdp_umem *umem, struct device *dev, - unsigned long attrs) +static inline int xsk_pool_dma_map(struct xsk_buff_pool *pool, + struct device *dev, unsigned long attrs) { - return xp_dma_map(umem->pool, dev, attrs, umem->pgs, umem->npgs); + struct xdp_umem *umem = pool->umem; + + return xp_dma_map(pool, dev, attrs, umem->pgs, umem->npgs); } static inline dma_addr_t xsk_buff_xdp_get_dma(struct xdp_buff *xdp) @@ -69,14 +71,14 @@ static inline dma_addr_t xsk_buff_xdp_get_frame_dma(struct xdp_buff *xdp) return xp_get_frame_dma(xskb); } -static inline struct xdp_buff *xsk_buff_alloc(struct xdp_umem *umem) +static inline struct xdp_buff *xsk_buff_alloc(struct xsk_buff_pool *pool) { - return xp_alloc(umem->pool); + return xp_alloc(pool); } -static inline bool xsk_buff_can_alloc(struct xdp_umem *umem, u32 count) +static inline bool xsk_buff_can_alloc(struct xsk_buff_pool *pool, u32 count) { - return xp_can_alloc(umem->pool, count); + return xp_can_alloc(pool, count); } static inline void xsk_buff_free(struct xdp_buff *xdp) @@ -86,14 +88,15 @@ static inline void xsk_buff_free(struct xdp_buff *xdp) xp_free(xskb); } -static inline dma_addr_t xsk_buff_raw_get_dma(struct xdp_umem *umem, u64 addr) +static inline dma_addr_t xsk_buff_raw_get_dma(struct xsk_buff_pool *pool, + u64 addr) { - return xp_raw_get_dma(umem->pool, addr); + return xp_raw_get_dma(pool, addr); } -static inline void *xsk_buff_raw_get_data(struct xdp_umem *umem, u64 addr) +static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr) { - return xp_raw_get_data(umem->pool, addr); + return xp_raw_get_data(pool, addr); } static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp) @@ -103,83 +106,83 @@ static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp) xp_dma_sync_for_cpu(xskb); } -static inline void xsk_buff_raw_dma_sync_for_device(struct xdp_umem *umem, +static inline void xsk_buff_raw_dma_sync_for_device(struct xsk_buff_pool *pool, dma_addr_t dma, size_t size) { - xp_dma_sync_for_device(umem->pool, dma, size); + xp_dma_sync_for_device(pool, dma, size); } #else -static inline void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries) +static inline void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries) { } -static inline bool xsk_umem_consume_tx(struct xdp_umem *umem, - struct xdp_desc *desc) +static inline bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, + struct xdp_desc *desc) { return false; } -static inline void xsk_umem_consume_tx_done(struct xdp_umem *umem) +static inline void xsk_tx_release(struct xsk_buff_pool *pool) { } static inline struct xsk_buff_pool * -xdp_get_xsk_pool_from_qid(struct net_device *dev, u16 queue_id) +xsk_get_pool_from_qid(struct net_device *dev, u16 queue_id) { return NULL; } -static inline void xsk_set_rx_need_wakeup(struct xdp_umem *umem) +static inline void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool) { } -static inline void xsk_set_tx_need_wakeup(struct xdp_umem *umem) +static inline void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool) { } -static inline void xsk_clear_rx_need_wakeup(struct xdp_umem *umem) +static inline void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool) { } -static inline void xsk_clear_tx_need_wakeup(struct xdp_umem *umem) +static inline void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool) { } -static inline bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem) +static inline bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool) { return false; } -static inline u32 xsk_umem_get_headroom(struct xdp_umem *umem) +static inline u32 xsk_pool_get_headroom(struct xsk_buff_pool *pool) { return 0; } -static inline u32 xsk_umem_get_chunk_size(struct xdp_umem *umem) +static inline u32 xsk_pool_get_chunk_size(struct xsk_buff_pool *pool) { return 0; } -static inline u32 xsk_umem_get_rx_frame_size(struct xdp_umem *umem) +static inline u32 xsk_pool_get_rx_frame_size(struct xsk_buff_pool *pool) { return 0; } -static inline void xsk_buff_set_rxq_info(struct xdp_umem *umem, +static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq) { } -static inline void xsk_buff_dma_unmap(struct xdp_umem *umem, +static inline void xsk_pool_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs) { } -static inline int xsk_buff_dma_map(struct xdp_umem *umem, struct device *dev, - unsigned long attrs) +static inline int xsk_pool_dma_map(struct xsk_buff_pool *pool, + struct device *dev, unsigned long attrs) { return 0; } @@ -194,12 +197,12 @@ static inline dma_addr_t xsk_buff_xdp_get_frame_dma(struct xdp_buff *xdp) return 0; } -static inline struct xdp_buff *xsk_buff_alloc(struct xdp_umem *umem) +static inline struct xdp_buff *xsk_buff_alloc(struct xsk_buff_pool *pool) { return NULL; } -static inline bool xsk_buff_can_alloc(struct xdp_umem *umem, u32 count) +static inline bool xsk_buff_can_alloc(struct xsk_buff_pool *pool, u32 count) { return false; } @@ -208,12 +211,13 @@ static inline void xsk_buff_free(struct xdp_buff *xdp) { } -static inline dma_addr_t xsk_buff_raw_get_dma(struct xdp_umem *umem, u64 addr) +static inline dma_addr_t xsk_buff_raw_get_dma(struct xsk_buff_pool *pool, + u64 addr) { return 0; } -static inline void *xsk_buff_raw_get_data(struct xdp_umem *umem, u64 addr) +static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr) { return NULL; } @@ -222,7 +226,7 @@ static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp) { } -static inline void xsk_buff_raw_dma_sync_for_device(struct xdp_umem *umem, +static inline void xsk_buff_raw_dma_sync_for_device(struct xsk_buff_pool *pool, dma_addr_t dma, size_t size) { -- cgit v1.2.3 From 1c1efc2af158869795d3334a12fed2afd9c51539 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Fri, 28 Aug 2020 10:26:17 +0200 Subject: xsk: Create and free buffer pool independently from umem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Create and free the buffer pool independently from the umem. Move these operations that are performed on the buffer pool from the umem create and destroy functions to new create and destroy functions just for the buffer pool. This so that in later commits we can instantiate multiple buffer pools per umem when sharing a umem between HW queues and/or devices. We also erradicate the back pointer from the umem to the buffer pool as this will not work when we introduce the possibility to have multiple buffer pools per umem. Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/1598603189-32145-4-git-send-email-magnus.karlsson@intel.com --- include/net/xdp_sock.h | 3 +-- include/net/xsk_buff_pool.h | 13 ++++++++++--- 2 files changed, 11 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index ccf6cb54f9a6..ea2b020c4fbc 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -20,13 +20,12 @@ struct xdp_buff; struct xdp_umem { struct xsk_queue *fq; struct xsk_queue *cq; - struct xsk_buff_pool *pool; u64 size; u32 headroom; u32 chunk_size; + u32 chunks; struct user_struct *user; refcount_t users; - struct work_struct work; struct page **pgs; u32 npgs; u16 queue_id; diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index f851b0a68324..4025486cc057 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -14,6 +14,7 @@ struct xdp_rxq_info; struct xsk_queue; struct xdp_desc; struct xdp_umem; +struct xdp_sock; struct device; struct page; @@ -46,16 +47,22 @@ struct xsk_buff_pool { struct xdp_umem *umem; void *addrs; struct device *dev; + refcount_t users; + struct work_struct work; struct xdp_buff_xsk *free_heads[]; }; /* AF_XDP core. */ -struct xsk_buff_pool *xp_create(struct xdp_umem *umem, u32 chunks, - u32 chunk_size, u32 headroom, u64 size, - bool unaligned); +struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, + struct xdp_umem *umem); +int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *dev, + u16 queue_id, u16 flags); void xp_set_fq(struct xsk_buff_pool *pool, struct xsk_queue *fq); void xp_destroy(struct xsk_buff_pool *pool); void xp_release(struct xdp_buff_xsk *xskb); +void xp_get_pool(struct xsk_buff_pool *pool); +void xp_put_pool(struct xsk_buff_pool *pool); +void xp_clear_dev(struct xsk_buff_pool *pool); /* AF_XDP, and XDP core. */ void xp_free(struct xdp_buff_xsk *xskb); -- cgit v1.2.3 From 7361f9c3d71955c624fdad5676c99fc88a8249e9 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Fri, 28 Aug 2020 10:26:18 +0200 Subject: xsk: Move fill and completion rings to buffer pool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the fill and completion rings from the umem to the buffer pool. This so that we in a later commit can share the umem between multiple HW queue ids. In this case, we need one fill and completion ring per queue id. As the buffer pool is per queue id and napi id this is a natural place for it and one umem struture can be shared between these buffer pools. Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/1598603189-32145-5-git-send-email-magnus.karlsson@intel.com --- include/net/xdp_sock.h | 4 ++-- include/net/xsk_buff_pool.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index ea2b020c4fbc..2a284e137e9a 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -18,8 +18,6 @@ struct xsk_queue; struct xdp_buff; struct xdp_umem { - struct xsk_queue *fq; - struct xsk_queue *cq; u64 size; u32 headroom; u32 chunk_size; @@ -77,6 +75,8 @@ struct xdp_sock { struct list_head map_list; /* Protects map_list */ spinlock_t map_list_lock; + struct xsk_queue *fq_tmp; /* Only as tmp storage before bind */ + struct xsk_queue *cq_tmp; /* Only as tmp storage before bind */ }; #ifdef CONFIG_XDP_SOCKETS diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index 4025486cc057..380d9aeedbea 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -30,6 +30,7 @@ struct xdp_buff_xsk { struct xsk_buff_pool { struct xsk_queue *fq; + struct xsk_queue *cq; struct list_head free_list; dma_addr_t *dma_pages; struct xdp_buff_xsk *heads; @@ -57,7 +58,6 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, struct xdp_umem *umem); int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *dev, u16 queue_id, u16 flags); -void xp_set_fq(struct xsk_buff_pool *pool, struct xsk_queue *fq); void xp_destroy(struct xsk_buff_pool *pool); void xp_release(struct xdp_buff_xsk *xskb); void xp_get_pool(struct xsk_buff_pool *pool); -- cgit v1.2.3 From c2d3d6a474629e30428b1622af3d551f560cd1d8 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Fri, 28 Aug 2020 10:26:19 +0200 Subject: xsk: Move queue_id, dev and need_wakeup to buffer pool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move queue_id, dev, and need_wakeup from the umem to the buffer pool. This so that we in a later commit can share the umem between multiple HW queues. There is one buffer pool per dev and queue id, so these variables should belong to the buffer pool, not the umem. Need_wakeup is also something that is set on a per napi level, so there is usually one per device and queue id. So move this to the buffer pool too. Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/1598603189-32145-6-git-send-email-magnus.karlsson@intel.com --- include/net/xdp_sock.h | 3 --- include/net/xsk_buff_pool.h | 4 ++++ 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 2a284e137e9a..b052f1c005a9 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -26,11 +26,8 @@ struct xdp_umem { refcount_t users; struct page **pgs; u32 npgs; - u16 queue_id; - u8 need_wakeup; u8 flags; int id; - struct net_device *dev; bool zc; spinlock_t xsk_tx_list_lock; struct list_head xsk_tx_list; diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index 380d9aeedbea..2d948905f05f 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -43,11 +43,15 @@ struct xsk_buff_pool { u32 headroom; u32 chunk_size; u32 frame_len; + u16 queue_id; + u8 cached_need_wakeup; + bool uses_need_wakeup; bool dma_need_sync; bool unaligned; struct xdp_umem *umem; void *addrs; struct device *dev; + struct net_device *netdev; refcount_t users; struct work_struct work; struct xdp_buff_xsk *free_heads[]; -- cgit v1.2.3 From a5aa8e529e3667eb377ec132d4b4926dee065a45 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Fri, 28 Aug 2020 10:26:20 +0200 Subject: xsk: Move xsk_tx_list and its lock to buffer pool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the xsk_tx_list and the xsk_tx_list_lock from the umem to the buffer pool. This so that we in a later commit can share the umem between multiple HW queues. There is one xsk_tx_list per device and queue id, so it should be located in the buffer pool. Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/1598603189-32145-7-git-send-email-magnus.karlsson@intel.com --- include/net/xdp_sock.h | 4 +--- include/net/xsk_buff_pool.h | 5 +++++ 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index b052f1c005a9..9a61d05ec132 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -29,8 +29,6 @@ struct xdp_umem { u8 flags; int id; bool zc; - spinlock_t xsk_tx_list_lock; - struct list_head xsk_tx_list; }; struct xsk_map { @@ -57,7 +55,7 @@ struct xdp_sock { /* Protects multiple processes in the control path */ struct mutex mutex; struct xsk_queue *tx ____cacheline_aligned_in_smp; - struct list_head list; + struct list_head tx_list; /* Mutual exclusion of NAPI TX thread and sendmsg error paths * in the SKB destructor callback. */ diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index 2d948905f05f..83f100c6d440 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -52,6 +52,9 @@ struct xsk_buff_pool { void *addrs; struct device *dev; struct net_device *netdev; + struct list_head xsk_tx_list; + /* Protects modifications to the xsk_tx_list */ + spinlock_t xsk_tx_list_lock; refcount_t users; struct work_struct work; struct xdp_buff_xsk *free_heads[]; @@ -67,6 +70,8 @@ void xp_release(struct xdp_buff_xsk *xskb); void xp_get_pool(struct xsk_buff_pool *pool); void xp_put_pool(struct xsk_buff_pool *pool); void xp_clear_dev(struct xsk_buff_pool *pool); +void xp_add_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs); +void xp_del_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs); /* AF_XDP, and XDP core. */ void xp_free(struct xdp_buff_xsk *xskb); -- cgit v1.2.3 From 7f7ffa4e9c38f01d380ed9df6adb238fd5e6eea5 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Fri, 28 Aug 2020 10:26:21 +0200 Subject: xsk: Move addrs from buffer pool to umem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replicate the addrs pointer in the buffer pool to the umem. This mapping will be the same for all buffer pools sharing the same umem. In the buffer pool we leave the addrs pointer for performance reasons. Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/1598603189-32145-8-git-send-email-magnus.karlsson@intel.com --- include/net/xdp_sock.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 9a61d05ec132..126d24364b5a 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -18,6 +18,7 @@ struct xsk_queue; struct xdp_buff; struct xdp_umem { + void *addrs; u64 size; u32 headroom; u32 chunk_size; -- cgit v1.2.3 From 921b68692abb4fd02237b6875b2056bc59435116 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Fri, 28 Aug 2020 10:26:22 +0200 Subject: xsk: Enable sharing of dma mappings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enable the sharing of dma mappings by moving them out from the buffer pool. Instead we put each dma mapped umem region in a list in the umem structure. If dma has already been mapped for this umem and device, it is not mapped again and the existing dma mappings are reused. Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/1598603189-32145-9-git-send-email-magnus.karlsson@intel.com --- include/net/xdp_sock.h | 1 + include/net/xsk_buff_pool.h | 13 +++++++++++++ 2 files changed, 14 insertions(+) (limited to 'include') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 126d24364b5a..282aeba0d20f 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -30,6 +30,7 @@ struct xdp_umem { u8 flags; int id; bool zc; + struct list_head xsk_dma_list; }; struct xsk_map { diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index 83f100c6d440..356d0ac74eba 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -28,10 +28,23 @@ struct xdp_buff_xsk { struct list_head free_list_node; }; +struct xsk_dma_map { + dma_addr_t *dma_pages; + struct device *dev; + struct net_device *netdev; + refcount_t users; + struct list_head list; /* Protected by the RTNL_LOCK */ + u32 dma_pages_cnt; + bool dma_need_sync; +}; + struct xsk_buff_pool { struct xsk_queue *fq; struct xsk_queue *cq; struct list_head free_list; + /* For performance reasons, each buff pool has its own array of dma_pages + * even when they are identical. + */ dma_addr_t *dma_pages; struct xdp_buff_xsk *heads; u64 chunk_mask; -- cgit v1.2.3 From 8ef4e27eb3f03edfbfbe5657b8061f2a47757037 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Fri, 28 Aug 2020 10:26:23 +0200 Subject: xsk: Rearrange internal structs for better performance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rearrange the xdp_sock, xdp_umem and xsk_buff_pool structures so that they get smaller and align better to the cache lines. In the previous commits of this patch set, these structs have been reordered with the focus on functionality and simplicity, not performance. This patch improves throughput performance by around 3%. Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/1598603189-32145-10-git-send-email-magnus.karlsson@intel.com --- include/net/xdp_sock.h | 13 +++++++------ include/net/xsk_buff_pool.h | 27 +++++++++++++++------------ 2 files changed, 22 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 282aeba0d20f..1a9559c0cbdd 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -23,13 +23,13 @@ struct xdp_umem { u32 headroom; u32 chunk_size; u32 chunks; + u32 npgs; struct user_struct *user; refcount_t users; - struct page **pgs; - u32 npgs; u8 flags; - int id; bool zc; + struct page **pgs; + int id; struct list_head xsk_dma_list; }; @@ -42,7 +42,7 @@ struct xsk_map { struct xdp_sock { /* struct sock must be the first member of struct xdp_sock */ struct sock sk; - struct xsk_queue *rx; + struct xsk_queue *rx ____cacheline_aligned_in_smp; struct net_device *dev; struct xdp_umem *umem; struct list_head flush_node; @@ -54,8 +54,7 @@ struct xdp_sock { XSK_BOUND, XSK_UNBOUND, } state; - /* Protects multiple processes in the control path */ - struct mutex mutex; + struct xsk_queue *tx ____cacheline_aligned_in_smp; struct list_head tx_list; /* Mutual exclusion of NAPI TX thread and sendmsg error paths @@ -72,6 +71,8 @@ struct xdp_sock { struct list_head map_list; /* Protects map_list */ spinlock_t map_list_lock; + /* Protects multiple processes in the control path */ + struct mutex mutex; struct xsk_queue *fq_tmp; /* Only as tmp storage before bind */ struct xsk_queue *cq_tmp; /* Only as tmp storage before bind */ }; diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index 356d0ac74eba..38d03a64c9ea 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -39,9 +39,22 @@ struct xsk_dma_map { }; struct xsk_buff_pool { - struct xsk_queue *fq; - struct xsk_queue *cq; + /* Members only used in the control path first. */ + struct device *dev; + struct net_device *netdev; + struct list_head xsk_tx_list; + /* Protects modifications to the xsk_tx_list */ + spinlock_t xsk_tx_list_lock; + refcount_t users; + struct xdp_umem *umem; + struct work_struct work; struct list_head free_list; + u32 heads_cnt; + u16 queue_id; + + /* Data path members as close to free_heads at the end as possible. */ + struct xsk_queue *fq ____cacheline_aligned_in_smp; + struct xsk_queue *cq; /* For performance reasons, each buff pool has its own array of dma_pages * even when they are identical. */ @@ -51,25 +64,15 @@ struct xsk_buff_pool { u64 addrs_cnt; u32 free_list_cnt; u32 dma_pages_cnt; - u32 heads_cnt; u32 free_heads_cnt; u32 headroom; u32 chunk_size; u32 frame_len; - u16 queue_id; u8 cached_need_wakeup; bool uses_need_wakeup; bool dma_need_sync; bool unaligned; - struct xdp_umem *umem; void *addrs; - struct device *dev; - struct net_device *netdev; - struct list_head xsk_tx_list; - /* Protects modifications to the xsk_tx_list */ - spinlock_t xsk_tx_list_lock; - refcount_t users; - struct work_struct work; struct xdp_buff_xsk *free_heads[]; }; -- cgit v1.2.3 From 9647c57b11e563f5b33a49ef72b347753917c21c Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Fri, 28 Aug 2020 10:26:24 +0200 Subject: xsk: i40e: ice: ixgbe: mlx5: Test for dma_need_sync earlier for better performance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Test for dma_need_sync earlier to increase performance. xsk_buff_dma_sync_for_cpu() takes an xdp_buff as parameter and from that the xsk_buff_pool reference is dug out. Perf shows that this dereference causes a lot of cache misses. But as the buffer pool is now sent down to the driver at zero-copy initialization time, we might as well use this pointer directly, instead of going via the xsk_buff and we can do so already in xsk_buff_dma_sync_for_cpu() instead of in xp_dma_sync_for_cpu. This gets rid of these cache misses. Throughput increases with 3% for the xdpsock l2fwd sample application on my machine. Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/1598603189-32145-11-git-send-email-magnus.karlsson@intel.com --- include/net/xdp_sock_drv.h | 7 +++++-- include/net/xsk_buff_pool.h | 3 --- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index a7c7d2eff860..5b1ee8a9976d 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -99,10 +99,13 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr) return xp_raw_get_data(pool, addr); } -static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp) +static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp, struct xsk_buff_pool *pool) { struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + if (!pool->dma_need_sync) + return; + xp_dma_sync_for_cpu(xskb); } @@ -222,7 +225,7 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr) return NULL; } -static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp) +static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp, struct xsk_buff_pool *pool) { } diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index 38d03a64c9ea..907537dddcac 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -114,9 +114,6 @@ static inline dma_addr_t xp_get_frame_dma(struct xdp_buff_xsk *xskb) void xp_dma_sync_for_cpu_slow(struct xdp_buff_xsk *xskb); static inline void xp_dma_sync_for_cpu(struct xdp_buff_xsk *xskb) { - if (!xskb->pool->dma_need_sync) - return; - xp_dma_sync_for_cpu_slow(xskb); } -- cgit v1.2.3 From b5aea28dca13456c1a08b9b2ef8a8b92598ac426 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Fri, 28 Aug 2020 10:26:25 +0200 Subject: xsk: Add shared umem support between queue ids MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support to share a umem between queue ids on the same device. This mode can be invoked with the XDP_SHARED_UMEM bind flag. Previously, sharing was only supported within the same queue id and device, and you shared one set of fill and completion rings. However, note that when sharing a umem between queue ids, you need to create a fill ring and a completion ring and tie them to the socket before you do the bind with the XDP_SHARED_UMEM flag. This so that the single-producer single-consumer semantics can be upheld. Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/1598603189-32145-12-git-send-email-magnus.karlsson@intel.com --- include/net/xsk_buff_pool.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index 907537dddcac..0140d086dc84 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -81,6 +81,8 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, struct xdp_umem *umem); int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *dev, u16 queue_id, u16 flags); +int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_umem *umem, + struct net_device *dev, u16 queue_id); void xp_destroy(struct xsk_buff_pool *pool); void xp_release(struct xdp_buff_xsk *xskb); void xp_get_pool(struct xsk_buff_pool *pool); -- cgit v1.2.3 From 1d97898b36bab91e8ffb38a660cc40eaba613f88 Mon Sep 17 00:00:00 2001 From: wenxu Date: Fri, 28 Aug 2020 23:14:31 +0800 Subject: ipv6: add ipv6_fragment hook in ipv6_stub Add ipv6_fragment to ipv6_stub to avoid calling netfilter when access ip6_fragment. Signed-off-by: wenxu Signed-off-by: David S. Miller --- include/net/ipv6_stubs.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h index d7a7f7c81e7b..8fce558b5fea 100644 --- a/include/net/ipv6_stubs.h +++ b/include/net/ipv6_stubs.h @@ -63,6 +63,9 @@ struct ipv6_stub { int encap_type); #endif struct neigh_table *nd_tbl; + + int (*ipv6_fragment)(struct net *net, struct sock *sk, struct sk_buff *skb, + int (*output)(struct net *, struct sock *, struct sk_buff *)); }; extern const struct ipv6_stub *ipv6_stub __read_mostly; -- cgit v1.2.3 From 5af68891dc16e1c8216705034a5e0144fd47779a Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 29 Aug 2020 05:21:30 -0400 Subject: net: clean up codestyle This is a pure codestyle cleanup patch. No functional change intended. Signed-off-by: Miaohe Lin Signed-off-by: David S. Miller --- include/net/dst.h | 2 +- include/net/sock.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/dst.h b/include/net/dst.h index 6ae2e625050d..8ea8812b0b41 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -214,7 +214,7 @@ dst_allfrag(const struct dst_entry *dst) static inline int dst_metric_locked(const struct dst_entry *dst, int metric) { - return dst_metric(dst, RTAX_LOCK) & (1<sk_forward_alloc || + return size <= sk->sk_forward_alloc || __sk_mem_schedule(sk, size, SK_MEM_RECV) || skb_pfmemalloc(skb); } -- cgit v1.2.3 From afd6220999d4932616322f8a893371f8d0567a2a Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Sun, 30 Aug 2020 11:33:58 +0300 Subject: net: phylink: add helper function to decode USXGMII word With the new addition of the USXGMII link partner ability constants we can now introduce a phylink helper that decodes the USXGMII word and populates the appropriate fields in the phylink_link_state structure based on them. Signed-off-by: Ioana Ciornei Reviewed-by: Russell King Signed-off-by: David S. Miller --- include/linux/phylink.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index c36fb41a7d90..d81a714cfbbd 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -490,4 +490,7 @@ void phylink_mii_c22_pcs_an_restart(struct mdio_device *pcs); void phylink_mii_c45_pcs_get_state(struct mdio_device *pcs, struct phylink_link_state *state); + +void phylink_decode_usxgmii_word(struct phylink_link_state *state, + uint16_t lpa); #endif -- cgit v1.2.3 From 2dab432c5ae4f9c8eab3132ac0e9facf498ead92 Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Sun, 30 Aug 2020 11:34:00 +0300 Subject: net: mdiobus: add clause 45 mdiobus write accessor Add the locked variant of the clause 45 mdiobus write accessor - mdiobus_c45_write(). Signed-off-by: Ioana Ciornei Reviewed-by: Russell King Signed-off-by: David S. Miller --- include/linux/mdio.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 898cbf00332a..3a88b699b758 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -358,6 +358,12 @@ static inline int mdiobus_c45_read(struct mii_bus *bus, int prtad, int devad, return mdiobus_read(bus, prtad, mdiobus_c45_addr(devad, regnum)); } +static inline int mdiobus_c45_write(struct mii_bus *bus, int prtad, int devad, + u16 regnum, u16 val) +{ + return mdiobus_write(bus, prtad, mdiobus_c45_addr(devad, regnum), val); +} + int mdiobus_register_device(struct mdio_device *mdiodev); int mdiobus_unregister_device(struct mdio_device *mdiodev); bool mdiobus_is_registered_device(struct mii_bus *bus, int addr); -- cgit v1.2.3 From 0da4c3d393e40e41e3c6b9f1cebaa498512c2abb Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Sun, 30 Aug 2020 11:34:01 +0300 Subject: net: phy: add Lynx PCS module Add a Lynx PCS module which exposes the necessary operations to drive the PCS using phylink. The majority of the code is extracted from the Felix DSA driver, which will be also modified in a later patch, and exposed as a separate module for code reusability purposes. As such, this aims at feature and bug parity with the existing Felix DSA driver, and thus USXGMII, SGMII, QSGMII and 2500Base-X (only w/o in-band AN) are supported by the Lynx PCS module since these were also supported by Felix. The module can only be enabled by the drivers in need and not user selectable. Signed-off-by: Ioana Ciornei Signed-off-by: David S. Miller --- include/linux/pcs-lynx.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 include/linux/pcs-lynx.h (limited to 'include') diff --git a/include/linux/pcs-lynx.h b/include/linux/pcs-lynx.h new file mode 100644 index 000000000000..a6440d6ebe95 --- /dev/null +++ b/include/linux/pcs-lynx.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */ +/* Copyright 2020 NXP + * Lynx PCS helpers + */ + +#ifndef __LINUX_PCS_LYNX_H +#define __LINUX_PCS_LYNX_H + +#include +#include + +struct lynx_pcs { + struct phylink_pcs pcs; + struct mdio_device *mdio; +}; + +struct lynx_pcs *lynx_pcs_create(struct mdio_device *mdio); + +void lynx_pcs_destroy(struct lynx_pcs *pcs); + +#endif /* __LINUX_PCS_LYNX_H */ -- cgit v1.2.3 From 3f7d820bad6cace3ecb859bf3fa5bc56fe8bb1c6 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 31 Aug 2020 02:26:10 -0400 Subject: net: ipv6: remove unused arg exact_dif in compute_score The arg exact_dif is not used anymore, remove it. inet6_exact_dif_match() is no longer needed after the above is removed, remove it too. Signed-off-by: Miaohe Lin Signed-off-by: David S. Miller --- include/linux/ipv6.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index bac8f4fffbd6..dda61d150a13 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -177,17 +177,6 @@ static inline int inet6_sdif(const struct sk_buff *skb) return 0; } -/* can not be used in TCP layer after tcp_v6_fill_cb */ -static inline bool inet6_exact_dif_match(struct net *net, struct sk_buff *skb) -{ -#if defined(CONFIG_NET_L3_MASTER_DEV) - if (!net->ipv4.sysctl_tcp_l3mdev_accept && - skb && ipv6_l3mdev_skb(IP6CB(skb)->flags)) - return true; -#endif - return false; -} - struct tcp6_request_sock { struct tcp_request_sock tcp6rsk_tcp; }; -- cgit v1.2.3 From 34e1ec319e99322bfed02767d51f4998a961d205 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 31 Aug 2020 02:26:34 -0400 Subject: net: ipv4: remove unused arg exact_dif in compute_score The arg exact_dif is not used anymore, remove it. inet_exact_dif_match() is no longer needed after the above is removed, so remove it too. Signed-off-by: Miaohe Lin Signed-off-by: David S. Miller --- include/net/tcp.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index eab6c7510b5b..0d11db6436c8 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -941,16 +941,6 @@ INDIRECT_CALLABLE_DECLARE(void tcp_v6_early_demux(struct sk_buff *skb)); #endif -static inline bool inet_exact_dif_match(struct net *net, struct sk_buff *skb) -{ -#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) - if (!net->ipv4.sysctl_tcp_l3mdev_accept && - skb && ipv4_l3mdev_skb(IPCB(skb)->flags)) - return true; -#endif - return false; -} - /* TCP_SKB_CB reference means this can not be used from early demux */ static inline int tcp_v4_sdif(struct sk_buff *skb) { -- cgit v1.2.3 From 144b0a0e608690d46e9a77819249bdd8d23bdcb6 Mon Sep 17 00:00:00 2001 From: Yaroslav Bolyukin Date: Sat, 29 Aug 2020 18:59:53 +0500 Subject: ipvs: remove dependency on ip6_tables This dependency was added because ipv6_find_hdr was in iptables specific code but is no longer required Fixes: f8f626754ebe ("ipv6: Move ipv6_find_hdr() out of Netfilter code.") Fixes: 63dca2c0b0e7 ("ipvs: Fix faulty IPv6 extension header handling in IPVS") Signed-off-by: Yaroslav Bolyukin Acked-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 9a59a33787cb..d609e957a3ec 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -25,9 +25,6 @@ #include #include /* for struct ipv6hdr */ #include -#if IS_ENABLED(CONFIG_IP_VS_IPV6) -#include -#endif #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include #endif -- cgit v1.2.3 From c1077616142907bb6ee987ecd136d6857ffd8787 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Tue, 1 Sep 2020 15:10:08 -0700 Subject: ip: expose inet sockopts through inet_diag Expose all exisiting inet sockopt bits through inet_diag for debug purpose. Corresponding changes in iproute2 ss will be submitted to output all these values. Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Signed-off-by: Mahesh Bandewar Signed-off-by: David S. Miller --- include/linux/inet_diag.h | 2 ++ include/uapi/linux/inet_diag.h | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+) (limited to 'include') diff --git a/include/linux/inet_diag.h b/include/linux/inet_diag.h index 0ef2d800fda7..84abb30a3fbb 100644 --- a/include/linux/inet_diag.h +++ b/include/linux/inet_diag.h @@ -75,6 +75,8 @@ static inline size_t inet_diag_msg_attrs_size(void) #ifdef CONFIG_SOCK_CGROUP_DATA + nla_total_size_64bit(sizeof(u64)) /* INET_DIAG_CGROUP_ID */ #endif + + nla_total_size(sizeof(struct inet_diag_sockopt)) + /* INET_DIAG_SOCKOPT */ ; } int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index 5ba122c1949a..20ee93f0f876 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -160,6 +160,7 @@ enum { INET_DIAG_ULP_INFO, INET_DIAG_SK_BPF_STORAGES, INET_DIAG_CGROUP_ID, + INET_DIAG_SOCKOPT, __INET_DIAG_MAX, }; @@ -183,6 +184,23 @@ struct inet_diag_meminfo { __u32 idiag_tmem; }; +/* INET_DIAG_SOCKOPT */ + +struct inet_diag_sockopt { + __u8 recverr:1, + is_icsk:1, + freebind:1, + hdrincl:1, + mc_loop:1, + transparent:1, + mc_all:1, + nodefrag:1; + __u8 bind_address_no_port:1, + recverr_rfc4884:1, + defer_connect:1, + unused:5; +}; + /* INET_DIAG_VEGASINFO */ struct tcpvegas_info { -- cgit v1.2.3 From 0f7c5317b89058e432d3e97efa19467ff4c3b86b Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 4 Sep 2020 14:37:29 -0700 Subject: of: Export of_remove_property() to modules We will need to remove some OF properties in drivers/net/dsa/bcm_sf2.c with a subsequent commit. Export of_remove_property() to modules so we can keep bcm_sf2 modular and provide an empty stub for when CONFIG_OF is disabled to maintain the ability to compile test. Signed-off-by: Florian Fainelli Acked-by: Rob Herring Signed-off-by: Jakub Kicinski --- include/linux/of.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/of.h b/include/linux/of.h index 5cf7ae0465d1..481ec0467285 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -929,6 +929,11 @@ static inline int of_machine_is_compatible(const char *compat) return 0; } +static inline int of_remove_property(struct device_node *np, struct property *prop) +{ + return 0; +} + static inline bool of_console_check(const struct device_node *dn, const char *name, int index) { return false; -- cgit v1.2.3 From 938c3efd9e650ca343d04e70d11a17c64119e17c Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Fri, 4 Sep 2020 17:14:53 +0100 Subject: bpf: Fix formatting in documentation for BPF helpers Fix a formatting error in the description of bpf_load_hdr_opt() (rst2man complains about a wrong indentation, but what is missing is actually a blank line before the bullet list). Fix and harmonise the formatting for other helpers. Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200904161454.31135-3-quentin@isovalent.com --- include/uapi/linux/bpf.h | 87 +++++++++++++++++++++++++----------------------- 1 file changed, 45 insertions(+), 42 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 8dda13880957..90359cab501d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3349,38 +3349,38 @@ union bpf_attr { * Description * Dynamically cast a *sk* pointer to a *tcp6_sock* pointer. * Return - * *sk* if casting is valid, or NULL otherwise. + * *sk* if casting is valid, or **NULL** otherwise. * * struct tcp_sock *bpf_skc_to_tcp_sock(void *sk) * Description * Dynamically cast a *sk* pointer to a *tcp_sock* pointer. * Return - * *sk* if casting is valid, or NULL otherwise. + * *sk* if casting is valid, or **NULL** otherwise. * * struct tcp_timewait_sock *bpf_skc_to_tcp_timewait_sock(void *sk) * Description * Dynamically cast a *sk* pointer to a *tcp_timewait_sock* pointer. * Return - * *sk* if casting is valid, or NULL otherwise. + * *sk* if casting is valid, or **NULL** otherwise. * * struct tcp_request_sock *bpf_skc_to_tcp_request_sock(void *sk) * Description * Dynamically cast a *sk* pointer to a *tcp_request_sock* pointer. * Return - * *sk* if casting is valid, or NULL otherwise. + * *sk* if casting is valid, or **NULL** otherwise. * * struct udp6_sock *bpf_skc_to_udp6_sock(void *sk) * Description * Dynamically cast a *sk* pointer to a *udp6_sock* pointer. * Return - * *sk* if casting is valid, or NULL otherwise. + * *sk* if casting is valid, or **NULL** otherwise. * * long bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags) * Description * Return a user or a kernel stack in bpf program provided buffer. * To achieve this, the helper needs *task*, which is a valid - * pointer to struct task_struct. To store the stacktrace, the - * bpf program provides *buf* with a nonnegative *size*. + * pointer to **struct task_struct**. To store the stacktrace, the + * bpf program provides *buf* with a nonnegative *size*. * * The last argument, *flags*, holds the number of stack frames to * skip (from 0 to 255), masked with @@ -3410,12 +3410,12 @@ union bpf_attr { * long bpf_load_hdr_opt(struct bpf_sock_ops *skops, void *searchby_res, u32 len, u64 flags) * Description * Load header option. Support reading a particular TCP header - * option for bpf program (BPF_PROG_TYPE_SOCK_OPS). + * option for bpf program (**BPF_PROG_TYPE_SOCK_OPS**). * * If *flags* is 0, it will search the option from the - * sock_ops->skb_data. The comment in "struct bpf_sock_ops" + * *skops*\ **->skb_data**. The comment in **struct bpf_sock_ops** * has details on what skb_data contains under different - * sock_ops->op. + * *skops*\ **->op**. * * The first byte of the *searchby_res* specifies the * kind that it wants to search. @@ -3435,7 +3435,7 @@ union bpf_attr { * [ 254, 4, 0xeB, 0x9F, 0, 0, .... 0 ]. * * To search for the standard window scale option (3), - * the searchby_res should be [ 3, 0, 0, .... 0 ]. + * the *searchby_res* should be [ 3, 0, 0, .... 0 ]. * Note, kind-length must be 0 for regular option. * * Searching for No-Op (0) and End-of-Option-List (1) are @@ -3445,27 +3445,30 @@ union bpf_attr { * of a header option. * * Supported flags: + * * * **BPF_LOAD_HDR_OPT_TCP_SYN** to search from the * saved_syn packet or the just-received syn packet. * * Return - * >0 when found, the header option is copied to *searchby_res*. - * The return value is the total length copied. + * > 0 when found, the header option is copied to *searchby_res*. + * The return value is the total length copied. On failure, a + * negative error code is returned: * - * **-EINVAL** If param is invalid + * **-EINVAL** if a parameter is invalid. * - * **-ENOMSG** The option is not found + * **-ENOMSG** if the option is not found. * - * **-ENOENT** No syn packet available when - * **BPF_LOAD_HDR_OPT_TCP_SYN** is used + * **-ENOENT** if no syn packet is available when + * **BPF_LOAD_HDR_OPT_TCP_SYN** is used. * - * **-ENOSPC** Not enough space. Only *len* number of - * bytes are copied. + * **-ENOSPC** if there is not enough space. Only *len* number of + * bytes are copied. * - * **-EFAULT** Cannot parse the header options in the packet + * **-EFAULT** on failure to parse the header options in the + * packet. * - * **-EPERM** This helper cannot be used under the - * current sock_ops->op. + * **-EPERM** if the helper cannot be used under the current + * *skops*\ **->op**. * * long bpf_store_hdr_opt(struct bpf_sock_ops *skops, const void *from, u32 len, u64 flags) * Description @@ -3483,44 +3486,44 @@ union bpf_attr { * by searching the same option in the outgoing skb. * * This helper can only be called during - * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * **BPF_SOCK_OPS_WRITE_HDR_OPT_CB**. * * Return * 0 on success, or negative error in case of failure: * - * **-EINVAL** If param is invalid + * **-EINVAL** If param is invalid. * - * **-ENOSPC** Not enough space in the header. - * Nothing has been written + * **-ENOSPC** if there is not enough space in the header. + * Nothing has been written * - * **-EEXIST** The option has already existed + * **-EEXIST** if the option already exists. * - * **-EFAULT** Cannot parse the existing header options + * **-EFAULT** on failrue to parse the existing header options. * - * **-EPERM** This helper cannot be used under the - * current sock_ops->op. + * **-EPERM** if the helper cannot be used under the current + * *skops*\ **->op**. * * long bpf_reserve_hdr_opt(struct bpf_sock_ops *skops, u32 len, u64 flags) * Description * Reserve *len* bytes for the bpf header option. The - * space will be used by bpf_store_hdr_opt() later in - * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * space will be used by **bpf_store_hdr_opt**\ () later in + * **BPF_SOCK_OPS_WRITE_HDR_OPT_CB**. * - * If bpf_reserve_hdr_opt() is called multiple times, + * If **bpf_reserve_hdr_opt**\ () is called multiple times, * the total number of bytes will be reserved. * * This helper can only be called during - * BPF_SOCK_OPS_HDR_OPT_LEN_CB. + * **BPF_SOCK_OPS_HDR_OPT_LEN_CB**. * * Return * 0 on success, or negative error in case of failure: * - * **-EINVAL** if param is invalid + * **-EINVAL** if a parameter is invalid. * - * **-ENOSPC** Not enough space in the header. + * **-ENOSPC** if there is not enough space in the header. * - * **-EPERM** This helper cannot be used under the - * current sock_ops->op. + * **-EPERM** if the helper cannot be used under the current + * *skops*\ **->op**. * * void *bpf_inode_storage_get(struct bpf_map *map, void *inode, void *value, u64 flags) * Description @@ -3560,9 +3563,9 @@ union bpf_attr { * * long bpf_d_path(struct path *path, char *buf, u32 sz) * Description - * Return full path for given 'struct path' object, which - * needs to be the kernel BTF 'path' object. The path is - * returned in the provided buffer 'buf' of size 'sz' and + * Return full path for given **struct path** object, which + * needs to be the kernel BTF *path* object. The path is + * returned in the provided buffer *buf* of size *sz* and * is zero terminated. * * Return @@ -3573,7 +3576,7 @@ union bpf_attr { * long bpf_copy_from_user(void *dst, u32 size, const void *user_ptr) * Description * Read *size* bytes from user space address *user_ptr* and store - * the data in *dst*. This is a wrapper of copy_from_user(). + * the data in *dst*. This is a wrapper of **copy_from_user**\ (). * Return * 0 on success, or a negative error in case of failure. */ -- cgit v1.2.3 From 5205e919c9f0c5b48678f2c787871c96f665ca1b Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Mon, 7 Sep 2020 12:56:08 +0300 Subject: net: bridge: mcast: add support for src list and filter mode dumping Support per port group src list (address and timer) and filter mode dumping. Protected by either multicast_lock or rcu. v3: add IPv6 support v2: require RCU or multicast_lock to traverse src groups Signed-off-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- include/uapi/linux/if_bridge.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index c1227aecd38f..75a2ac479247 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -455,10 +455,31 @@ enum { enum { MDBA_MDB_EATTR_UNSPEC, MDBA_MDB_EATTR_TIMER, + MDBA_MDB_EATTR_SRC_LIST, + MDBA_MDB_EATTR_GROUP_MODE, __MDBA_MDB_EATTR_MAX }; #define MDBA_MDB_EATTR_MAX (__MDBA_MDB_EATTR_MAX - 1) +/* per mdb entry source */ +enum { + MDBA_MDB_SRCLIST_UNSPEC, + MDBA_MDB_SRCLIST_ENTRY, + __MDBA_MDB_SRCLIST_MAX +}; +#define MDBA_MDB_SRCLIST_MAX (__MDBA_MDB_SRCLIST_MAX - 1) + +/* per mdb entry per source attributes + * these are embedded in MDBA_MDB_SRCLIST_ENTRY + */ +enum { + MDBA_MDB_SRCATTR_UNSPEC, + MDBA_MDB_SRCATTR_ADDRESS, + MDBA_MDB_SRCATTR_TIMER, + __MDBA_MDB_SRCATTR_MAX +}; +#define MDBA_MDB_SRCATTR_MAX (__MDBA_MDB_SRCATTR_MAX - 1) + /* multicast router types */ enum { MDB_RTR_TYPE_DISABLED, -- cgit v1.2.3 From 0db0c34cfbc9838c1a14cb04dd880602abd699a7 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 3 Sep 2020 16:14:31 -0700 Subject: net: tighten the definition of interface statistics This patch is born out of an investigation into which IEEE statistics correspond to which struct rtnl_link_stats64 members. Turns out that there seems to be reasonable consensus on the matter, among many drivers. To save others the time (and it took more time than I'm comfortable admitting) I'm adding comments referring to IEEE attributes to struct rtnl_link_stats64. Up until now we had two forms of documentation for stats - in Documentation/ABI/testing/sysfs-class-net-statistics and the comments on struct rtnl_link_stats64 itself. While the former is very cautious in defining the expected behavior, the latter feel quite dated and may not be easy to understand for modern day driver author (e.g. rx_over_errors). At the same time modern systems are far more complex and once obvious definitions lost their clarity. For example - does rx_packet count at the MAC layer (aFramesReceivedOK)? packets processed correctly by hardware? received by the driver? or maybe received by the stack? I tried to clarify the expectations, further clarifications from others are very welcome. The part hardest to untangle is rx_over_errors vs rx_fifo_errors vs rx_missed_errors. After much deliberation I concluded that for modern HW only two of the counters will make sense. The distinction between internal FIFO overflow and packets dropped due to back-pressure from the host is likely too implementation (driver and device) specific to expose in the standard stats. Now - which two of those counters we select to use is anyone's pick: sysfs documentation suggests rx_over_errors counts packets which did not fit into buffers due to MTU being too small, which I reused. There don't seem to be many modern drivers using it (well, CAN drivers seem to love this statistic). Of the remaining two I picked rx_missed_errors to report device drops. bnxt reports it and it's folded into "drop"s in procfs (while rx_fifo_errors is an error, and modern devices usually receive the frame OK, they just can't admit it into the pipeline). Of the drivers I looked at only AMD Lance-like and NS8390-like use all three of these counters. rx_missed_errors counts missed frames, rx_over_errors counts overflow events, and rx_fifo_errors counts frames which were truncated because they didn't fit into buffers. This suggests that rx_fifo_errors may be the correct stat for truncated packets, but I'd think a FIFO stat counting truncated packets would be very confusing to a modern reader. v2: - add driver developer notes about ethtool stat count and reset - replace Ethernet with IEEE 802.3 to better indicate source of attrs - mention byte counters don't count FCS - clarify RX counter is from device to host - drop "sightly" from sysfs paragraph - add examples of ethtool stats - s/incoming/received/ s/incoming/transmitted/ Signed-off-by: Jakub Kicinski --- include/uapi/linux/if_link.h | 204 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 187 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 7fba4de511de..bf4667403cab 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -40,26 +40,197 @@ struct rtnl_link_stats { __u32 rx_nohandler; /* dropped, no handler found */ }; -/* The main device statistics structure */ +/** + * struct rtnl_link_stats64 - The main device statistics structure. + * + * @rx_packets: Number of good packets received by the interface. + * For hardware interfaces counts all good packets received from the device + * by the host, including packets which host had to drop at various stages + * of processing (even in the driver). + * + * @tx_packets: Number of packets successfully transmitted. + * For hardware interfaces counts packets which host was able to successfully + * hand over to the device, which does not necessarily mean that packets + * had been successfully transmitted out of the device, only that device + * acknowledged it copied them out of host memory. + * + * @rx_bytes: Number of good received bytes, corresponding to @rx_packets. + * + * For IEEE 802.3 devices should count the length of Ethernet Frames + * excluding the FCS. + * + * @tx_bytes: Number of good transmitted bytes, corresponding to @tx_packets. + * + * For IEEE 802.3 devices should count the length of Ethernet Frames + * excluding the FCS. + * + * @rx_errors: Total number of bad packets received on this network device. + * This counter must include events counted by @rx_length_errors, + * @rx_crc_errors, @rx_frame_errors and other errors not otherwise + * counted. + * + * @tx_errors: Total number of transmit problems. + * This counter must include events counter by @tx_aborted_errors, + * @tx_carrier_errors, @tx_fifo_errors, @tx_heartbeat_errors, + * @tx_window_errors and other errors not otherwise counted. + * + * @rx_dropped: Number of packets received but not processed, + * e.g. due to lack of resources or unsupported protocol. + * For hardware interfaces this counter should not include packets + * dropped by the device which are counted separately in + * @rx_missed_errors (since procfs folds those two counters together). + * + * @tx_dropped: Number of packets dropped on their way to transmission, + * e.g. due to lack of resources. + * + * @multicast: Multicast packets received. + * For hardware interfaces this statistic is commonly calculated + * at the device level (unlike @rx_packets) and therefore may include + * packets which did not reach the host. + * + * For IEEE 802.3 devices this counter may be equivalent to: + * + * - 30.3.1.1.21 aMulticastFramesReceivedOK + * + * @collisions: Number of collisions during packet transmissions. + * + * @rx_length_errors: Number of packets dropped due to invalid length. + * Part of aggregate "frame" errors in `/proc/net/dev`. + * + * For IEEE 802.3 devices this counter should be equivalent to a sum + * of the following attributes: + * + * - 30.3.1.1.23 aInRangeLengthErrors + * - 30.3.1.1.24 aOutOfRangeLengthField + * - 30.3.1.1.25 aFrameTooLongErrors + * + * @rx_over_errors: Receiver FIFO overflow event counter. + * + * Historically the count of overflow events. Such events may be + * reported in the receive descriptors or via interrupts, and may + * not correspond one-to-one with dropped packets. + * + * The recommended interpretation for high speed interfaces is - + * number of packets dropped because they did not fit into buffers + * provided by the host, e.g. packets larger than MTU or next buffer + * in the ring was not available for a scatter transfer. + * + * Part of aggregate "frame" errors in `/proc/net/dev`. + * + * This statistics was historically used interchangeably with + * @rx_fifo_errors. + * + * This statistic corresponds to hardware events and is not commonly used + * on software devices. + * + * @rx_crc_errors: Number of packets received with a CRC error. + * Part of aggregate "frame" errors in `/proc/net/dev`. + * + * For IEEE 802.3 devices this counter must be equivalent to: + * + * - 30.3.1.1.6 aFrameCheckSequenceErrors + * + * @rx_frame_errors: Receiver frame alignment errors. + * Part of aggregate "frame" errors in `/proc/net/dev`. + * + * For IEEE 802.3 devices this counter should be equivalent to: + * + * - 30.3.1.1.7 aAlignmentErrors + * + * @rx_fifo_errors: Receiver FIFO error counter. + * + * Historically the count of overflow events. Those events may be + * reported in the receive descriptors or via interrupts, and may + * not correspond one-to-one with dropped packets. + * + * This statistics was used interchangeably with @rx_over_errors. + * Not recommended for use in drivers for high speed interfaces. + * + * This statistic is used on software devices, e.g. to count software + * packet queue overflow (can) or sequencing errors (GRE). + * + * @rx_missed_errors: Count of packets missed by the host. + * Folded into the "drop" counter in `/proc/net/dev`. + * + * Counts number of packets dropped by the device due to lack + * of buffer space. This usually indicates that the host interface + * is slower than the network interface, or host is not keeping up + * with the receive packet rate. + * + * This statistic corresponds to hardware events and is not used + * on software devices. + * + * @tx_aborted_errors: + * Part of aggregate "carrier" errors in `/proc/net/dev`. + * For IEEE 802.3 devices capable of half-duplex operation this counter + * must be equivalent to: + * + * - 30.3.1.1.11 aFramesAbortedDueToXSColls + * + * High speed interfaces may use this counter as a general device + * discard counter. + * + * @tx_carrier_errors: Number of frame transmission errors due to loss + * of carrier during transmission. + * Part of aggregate "carrier" errors in `/proc/net/dev`. + * + * For IEEE 802.3 devices this counter must be equivalent to: + * + * - 30.3.1.1.13 aCarrierSenseErrors + * + * @tx_fifo_errors: Number of frame transmission errors due to device + * FIFO underrun / underflow. This condition occurs when the device + * begins transmission of a frame but is unable to deliver the + * entire frame to the transmitter in time for transmission. + * Part of aggregate "carrier" errors in `/proc/net/dev`. + * + * @tx_heartbeat_errors: Number of Heartbeat / SQE Test errors for + * old half-duplex Ethernet. + * Part of aggregate "carrier" errors in `/proc/net/dev`. + * + * For IEEE 802.3 devices possibly equivalent to: + * + * - 30.3.2.1.4 aSQETestErrors + * + * @tx_window_errors: Number of frame transmission errors due + * to late collisions (for Ethernet - after the first 64B of transmission). + * Part of aggregate "carrier" errors in `/proc/net/dev`. + * + * For IEEE 802.3 devices this counter must be equivalent to: + * + * - 30.3.1.1.10 aLateCollisions + * + * @rx_compressed: Number of correctly received compressed packets. + * This counters is only meaningful for interfaces which support + * packet compression (e.g. CSLIP, PPP). + * + * @tx_compressed: Number of transmitted compressed packets. + * This counters is only meaningful for interfaces which support + * packet compression (e.g. CSLIP, PPP). + * + * @rx_nohandler: Number of packets received on the interface + * but dropped by the networking stack because the device is + * not designated to receive packets (e.g. backup link in a bond). + */ struct rtnl_link_stats64 { - __u64 rx_packets; /* total packets received */ - __u64 tx_packets; /* total packets transmitted */ - __u64 rx_bytes; /* total bytes received */ - __u64 tx_bytes; /* total bytes transmitted */ - __u64 rx_errors; /* bad packets received */ - __u64 tx_errors; /* packet transmit problems */ - __u64 rx_dropped; /* no space in linux buffers */ - __u64 tx_dropped; /* no space available in linux */ - __u64 multicast; /* multicast packets received */ + __u64 rx_packets; + __u64 tx_packets; + __u64 rx_bytes; + __u64 tx_bytes; + __u64 rx_errors; + __u64 tx_errors; + __u64 rx_dropped; + __u64 tx_dropped; + __u64 multicast; __u64 collisions; /* detailed rx_errors: */ __u64 rx_length_errors; - __u64 rx_over_errors; /* receiver ring buff overflow */ - __u64 rx_crc_errors; /* recved pkt with crc error */ - __u64 rx_frame_errors; /* recv'd frame alignment error */ - __u64 rx_fifo_errors; /* recv'r fifo overrun */ - __u64 rx_missed_errors; /* receiver missed packet */ + __u64 rx_over_errors; + __u64 rx_crc_errors; + __u64 rx_frame_errors; + __u64 rx_fifo_errors; + __u64 rx_missed_errors; /* detailed tx_errors */ __u64 tx_aborted_errors; @@ -71,8 +242,7 @@ struct rtnl_link_stats64 { /* for cslip etc */ __u64 rx_compressed; __u64 tx_compressed; - - __u64 rx_nohandler; /* dropped, no handler found */ + __u64 rx_nohandler; }; /* The struct should be in sync with struct ifmap */ -- cgit v1.2.3 From b131c96496b369c7b14125e7c50e89ac7cec8051 Mon Sep 17 00:00:00 2001 From: "Jose M. Guisado Gomez" Date: Tue, 8 Sep 2020 13:01:41 +0200 Subject: netfilter: nf_tables: add userdata support for nft_object Enables storing userdata for nft_object. Initially this will store an optional comment but can be extended in the future as needed. Adds new attribute NFTA_OBJ_USERDATA to nft_object. Signed-off-by: Jose M. Guisado Gomez Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 ++ include/uapi/linux/netfilter/nf_tables.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 97a7e147a59a..99c1b3188b1e 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1123,6 +1123,8 @@ struct nft_object { u32 genmask:2, use:30; u64 handle; + u16 udlen; + u8 *udata; /* runtime data below here */ const struct nft_object_ops *ops ____cacheline_aligned; unsigned char data[] diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 543dc697b796..2a6e09dea1a0 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1559,6 +1559,7 @@ enum nft_ct_expectation_attributes { * @NFTA_OBJ_DATA: stateful object data (NLA_NESTED) * @NFTA_OBJ_USE: number of references to this expression (NLA_U32) * @NFTA_OBJ_HANDLE: object handle (NLA_U64) + * @NFTA_OBJ_USERDATA: user data (NLA_BINARY) */ enum nft_object_attributes { NFTA_OBJ_UNSPEC, @@ -1569,6 +1570,7 @@ enum nft_object_attributes { NFTA_OBJ_USE, NFTA_OBJ_HANDLE, NFTA_OBJ_PAD, + NFTA_OBJ_USERDATA, __NFTA_OBJ_MAX }; #define NFTA_OBJ_MAX (__NFTA_OBJ_MAX - 1) -- cgit v1.2.3 From 245500d853e9f20036cec7df4f6984ece4c6bf26 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 1 Jul 2020 11:15:32 +0100 Subject: rxrpc: Rewrite the client connection manager Rewrite the rxrpc client connection manager so that it can support multiple connections for a given security key to a peer. The following changes are made: (1) For each open socket, the code currently maintains an rbtree with the connections placed into it, keyed by communications parameters. This is tricky to maintain as connections can be culled from the tree or replaced within it. Connections can require replacement for a number of reasons, e.g. their IDs span too great a range for the IDR data type to represent efficiently, the call ID numbers on that conn would overflow or the conn got aborted. This is changed so that there's now a connection bundle object placed in the tree, keyed on the same parameters. The bundle, however, does not need to be replaced. (2) An rxrpc_bundle object can now manage the available channels for a set of parallel connections. The lock that manages this is moved there from the rxrpc_connection struct (channel_lock). (3) There'a a dummy bundle for all incoming connections to share so that they have a channel_lock too. It might be better to give each incoming connection its own bundle. This bundle is not needed to manage which channels incoming calls are made on because that's the solely at whim of the client. (4) The restrictions on how many client connections are around are removed. Instead, a previous patch limits the number of client calls that can be allocated. Ordinarily, client connections are reaped after 2 minutes on the idle queue, but when more than a certain number of connections are in existence, the reaper starts reaping them after 2s of idleness instead to get the numbers back down. It could also be made such that new call allocations are forced to wait until the number of outstanding connections subsides. Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 33 ++++----------------------------- 1 file changed, 4 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index c33079b986e8..3b67d5981224 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -68,21 +68,14 @@ enum rxrpc_client_trace { rxrpc_client_chan_activate, rxrpc_client_chan_disconnect, rxrpc_client_chan_pass, - rxrpc_client_chan_unstarted, rxrpc_client_chan_wait_failed, rxrpc_client_cleanup, - rxrpc_client_count, rxrpc_client_discard, rxrpc_client_duplicate, rxrpc_client_exposed, rxrpc_client_replace, rxrpc_client_to_active, - rxrpc_client_to_culled, rxrpc_client_to_idle, - rxrpc_client_to_inactive, - rxrpc_client_to_upgrade, - rxrpc_client_to_waiting, - rxrpc_client_uncount, }; enum rxrpc_call_trace { @@ -271,29 +264,14 @@ enum rxrpc_tx_point { EM(rxrpc_client_chan_activate, "ChActv") \ EM(rxrpc_client_chan_disconnect, "ChDisc") \ EM(rxrpc_client_chan_pass, "ChPass") \ - EM(rxrpc_client_chan_unstarted, "ChUnst") \ EM(rxrpc_client_chan_wait_failed, "ChWtFl") \ EM(rxrpc_client_cleanup, "Clean ") \ - EM(rxrpc_client_count, "Count ") \ EM(rxrpc_client_discard, "Discar") \ EM(rxrpc_client_duplicate, "Duplic") \ EM(rxrpc_client_exposed, "Expose") \ EM(rxrpc_client_replace, "Replac") \ EM(rxrpc_client_to_active, "->Actv") \ - EM(rxrpc_client_to_culled, "->Cull") \ - EM(rxrpc_client_to_idle, "->Idle") \ - EM(rxrpc_client_to_inactive, "->Inac") \ - EM(rxrpc_client_to_upgrade, "->Upgd") \ - EM(rxrpc_client_to_waiting, "->Wait") \ - E_(rxrpc_client_uncount, "Uncoun") - -#define rxrpc_conn_cache_states \ - EM(RXRPC_CONN_CLIENT_INACTIVE, "Inac") \ - EM(RXRPC_CONN_CLIENT_WAITING, "Wait") \ - EM(RXRPC_CONN_CLIENT_ACTIVE, "Actv") \ - EM(RXRPC_CONN_CLIENT_UPGRADE, "Upgd") \ - EM(RXRPC_CONN_CLIENT_CULLED, "Cull") \ - E_(RXRPC_CONN_CLIENT_IDLE, "Idle") \ + E_(rxrpc_client_to_idle, "->Idle") #define rxrpc_call_traces \ EM(rxrpc_call_connected, "CON") \ @@ -594,23 +572,20 @@ TRACE_EVENT(rxrpc_client, __field(int, channel ) __field(int, usage ) __field(enum rxrpc_client_trace, op ) - __field(enum rxrpc_conn_cache_state, cs ) ), TP_fast_assign( - __entry->conn = conn->debug_id; + __entry->conn = conn ? conn->debug_id : 0; __entry->channel = channel; - __entry->usage = atomic_read(&conn->usage); + __entry->usage = conn ? atomic_read(&conn->usage) : -2; __entry->op = op; __entry->cid = conn->proto.cid; - __entry->cs = conn->cache_state; ), - TP_printk("C=%08x h=%2d %s %s i=%08x u=%d", + TP_printk("C=%08x h=%2d %s i=%08x u=%d", __entry->conn, __entry->channel, __print_symbolic(__entry->op, rxrpc_client_traces), - __print_symbolic(__entry->cs, rxrpc_conn_cache_states), __entry->cid, __entry->usage) ); -- cgit v1.2.3 From 2efbe6aebea00269425ac7de622d47c2a397a871 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 9 Sep 2020 07:50:34 +0300 Subject: devlink: Add comment block for missing port attributes Add comment block for physical, PF and VF port attributes. Signed-off-by: Parav Pandit Reviewed-by: Jiri Pirko Reviewed-by: Roi Dayan Signed-off-by: David S. Miller --- include/net/devlink.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 8f3c8a443238..3c7ba3e1f490 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -73,6 +73,9 @@ struct devlink_port_pci_vf_attrs { * @splittable: indicates if the port can be split. * @lanes: maximum number of lanes the port supports. 0 value is not passed to netlink. * @switch_id: if the port is part of switch, this is buffer with ID, otherwise this is NULL + * @phys: physical port attributes + * @pci_pf: PCI PF port attributes + * @pci_vf: PCI VF port attributes */ struct devlink_port_attrs { u8 split:1, -- cgit v1.2.3 From ff03e63ad1673eb75cce214556013fc2e52a1b77 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 9 Sep 2020 07:50:35 +0300 Subject: devlink: Move structure comments outside of structure To add more fields to the PCI PF and VF port attributes, follow standard structure comment format. Signed-off-by: Parav Pandit Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 3c7ba3e1f490..efff9274d248 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -57,13 +57,22 @@ struct devlink_port_phys_attrs { u32 split_subport_number; /* If the port is split, this is the number of subport. */ }; +/** + * struct devlink_port_pci_pf_attrs - devlink port's PCI PF attributes + * @pf: Associated PCI PF number for this port. + */ struct devlink_port_pci_pf_attrs { - u16 pf; /* Associated PCI PF for this port. */ + u16 pf; }; +/** + * struct devlink_port_pci_vf_attrs - devlink port's PCI VF attributes + * @pf: Associated PCI PF number for this port. + * @vf: Associated PCI VF for of the PCI PF for this port. + */ struct devlink_port_pci_vf_attrs { - u16 pf; /* Associated PCI PF for this port. */ - u16 vf; /* Associated PCI VF for of the PCI PF for this port. */ + u16 pf; + u16 vf; }; /** -- cgit v1.2.3 From 05b595e9c44acaca94192c6db430a489c1b212a7 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 9 Sep 2020 07:50:36 +0300 Subject: devlink: Introduce external controller flag A devlink eswitch port may represent PCI PF/VF ports of a controller. A controller either located on same system or it can be an external controller located in host where such NIC is plugged in. Add the ability for driver to specify if a port is for external controller. Use such flag in the mlx5_core driver. An example of an external controller having VF1 of PF0 belong to controller 1. $ devlink port show pci/0000:06:00.0/2 pci/0000:06:00.0/2: type eth netdev ens2f0pf0vf1 flavour pcivf pfnum 0 vfnum 1 external true splittable false function: hw_addr 00:00:00:00:00:00 $ devlink port show pci/0000:06:00.0/2 -jp { "port": { "pci/0000:06:00.0/2": { "type": "eth", "netdev": "ens2f0pf0vf1", "flavour": "pcivf", "pfnum": 0, "vfnum": 1, "external": true, "splittable": false, "function": { "hw_addr": "00:00:00:00:00:00" } } } } Signed-off-by: Parav Pandit Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 8 ++++++-- include/uapi/linux/devlink.h | 1 + 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index efff9274d248..2dad8c9151f4 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -60,19 +60,23 @@ struct devlink_port_phys_attrs { /** * struct devlink_port_pci_pf_attrs - devlink port's PCI PF attributes * @pf: Associated PCI PF number for this port. + * @external: when set, indicates if a port is for an external controller */ struct devlink_port_pci_pf_attrs { u16 pf; + u8 external:1; }; /** * struct devlink_port_pci_vf_attrs - devlink port's PCI VF attributes * @pf: Associated PCI PF number for this port. * @vf: Associated PCI VF for of the PCI PF for this port. + * @external: when set, indicates if a port is for an external controller */ struct devlink_port_pci_vf_attrs { u16 pf; u16 vf; + u8 external:1; }; /** @@ -1215,9 +1219,9 @@ void devlink_port_type_ib_set(struct devlink_port *devlink_port, void devlink_port_type_clear(struct devlink_port *devlink_port); void devlink_port_attrs_set(struct devlink_port *devlink_port, struct devlink_port_attrs *devlink_port_attrs); -void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u16 pf); +void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u16 pf, bool external); void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, - u16 pf, u16 vf); + u16 pf, u16 vf, bool external); int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, u32 size, u16 ingress_pools_count, u16 egress_pools_count, u16 ingress_tc_count, diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index cfef4245ea5a..40823ed7e05a 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -458,6 +458,7 @@ enum devlink_attr { DEVLINK_ATTR_PORT_LANES, /* u32 */ DEVLINK_ATTR_PORT_SPLITTABLE, /* u8 */ + DEVLINK_ATTR_PORT_EXTERNAL, /* u8 */ /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, -- cgit v1.2.3 From 3a2d9588c4f79adae6a0e986b64ebdd5b38085c6 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 9 Sep 2020 07:50:37 +0300 Subject: devlink: Introduce controller number A devlink port may be for a controller consist of PCI device. A devlink instance holds ports of two types of controllers. (1) controller discovered on same system where eswitch resides This is the case where PCI PF/VF of a controller and devlink eswitch instance both are located on a single system. (2) controller located on external host system. This is the case where a controller is located in one system and its devlink eswitch ports are located in a different system. When a devlink eswitch instance serves the devlink ports of both controllers together, PCI PF/VF numbers may overlap. Due to this a unique phys_port_name cannot be constructed. For example in below such system controller-0 and controller-1, each has PCI PF pf0 whose eswitch ports can be present in controller-0. These results in phys_port_name as "pf0" for both. Similar problem exists for VFs and upcoming Sub functions. An example view of two controller systems: --------------------------------------------------------- | | | --------- --------- ------- ------- | ----------- | | vf(s) | | sf(s) | |vf(s)| |sf(s)| | | server | | ------- ----/---- ---/----- ------- ---/--- ---/--- | | pci rc |=== | pf0 |______/________/ | pf1 |___/_______/ | | connect | | ------- ------- | ----------- | | controller_num=1 (no eswitch) | ------|-------------------------------------------------- (internal wire) | --------------------------------------------------------- | devlink eswitch ports and reps | | ----------------------------------------------------- | | |ctrl-0 | ctrl-0 | ctrl-0 | ctrl-0 | ctrl-0 |ctrl-0 | | | |pf0 | pf0vfN | pf0sfN | pf1 | pf1vfN |pf1sfN | | | ----------------------------------------------------- | | |ctrl-1 | ctrl-1 | ctrl-1 | ctrl-1 | ctrl-1 |ctrl-1 | | | |pf1 | pf1vfN | pf1sfN | pf1 | pf1vfN |pf0sfN | | | ----------------------------------------------------- | | | | | | --------- --------- ------- ------- | | | vf(s) | | sf(s) | |vf(s)| |sf(s)| | | ------- ----/---- ---/----- ------- ---/--- ---/--- | | | pf0 |______/________/ | pf1 |___/_______/ | | ------- ------- | | | | local controller_num=0 (eswitch) | --------------------------------------------------------- An example devlink port for external controller with controller number = 1 for a VF 1 of PF 0: $ devlink port show pci/0000:06:00.0/2 pci/0000:06:00.0/2: type eth netdev ens2f0pf0vf1 flavour pcivf controller 1 pfnum 0 vfnum 1 external true splittable false function: hw_addr 00:00:00:00:00:00 $ devlink port show pci/0000:06:00.0/2 -jp { "port": { "pci/0000:06:00.0/2": { "type": "eth", "netdev": "ens2f0pf0vf1", "flavour": "pcivf", "controller": 1, "pfnum": 0, "vfnum": 1, "external": true, "splittable": false, "function": { "hw_addr": "00:00:00:00:00:00" } } } } Signed-off-by: Parav Pandit Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 9 +++++++-- include/uapi/linux/devlink.h | 1 + 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 2dad8c9151f4..eaec0a8cc5ef 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -59,21 +59,25 @@ struct devlink_port_phys_attrs { /** * struct devlink_port_pci_pf_attrs - devlink port's PCI PF attributes + * @controller: Associated controller number * @pf: Associated PCI PF number for this port. * @external: when set, indicates if a port is for an external controller */ struct devlink_port_pci_pf_attrs { + u32 controller; u16 pf; u8 external:1; }; /** * struct devlink_port_pci_vf_attrs - devlink port's PCI VF attributes + * @controller: Associated controller number * @pf: Associated PCI PF number for this port. * @vf: Associated PCI VF for of the PCI PF for this port. * @external: when set, indicates if a port is for an external controller */ struct devlink_port_pci_vf_attrs { + u32 controller; u16 pf; u16 vf; u8 external:1; @@ -1219,8 +1223,9 @@ void devlink_port_type_ib_set(struct devlink_port *devlink_port, void devlink_port_type_clear(struct devlink_port *devlink_port); void devlink_port_attrs_set(struct devlink_port *devlink_port, struct devlink_port_attrs *devlink_port_attrs); -void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u16 pf, bool external); -void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, +void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 controller, + u16 pf, bool external); +void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 controller, u16 pf, u16 vf, bool external); int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, u32 size, u16 ingress_pools_count, diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 40823ed7e05a..40d35145c879 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -459,6 +459,7 @@ enum devlink_attr { DEVLINK_ATTR_PORT_SPLITTABLE, /* u8 */ DEVLINK_ATTR_PORT_EXTERNAL, /* u8 */ + DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, /* u32 */ /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, -- cgit v1.2.3 From 501cb008906631a019f3ab2104a17ef8b2651ed0 Mon Sep 17 00:00:00 2001 From: Paul Davey Date: Tue, 8 Sep 2020 10:04:06 +1200 Subject: ipmr: Add route table ID to netlink cache reports Insert the multicast route table ID as a Netlink attribute to Netlink cache report notifications. When multiple route tables are in use it is necessary to have a way to determine which route table a given cache report belongs to when receiving the cache report. Signed-off-by: Paul Davey Signed-off-by: David S. Miller --- include/uapi/linux/mroute.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/mroute.h b/include/uapi/linux/mroute.h index 11c8c1fc1124..918f1ef32ffe 100644 --- a/include/uapi/linux/mroute.h +++ b/include/uapi/linux/mroute.h @@ -169,6 +169,7 @@ enum { IPMRA_CREPORT_SRC_ADDR, IPMRA_CREPORT_DST_ADDR, IPMRA_CREPORT_PKT, + IPMRA_CREPORT_TABLE, __IPMRA_CREPORT_MAX }; #define IPMRA_CREPORT_MAX (__IPMRA_CREPORT_MAX - 1) -- cgit v1.2.3 From c8715a8e9f38906e73d6d78764216742db13ba0e Mon Sep 17 00:00:00 2001 From: Paul Davey Date: Tue, 8 Sep 2020 10:04:07 +1200 Subject: ipmr: Add high byte of VIF ID to igmpmsg Use the unused3 byte in struct igmpmsg to hold the high 8 bits of the VIF ID. If using more than 255 IPv4 multicast interfaces it is necessary to have access to a VIF ID for cache reports that is wider than 8 bits, the VIF ID present in the igmpmsg reports sent to mroute_sk was only 8 bits wide in the igmpmsg header. Adding the high 8 bits of the 16 bit VIF ID in the unused byte allows use of more than 255 IPv4 multicast interfaces. Signed-off-by: Paul Davey Signed-off-by: David S. Miller --- include/uapi/linux/mroute.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/mroute.h b/include/uapi/linux/mroute.h index 918f1ef32ffe..1a42f5f9b31b 100644 --- a/include/uapi/linux/mroute.h +++ b/include/uapi/linux/mroute.h @@ -113,8 +113,8 @@ struct igmpmsg { __u32 unused1,unused2; unsigned char im_msgtype; /* What is this */ unsigned char im_mbz; /* Must be zero */ - unsigned char im_vif; /* Interface (this ought to be a vifi_t!) */ - unsigned char unused3; + unsigned char im_vif; /* Low 8 bits of Interface */ + unsigned char im_vif_hi; /* High 8 bits of Interface */ struct in_addr im_src,im_dst; }; -- cgit v1.2.3 From 5198d545dba8ad893f5e5a029ca8d43ee7bcf011 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 9 Sep 2020 10:37:51 -0700 Subject: net: remove napi_hash_del() from driver-facing API We allow drivers to call napi_hash_del() before calling netif_napi_del() to batch RCU grace periods. This makes the API asymmetric and leaks internal implementation details. Soon we will want the grace period to protect more than just the NAPI hash table. Restructure the API and have drivers call a new function - __netif_napi_del() if they want to take care of RCU waits. Note that only core was checking the return status from napi_hash_del() so the new helper does not report if the NAPI was actually deleted. Some notes on driver oddness: - veth observed the grace period before calling netif_napi_del() but that should not matter - myri10ge observed normal RCU flavor - bnx2x and enic did not actually observe the grace period (unless they did so implicitly) - virtio_net and enic only unhashed Rx NAPIs The last two points seem to indicate that the calls to napi_hash_del() were a left over rather than an optimization. Regardless, it's easy enough to correct them. This patch may introduce extra synchronize_net() calls for interfaces which set NAPI_STATE_NO_BUSY_POLL and depend on free_netdev() to call netif_napi_del(). This seems inevitable since we want to use RCU for netpoll dev->napi_list traversal, and almost no drivers set IFF_DISABLE_NETPOLL. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/netdevice.h | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7f9fcfd15942..74ed95215091 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -70,6 +70,7 @@ struct udp_tunnel_nic; struct bpf_prog; struct xdp_buff; +void synchronize_net(void); void netdev_set_default_ethtool_ops(struct net_device *dev, const struct ethtool_ops *ops); @@ -488,20 +489,6 @@ static inline bool napi_complete(struct napi_struct *n) return napi_complete_done(n, 0); } -/** - * napi_hash_del - remove a NAPI from global table - * @napi: NAPI context - * - * Warning: caller must observe RCU grace period - * before freeing memory containing @napi, if - * this function returns true. - * Note: core networking stack automatically calls it - * from netif_napi_del(). - * Drivers might want to call this helper to combine all - * the needed RCU grace periods into a single one. - */ -bool napi_hash_del(struct napi_struct *napi); - /** * napi_disable - prevent NAPI from scheduling * @n: NAPI context @@ -2367,13 +2354,27 @@ static inline void netif_tx_napi_add(struct net_device *dev, netif_napi_add(dev, napi, poll, weight); } +/** + * __netif_napi_del - remove a NAPI context + * @napi: NAPI context + * + * Warning: caller must observe RCU grace period before freeing memory + * containing @napi. Drivers might want to call this helper to combine + * all the needed RCU grace periods into a single one. + */ +void __netif_napi_del(struct napi_struct *napi); + /** * netif_napi_del - remove a NAPI context * @napi: NAPI context * * netif_napi_del() removes a NAPI context from the network device NAPI list */ -void netif_napi_del(struct napi_struct *napi); +static inline void netif_napi_del(struct napi_struct *napi) +{ + __netif_napi_del(napi); + synchronize_net(); +} struct napi_gro_cb { /* Virtual address of skb_shinfo(skb)->frags[0].page + offset. */ @@ -2797,7 +2798,6 @@ static inline void unregister_netdevice(struct net_device *dev) int netdev_refcnt_read(const struct net_device *dev); void free_netdev(struct net_device *dev); void netdev_freemem(struct net_device *dev); -void synchronize_net(void); int init_dummy_netdev(struct net_device *dev); struct net_device *netdev_get_xmit_slave(struct net_device *dev, -- cgit v1.2.3 From 4d092dd2041a5f328410ad16d63b8606c24e8604 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 9 Sep 2020 10:37:52 -0700 Subject: net: manage napi add/del idempotence explicitly To RCUify napi->dev_list we need to replace list_del_init() with list_del_rcu(). There is no _init() version for RCU for obvious reasons. Up until now netif_napi_del() was idempotent so to make sure it remains such add a bit which is set when NAPI is listed, and cleared when it removed. Since we don't expect multiple calls to netif_napi_add() to be correct, add a warning on that side. Now that napi_hash_add / napi_hash_del are only called by napi_add / del we can actually steal its bit. We just need to make sure hash node is initialized correctly. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 74ed95215091..157e0242e9ee 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -355,7 +355,7 @@ enum { NAPI_STATE_MISSED, /* reschedule a napi */ NAPI_STATE_DISABLE, /* Disable pending */ NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */ - NAPI_STATE_HASHED, /* In NAPI hash (busy polling possible) */ + NAPI_STATE_LISTED, /* NAPI added to system lists */ NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */ NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */ }; @@ -365,7 +365,7 @@ enum { NAPIF_STATE_MISSED = BIT(NAPI_STATE_MISSED), NAPIF_STATE_DISABLE = BIT(NAPI_STATE_DISABLE), NAPIF_STATE_NPSVC = BIT(NAPI_STATE_NPSVC), - NAPIF_STATE_HASHED = BIT(NAPI_STATE_HASHED), + NAPIF_STATE_LISTED = BIT(NAPI_STATE_LISTED), NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL), NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL), }; -- cgit v1.2.3 From e9b12edc133b54e15ecd105620d51fb8e8fa8bde Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Wed, 9 Sep 2020 17:50:46 -0700 Subject: tcp: record received TOS value in the request socket A new field is added to the request sock to record the TOS value received on the listening socket during 3WHS: When not under syn flood, it is recording the TOS value sent in SYN. When under syn flood, it is recording the TOS value sent in the ACK. This is a preparation patch in order to do TOS reflection in the later commit. Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 56ff2952edaf..2f87377e9af7 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -134,6 +134,7 @@ struct tcp_request_sock { * FastOpen it's the seq# * after data-in-SYN. */ + u8 syn_tos; }; static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req) -- cgit v1.2.3 From de033b7d1568a8f1252055c96cdd99954d5450c4 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Wed, 9 Sep 2020 17:50:47 -0700 Subject: ip: pass tos into ip_build_and_send_pkt() This commit adds tos as a new passed in parameter to ip_build_and_send_pkt() which will be used in the later commit. This is a pure restructure and does not have any functional change. Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/ip.h b/include/net/ip.h index b09c48d862cc..0f72bf8c0cbf 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -151,7 +151,7 @@ int igmp_mc_init(void); int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk, __be32 saddr, __be32 daddr, - struct ip_options_rcu *opt); + struct ip_options_rcu *opt, u8 tos); int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); void ip_list_rcv(struct list_head *head, struct packet_type *pt, -- cgit v1.2.3 From ac8f1710c12bb4c3626280ce03f05459ba8feef6 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Wed, 9 Sep 2020 17:50:48 -0700 Subject: tcp: reflect tos value received in SYN to the socket This commit adds a new TCP feature to reflect the tos value received in SYN, and send it out on the SYN-ACK, and eventually set the tos value of the established socket with this reflected tos value. This provides a way to set the traffic class/QoS level for all traffic in the same connection to be the same as the incoming SYN request. It could be useful in data centers to provide equivalent QoS according to the incoming request. This feature is guarded by /proc/sys/net/ipv4/tcp_reflect_tos, and is by default turned off. Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 9e36738c1fe1..8e4fcac4df72 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -183,6 +183,7 @@ struct netns_ipv4 { unsigned int sysctl_tcp_fastopen_blackhole_timeout; atomic_t tfo_active_disable_times; unsigned long tfo_active_disable_stamp; + int sysctl_tcp_reflect_tos; int sysctl_udp_wmem_min; int sysctl_udp_rmem_min; -- cgit v1.2.3 From d66423fbe11e141206f117b232916aa899d44959 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Thu, 10 Sep 2020 12:02:48 +0100 Subject: bpf: Plug hole in struct bpf_sk_lookup_kern As Alexei points out, struct bpf_sk_lookup_kern has two 4-byte holes. This leads to suboptimal instructions being generated (IPv4, x86): 1372 struct bpf_sk_lookup_kern ctx = { 0xffffffff81b87f30 <+624>: xor %eax,%eax 0xffffffff81b87f32 <+626>: mov $0x6,%ecx 0xffffffff81b87f37 <+631>: lea 0x90(%rsp),%rdi 0xffffffff81b87f3f <+639>: movl $0x110002,0x88(%rsp) 0xffffffff81b87f4a <+650>: rep stos %rax,%es:(%rdi) 0xffffffff81b87f4d <+653>: mov 0x8(%rsp),%eax 0xffffffff81b87f51 <+657>: mov %r13d,0x90(%rsp) 0xffffffff81b87f59 <+665>: incl %gs:0x7e4970a0(%rip) 0xffffffff81b87f60 <+672>: mov %eax,0x8c(%rsp) 0xffffffff81b87f67 <+679>: movzwl 0x10(%rsp),%eax 0xffffffff81b87f6c <+684>: mov %ax,0xa8(%rsp) 0xffffffff81b87f74 <+692>: movzwl 0x38(%rsp),%eax 0xffffffff81b87f79 <+697>: mov %ax,0xaa(%rsp) Fix this by moving around sport and dport. pahole confirms there are no more holes: struct bpf_sk_lookup_kern { u16 family; /* 0 2 */ u16 protocol; /* 2 2 */ __be16 sport; /* 4 2 */ u16 dport; /* 6 2 */ struct { __be32 saddr; /* 8 4 */ __be32 daddr; /* 12 4 */ } v4; /* 8 8 */ struct { const struct in6_addr * saddr; /* 16 8 */ const struct in6_addr * daddr; /* 24 8 */ } v6; /* 16 16 */ struct sock * selected_sk; /* 32 8 */ bool no_reuseport; /* 40 1 */ /* size: 48, cachelines: 1, members: 8 */ /* padding: 7 */ /* last cacheline: 48 bytes */ }; The assembly also doesn't contain the pesky rep stos anymore: 1372 struct bpf_sk_lookup_kern ctx = { 0xffffffff81b87f60 <+624>: movzwl 0x10(%rsp),%eax 0xffffffff81b87f65 <+629>: movq $0x0,0xa8(%rsp) 0xffffffff81b87f71 <+641>: movq $0x0,0xb0(%rsp) 0xffffffff81b87f7d <+653>: mov %ax,0x9c(%rsp) 0xffffffff81b87f85 <+661>: movzwl 0x38(%rsp),%eax 0xffffffff81b87f8a <+666>: movq $0x0,0xb8(%rsp) 0xffffffff81b87f96 <+678>: mov %ax,0x9e(%rsp) 0xffffffff81b87f9e <+686>: mov 0x8(%rsp),%eax 0xffffffff81b87fa2 <+690>: movq $0x0,0xc0(%rsp) 0xffffffff81b87fae <+702>: movl $0x110002,0x98(%rsp) 0xffffffff81b87fb9 <+713>: mov %eax,0xa0(%rsp) 0xffffffff81b87fc0 <+720>: mov %r13d,0xa4(%rsp) 1: https://lore.kernel.org/bpf/CAADnVQKE6y9h2fwX6OS837v-Uf+aBXnT_JXiN_bbo2gitZQ3tA@mail.gmail.com/ Fixes: e9ddbb7707ff ("bpf: Introduce SK_LOOKUP program type with a dedicated attach point") Suggested-by: Alexei Starovoitov Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/20200910110248.198326-1-lmb@cloudflare.com --- include/linux/filter.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 995625950cc1..e962dd8117d8 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1287,6 +1287,8 @@ int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len); struct bpf_sk_lookup_kern { u16 family; u16 protocol; + __be16 sport; + u16 dport; struct { __be32 saddr; __be32 daddr; @@ -1295,8 +1297,6 @@ struct bpf_sk_lookup_kern { const struct in6_addr *saddr; const struct in6_addr *daddr; } v6; - __be16 sport; - u16 dport; struct sock *selected_sk; bool no_reuseport; }; -- cgit v1.2.3 From 1aef5b4391f0c75c0a1523706a7b0311846ee12f Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 10 Sep 2020 13:33:14 -0700 Subject: bpf: Fix comment for helper bpf_current_task_under_cgroup() This should be "current" not "skb". Fixes: c6b5fb8690fa ("bpf: add documentation for eBPF helpers (42-50)") Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Cc: Link: https://lore.kernel.org/bpf/20200910203314.70018-1-songliubraving@fb.com --- include/uapi/linux/bpf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 90359cab501d..7dd314176df7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1447,8 +1447,8 @@ union bpf_attr { * Return * The return value depends on the result of the test, and can be: * - * * 0, if the *skb* task belongs to the cgroup2. - * * 1, if the *skb* task does not belong to the cgroup2. + * * 0, if current task belongs to the cgroup2. + * * 1, if current task does not belong to the cgroup2. * * A negative error code, if an error occurred. * * long bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags) -- cgit v1.2.3 From 8919a9b31eb4fb4c0a93e5fb350a626924302aa6 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Thu, 10 Sep 2020 15:35:32 -0400 Subject: tcp: Only init congestion control if not initialized already Change tcp_init_transfer() to only initialize congestion control if it has not been initialized already. With this new approach, we can arrange things so that if the EBPF code sets the congestion control by calling setsockopt(TCP_CONGESTION) then tcp_init_transfer() will not re-initialize the CC module. This is an approach that has the following beneficial properties: (1) This allows CC module customizations made by the EBPF called in tcp_init_transfer() to persist, and not be wiped out by a later call to tcp_init_congestion_control() in tcp_init_transfer(). (2) Does not flip the order of EBPF and CC init, to avoid causing bugs for existing code upstream that depends on the current order. (3) Does not cause 2 initializations for for CC in the case where the EBPF called in tcp_init_transfer() wants to set the CC to a new CC algorithm. (4) Allows follow-on simplifications to the code in net/core/filter.c and net/ipv4/tcp_cong.c, which currently both have some complexity to special-case CC initialization to avoid double CC initialization if EBPF sets the CC. Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: Alexei Starovoitov Acked-by: Yuchung Cheng Acked-by: Kevin Yang Cc: Lawrence Brakmo --- include/net/inet_connection_sock.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index c738abeb3265..dc763ca9413c 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -96,7 +96,8 @@ struct inet_connection_sock { void (*icsk_clean_acked)(struct sock *sk, u32 acked_seq); struct hlist_node icsk_listen_portaddr_node; unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu); - __u8 icsk_ca_state:6, + __u8 icsk_ca_state:5, + icsk_ca_initialized:1, icsk_ca_setsockopt:1, icsk_ca_dst_locked:1; __u8 icsk_retransmits; -- cgit v1.2.3 From 29a949325c6c90f1431db9af64592275c83d9b2a Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Thu, 10 Sep 2020 15:35:34 -0400 Subject: tcp: simplify tcp_set_congestion_control(): Always reinitialize Now that the previous patches ensure that all call sites for tcp_set_congestion_control() want to initialize congestion control, we can simplify tcp_set_congestion_control() by removing the reinit argument and the code to support it. Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: Alexei Starovoitov Acked-by: Yuchung Cheng Acked-by: Kevin Yang Cc: Lawrence Brakmo --- include/net/tcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index e85d564446c6..f857146c17a5 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1104,7 +1104,7 @@ void tcp_get_available_congestion_control(char *buf, size_t len); void tcp_get_allowed_congestion_control(char *buf, size_t len); int tcp_set_allowed_congestion_control(char *allowed); int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, - bool reinit, bool cap_net_admin); + bool cap_net_admin); u32 tcp_slow_start(struct tcp_sock *tp, u32 acked); void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked); -- cgit v1.2.3 From d5ea32da878d4572761ca4580631748e398542de Mon Sep 17 00:00:00 2001 From: Daniel Winkler Date: Tue, 25 Aug 2020 16:31:51 -0700 Subject: Bluetooth: Add MGMT capability flags for tx power and ext advertising For new advertising features, it will be important for userspace to know the capabilities of the controller and kernel. If the controller and kernel support extended advertising, we include flags indicating hardware offloading support and support for setting tx power of adv instances. In the future, vendor-specific commands may allow the setting of tx power in advertising instances, but for now this feature is only marked available if extended advertising is supported. This change is manually verified in userspace by ensuring the advertising manager's supported_flags field is updated with new flags on hatch chromebook (ext advertising supported). Signed-off-by: Daniel Winkler Signed-off-by: Marcel Holtmann --- include/net/bluetooth/mgmt.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index beae5c3980f0..9ad505b9e694 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -572,6 +572,8 @@ struct mgmt_rp_add_advertising { #define MGMT_ADV_FLAG_SEC_1M BIT(7) #define MGMT_ADV_FLAG_SEC_2M BIT(8) #define MGMT_ADV_FLAG_SEC_CODED BIT(9) +#define MGMT_ADV_FLAG_CAN_SET_TX_POWER BIT(10) +#define MGMT_ADV_FLAG_HW_OFFLOAD BIT(11) #define MGMT_ADV_FLAG_SEC_MASK (MGMT_ADV_FLAG_SEC_1M | MGMT_ADV_FLAG_SEC_2M | \ MGMT_ADV_FLAG_SEC_CODED) -- cgit v1.2.3 From 568a36a69bad4f2efcfa4f94c83aa150a463735c Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 10 Sep 2020 19:48:54 +0300 Subject: net: dsa: tag_8021q: include missing refcount.h The previous assumption was that the caller would already have this header file included. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/dsa/8021q.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index 311aa04e7520..804750122c66 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -5,6 +5,7 @@ #ifndef _NET_DSA_8021Q_H #define _NET_DSA_8021Q_H +#include #include struct dsa_switch; -- cgit v1.2.3 From 7e092af2f3b33694b9117ffd978d42b04ec4f260 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 10 Sep 2020 19:48:55 +0300 Subject: net: dsa: tag_8021q: setup tagging via a single function call There is no point in calling dsa_port_setup_8021q_tagging for each individual port. Additionally, it will become more difficult to do that when we'll have a context structure to tag_8021q (next patch). So refactor this now. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/dsa/8021q.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index 804750122c66..8586d8cdf956 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -25,8 +25,7 @@ struct dsa_8021q_crosschip_link { #if IS_ENABLED(CONFIG_NET_DSA_TAG_8021Q) -int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int index, - bool enabled); +int dsa_8021q_setup(struct dsa_switch *ds, bool enabled); int dsa_8021q_crosschip_bridge_join(struct dsa_switch *ds, int port, struct dsa_switch *other_ds, @@ -57,8 +56,7 @@ bool vid_is_dsa_8021q(u16 vid); #else -int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int index, - bool enabled) +int dsa_8021q_setup(struct dsa_switch *ds, bool enabled) { return 0; } -- cgit v1.2.3 From 5899ee367ab3fec885aa04d9a2b573bf2e464e7f Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 10 Sep 2020 19:48:56 +0300 Subject: net: dsa: tag_8021q: add a context structure While working on another tag_8021q driver implementation, some things became apparent: - It is not mandatory for a DSA driver to offload the tag_8021q VLANs by using the VLAN table per se. For example, it can add custom TCAM rules that simply encapsulate RX traffic, and redirect & decapsulate rules for TX traffic. For such a driver, it makes no sense to receive the tag_8021q configuration through the same callback as it receives the VLAN configuration from the bridge and the 8021q modules. - Currently, sja1105 (the only tag_8021q user) sets a priv->expect_dsa_8021q variable to distinguish between the bridge calling, and tag_8021q calling. That can be improved, to say the least. - The crosschip bridging operations are, in fact, stateful already. The list of crosschip_links must be kept by the caller and passed to the relevant tag_8021q functions. So it would be nice if the tag_8021q configuration was more self-contained. This patch attempts to do that. Create a struct dsa_8021q_context which encapsulates a struct dsa_switch, and has 2 function pointers for adding and deleting a VLAN. These will replace the previous channel to the driver, which was through the .port_vlan_add and .port_vlan_del callbacks of dsa_switch_ops. Also put the list of crosschip_links into this dsa_8021q_context. Drivers that don't support cross-chip bridging can simply omit to initialize this list, as long as they dont call any cross-chip function. The sja1105_vlan_add and sja1105_vlan_del functions are refactored into a smaller sja1105_vlan_add_one, which now has 2 entry points: - sja1105_vlan_add, from struct dsa_switch_ops - sja1105_dsa_8021q_vlan_add, from the tag_8021q ops But even this change is fairly trivial. It just reflects the fact that for sja1105, the VLANs from these 2 channels end up in the same hardware table. However that is not necessarily true in the general sense (and that's the reason for making this change). The rest of the patch is mostly plain refactoring of "ds" -> "ctx". The dsa_8021q_context structure needs to be propagated because adding a VLAN is now done through the ops function pointers inside of it. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/dsa/8021q.h | 46 +++++++++++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index 8586d8cdf956..2b003ae9fb38 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -12,30 +12,40 @@ struct dsa_switch; struct sk_buff; struct net_device; struct packet_type; +struct dsa_8021q_context; struct dsa_8021q_crosschip_link { struct list_head list; int port; - struct dsa_switch *other_ds; + struct dsa_8021q_context *other_ctx; int other_port; refcount_t refcount; }; +struct dsa_8021q_ops { + int (*vlan_add)(struct dsa_switch *ds, int port, u16 vid, u16 flags); + int (*vlan_del)(struct dsa_switch *ds, int port, u16 vid); +}; + +struct dsa_8021q_context { + const struct dsa_8021q_ops *ops; + struct dsa_switch *ds; + struct list_head crosschip_links; +}; + #define DSA_8021Q_N_SUBVLAN 8 #if IS_ENABLED(CONFIG_NET_DSA_TAG_8021Q) -int dsa_8021q_setup(struct dsa_switch *ds, bool enabled); +int dsa_8021q_setup(struct dsa_8021q_context *ctx, bool enabled); -int dsa_8021q_crosschip_bridge_join(struct dsa_switch *ds, int port, - struct dsa_switch *other_ds, - int other_port, - struct list_head *crosschip_links); +int dsa_8021q_crosschip_bridge_join(struct dsa_8021q_context *ctx, int port, + struct dsa_8021q_context *other_ctx, + int other_port); -int dsa_8021q_crosschip_bridge_leave(struct dsa_switch *ds, int port, - struct dsa_switch *other_ds, - int other_port, - struct list_head *crosschip_links); +int dsa_8021q_crosschip_bridge_leave(struct dsa_8021q_context *ctx, int port, + struct dsa_8021q_context *other_ctx, + int other_port); struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, u16 tpid, u16 tci); @@ -56,23 +66,21 @@ bool vid_is_dsa_8021q(u16 vid); #else -int dsa_8021q_setup(struct dsa_switch *ds, bool enabled) +int dsa_8021q_setup(struct dsa_8021q_context *ctx, bool enabled) { return 0; } -int dsa_8021q_crosschip_bridge_join(struct dsa_switch *ds, int port, - struct dsa_switch *other_ds, - int other_port, - struct list_head *crosschip_links) +int dsa_8021q_crosschip_bridge_join(struct dsa_8021q_context *ctx, int port, + struct dsa_8021q_context *other_ctx, + int other_port) { return 0; } -int dsa_8021q_crosschip_bridge_leave(struct dsa_switch *ds, int port, - struct dsa_switch *other_ds, - int other_port, - struct list_head *crosschip_links) +int dsa_8021q_crosschip_bridge_leave(struct dsa_8021q_context *ctx, int port, + struct dsa_8021q_context *other_ctx, + int other_port) { return 0; } -- cgit v1.2.3 From 1623ad8ec04c771a54975fb84b22bc21c2dbcac1 Mon Sep 17 00:00:00 2001 From: Divya Koppera Date: Fri, 11 Sep 2020 18:48:44 +0530 Subject: net: phy: mchp: Add support for LAN8814 QUAD PHY LAN8814 is a low-power, quad-port triple-speed (10BASE-T/100BASETX/1000BASE-T) Ethernet physical layer transceiver (PHY). It supports transmission and reception of data on standard CAT-5, as well as CAT-5e and CAT-6, unshielded twisted pair (UTP) cables. LAN8814 supports industry-standard QSGMII (Quad Serial Gigabit Media Independent Interface) and Q-USGMII (Quad Universal Serial Gigabit Media Independent Interface) providing chip-to-chip connection to four Gigabit Ethernet MACs using a single serialized link (differential pair) in each direction. The LAN8814 SKU supports high-accuracy timestamping functions to support IEEE-1588 solutions using Microchip Ethernet switches, as well as customer solutions based on SoCs and FPGAs. The LAN8804 SKU has same features as that of LAN8814 SKU except that it does not support 1588, SyncE, or Q-USGMII with PCH/MCH. This adds support for 10BASE-T, 100BASE-TX, and 1000BASE-T, QSGMII link with the MAC. Signed-off-by: Divya Koppera Signed-off-by: David S. Miller --- include/linux/micrel_phy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/micrel_phy.h b/include/linux/micrel_phy.h index 75f880c25bb8..416ee6dd2574 100644 --- a/include/linux/micrel_phy.h +++ b/include/linux/micrel_phy.h @@ -27,6 +27,7 @@ #define PHY_ID_KSZ8061 0x00221570 #define PHY_ID_KSZ9031 0x00221620 #define PHY_ID_KSZ9131 0x00221640 +#define PHY_ID_LAN8814 0x00221660 #define PHY_ID_KSZ886X 0x00221430 #define PHY_ID_KSZ8863 0x00221435 -- cgit v1.2.3 From 346ce5b7d624e8cc2ec5a6abd0ea00f0e06ea8ac Mon Sep 17 00:00:00 2001 From: Abhishek Pandit-Subedi Date: Fri, 11 Sep 2020 14:07:11 -0700 Subject: Bluetooth: Add mgmt suspend and resume events Add the controller suspend and resume events, which will signal when Bluetooth has completed preparing for suspend and when it's ready for resume. Signed-off-by: Abhishek Pandit-Subedi Reviewed-by: Miao-chen Chou Reviewed-by: Sonny Sasaka Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 3 +++ include/net/bluetooth/mgmt.h | 11 +++++++++++ 2 files changed, 14 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 8caac20556b4..02a6ee056b23 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1750,6 +1750,9 @@ void mgmt_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, void mgmt_remote_name(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, s8 rssi, u8 *name, u8 name_len); void mgmt_discovering(struct hci_dev *hdev, u8 discovering); +void mgmt_suspending(struct hci_dev *hdev, u8 state); +void mgmt_resuming(struct hci_dev *hdev, u8 reason, bdaddr_t *bdaddr, + u8 addr_type); bool mgmt_powering_down(struct hci_dev *hdev); void mgmt_new_ltk(struct hci_dev *hdev, struct smp_ltk *key, bool persistent); void mgmt_new_irk(struct hci_dev *hdev, struct smp_irk *irk, bool persistent); diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index 9ad505b9e694..e19e33c7b65c 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -1030,3 +1030,14 @@ struct mgmt_ev_adv_monitor_added { struct mgmt_ev_adv_monitor_removed { __le16 monitor_handle; } __packed; + +#define MGMT_EV_CONTROLLER_SUSPEND 0x002d +struct mgmt_ev_controller_suspend { + __u8 suspend_state; +} __packed; + +#define MGMT_EV_CONTROLLER_RESUME 0x002e +struct mgmt_ev_controller_resume { + __u8 wake_reason; + struct mgmt_addr_info addr; +} __packed; -- cgit v1.2.3 From f0cfc486f796dc3b67a4017357a6a8e76a8141c5 Mon Sep 17 00:00:00 2001 From: Abhishek Pandit-Subedi Date: Fri, 11 Sep 2020 14:07:12 -0700 Subject: Bluetooth: Add suspend reason for device disconnect Update device disconnect event with reason 0x5 to indicate that device disconnected because the controller is suspending. Signed-off-by: Abhishek Pandit-Subedi Reviewed-by: Miao-chen Chou Reviewed-by: Sonny Sasaka Signed-off-by: Marcel Holtmann --- include/net/bluetooth/mgmt.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index e19e33c7b65c..a4b8935e0db9 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -842,6 +842,7 @@ struct mgmt_ev_device_connected { #define MGMT_DEV_DISCONN_LOCAL_HOST 0x02 #define MGMT_DEV_DISCONN_REMOTE 0x03 #define MGMT_DEV_DISCONN_AUTH_FAILURE 0x04 +#define MGMT_DEV_DISCONN_LOCAL_HOST_SUSPEND 0x05 #define MGMT_EV_DEVICE_DISCONNECTED 0x000C struct mgmt_ev_device_disconnected { -- cgit v1.2.3 From 2f20216c1d6fe17c1a224f658be0dc6fab2269c7 Mon Sep 17 00:00:00 2001 From: Abhishek Pandit-Subedi Date: Fri, 11 Sep 2020 14:07:13 -0700 Subject: Bluetooth: Emit controller suspend and resume events Emit controller suspend and resume events when we are ready for suspend and we've resumed from suspend. The controller suspend event will report whatever suspend state was successfully entered. The controller resume event will check the first HCI event that was received after we finished preparing for suspend and, if it was a connection event, store the address of the peer that caused the event. If it was not a connection event, we mark the wake reason as an unexpected event. Here is a sample btmon trace with these events: @ MGMT Event: Controller Suspended (0x002d) plen 1 Suspend state: Page scanning and/or passive scanning (2) @ MGMT Event: Controller Resumed (0x002e) plen 8 Wake reason: Remote wake due to peer device connection (2) LE Address: CD:F3:CD:13:C5:9A (OUI CD-F3-CD) Signed-off-by: Abhishek Pandit-Subedi Reviewed-by: Miao-chen Chou Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 3 +++ include/net/bluetooth/mgmt.h | 4 ++++ 2 files changed, 7 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 02a6ee056b23..9873e1c8cd16 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -484,6 +484,9 @@ struct hci_dev { enum suspended_state suspend_state; bool scanning_paused; bool suspended; + u8 wake_reason; + bdaddr_t wake_addr; + u8 wake_addr_type; wait_queue_head_t suspend_wait_q; DECLARE_BITMAP(suspend_tasks, __SUSPEND_NUM_TASKS); diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index a4b8935e0db9..6b55155e05e9 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -1042,3 +1042,7 @@ struct mgmt_ev_controller_resume { __u8 wake_reason; struct mgmt_addr_info addr; } __packed; + +#define MGMT_WAKE_REASON_NON_BT_WAKE 0x0 +#define MGMT_WAKE_REASON_UNEXPECTED 0x1 +#define MGMT_WAKE_REASON_REMOTE_WAKE 0x2 -- cgit v1.2.3 From 96a9c425e234aac193afb9da6ca11fafc679c7e2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 14 Sep 2020 13:02:58 +0100 Subject: rxrpc: Fix a missing NULL-pointer check in a trace Fix the rxrpc_client tracepoint to not dereference conn to get the cid if conn is NULL, as it does for other fields. RIP: 0010:trace_event_raw_event_rxrpc_client+0x7e/0xe0 [rxrpc] Call Trace: rxrpc_activate_channels+0x62/0xb0 [rxrpc] rxrpc_connect_call+0x481/0x650 [rxrpc] ? wake_up_q+0xa0/0xa0 ? rxrpc_kernel_begin_call+0x12a/0x1b0 [rxrpc] rxrpc_new_client_call+0x2a5/0x5e0 [rxrpc] Fixes: 245500d853e9 ("rxrpc: Rewrite the client connection manager") Reported-by: Marc Dionne Signed-off-by: David Howells Tested-by: Marc Dionne --- include/trace/events/rxrpc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 3b67d5981224..e70c90116eda 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -579,7 +579,7 @@ TRACE_EVENT(rxrpc_client, __entry->channel = channel; __entry->usage = conn ? atomic_read(&conn->usage) : -2; __entry->op = op; - __entry->cid = conn->proto.cid; + __entry->cid = conn ? conn->proto.cid : 0; ), TP_printk("C=%08x h=%2d %s i=%08x u=%d", -- cgit v1.2.3 From 1fa5cef283420b3dad93cd6ab04d7125bc1562de Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Tue, 18 Aug 2020 15:07:57 +0800 Subject: i40e: optimise prefetch page refcount refcount of rx_buffer page will be added here originally, so prefetchw is needed, but after commit 1793668c3b8c ("i40e/i40evf: Update code to better handle incrementing page count"), and refcount is not added every time, so change prefetchw as prefetch. Now it mainly services page_address(), but which accesses struct page only when WANT_PAGE_VIRTUAL or HASHED_PAGE_VIRTUAL is defined otherwise it returns address based on offset, so we prefetch it conditionally. Jakub suggested to define prefetch_page_address in a common header. Reported-by: kernel test robot Suggested-by: Jakub Kicinski Signed-off-by: Li RongQing Reviewed-by: Jesse Brandeburg Tested-by: Aaron Brown Signed-off-by: Tony Nguyen --- include/linux/prefetch.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/prefetch.h b/include/linux/prefetch.h index 13eafebf3549..b83a3f944f28 100644 --- a/include/linux/prefetch.h +++ b/include/linux/prefetch.h @@ -15,6 +15,7 @@ #include #include +struct page; /* prefetch(x) attempts to pre-emptively get the memory pointed to by address "x" into the CPU L1 cache. @@ -62,4 +63,11 @@ static inline void prefetch_range(void *addr, size_t len) #endif } +static inline void prefetch_page_address(struct page *page) +{ +#if defined(WANT_PAGE_VIRTUAL) || defined(HASHED_PAGE_VIRTUAL) + prefetch(page); +#endif +} + #endif -- cgit v1.2.3 From c76c6956566f974bac2470bd72fc22fb923e04a1 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 14 Sep 2020 10:01:18 +0200 Subject: mptcp: call tcp_cleanup_rbuf on subflows That is needed to let the subflows announce promptly when new space is available in the receive buffer. tcp_cleanup_rbuf() is currently a static function, drop the scope modifier and add a declaration in the TCP header. Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/tcp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index e85d564446c6..852f0d71dd40 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1414,6 +1414,8 @@ static inline int tcp_full_space(const struct sock *sk) return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf)); } +void tcp_cleanup_rbuf(struct sock *sk, int copied); + /* We provision sk_rcvbuf around 200% of sk_rcvlowat. * If 87.5 % (7/8) of the space has been consumed, we want to override * SO_RCVLOWAT constraint, since we are receiving skbs with too small -- cgit v1.2.3 From 0cbe6a8f089e5912a577537c97833546d558c357 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 14 Sep 2020 03:20:27 -0700 Subject: tcp: remove SOCK_QUEUE_SHRUNK SOCK_QUEUE_SHRUNK is currently used by TCP as a temporary state that remembers if some room has been made in the rtx queue by an incoming ACK packet. This is later used from tcp_check_space() before considering to send EPOLLOUT. Problem is: If we receive SACK packets, and no packet is removed from RTX queue, we can send fresh packets, thus moving them from write queue to rtx queue and eventually empty the write queue. This stall can happen if TCP_NOTSENT_LOWAT is used. With this fix, we no longer risk stalling sends while holes are repaired, and we can fully use socket sndbuf. This also removes a cache line dirtying for typical RPC workloads. Fixes: c9bee3b7fdec ("tcp: TCP_NOTSENT_LOWAT socket option") Signed-off-by: Eric Dumazet Cc: Soheil Hassas Yeganeh Acked-by: Neal Cardwell Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/net/sock.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 7dd3051551fb..eaa5cac5e836 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -845,7 +845,6 @@ enum sock_flags { SOCK_RCVTSTAMP, /* %SO_TIMESTAMP setting */ SOCK_RCVTSTAMPNS, /* %SO_TIMESTAMPNS setting */ SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */ - SOCK_QUEUE_SHRUNK, /* write queue has been shrunk recently */ SOCK_MEMALLOC, /* VM depends on this socket for swapping */ SOCK_TIMESTAMPING_RX_SOFTWARE, /* %SOF_TIMESTAMPING_RX_SOFTWARE */ SOCK_FASYNC, /* fasync() active */ @@ -1526,7 +1525,6 @@ static inline void sk_mem_uncharge(struct sock *sk, int size) DECLARE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key); static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb) { - sock_set_flag(sk, SOCK_QUEUE_SHRUNK); sk_wmem_queued_add(sk, -skb->truesize); sk_mem_uncharge(sk, skb->truesize); if (static_branch_unlikely(&tcp_tx_skb_cache_key) && -- cgit v1.2.3 From fb609b5112bd74b4ba93c86d7af4089ffd9432c2 Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Wed, 13 May 2020 11:06:47 +0300 Subject: net/mlx5: Always use container_of to find mdev pointer from clock struct Clock struct is part of struct mlx5_core_dev. Code was inconsistent, on some cases used container_of and on another used clock->mdev. Align code to use container_of amd remove clock->mdev pointer. While here, fix reverse xmas tree coding style. Signed-off-by: Eran Ben Elisha Reviewed-by: Moshe Shemesh --- include/linux/mlx5/driver.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index c145de0473bc..8dc3da6e6480 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -643,7 +643,6 @@ struct mlx5_pps { }; struct mlx5_clock { - struct mlx5_core_dev *mdev; struct mlx5_nb pps_nb; seqlock_t lock; struct cyclecounter cycles; -- cgit v1.2.3 From b7cf0806e8783e38f881cae3c56f0869e70b8da2 Mon Sep 17 00:00:00 2001 From: Ofer Levi Date: Sun, 17 May 2020 10:16:49 +0300 Subject: net/mlx5e: Add CQE compression support for multi-strides packets Add CQE compression support for completions of packets that span multiple strides in a Striding RQ, per the HW capability. In our memory model, we use small strides (256B as of today) for the non-linear SKB mode. This feature allows CQE compression to work also for multiple strides packets. In this case decompressing the mini CQE array will use stride index provided by HW as part of the mini CQE. Before this feature, compression was possible only for single-strided packets, i.e. for packets of size up to 256 bytes when in non-linear mode, and the index was maintained by SW. This feature is supported for ConnectX-5 and above. Feature performance test: This was whitebox-tested, we reduced the PCI speed from 125Gb/s to 62.5Gb/s to overload pci and manipulated mlx5 driver to drop incoming packets before building the SKB to achieve low cpu utilization. Outcome is low cpu utilization and bottleneck on pci only. Test setup: Server: Intel(R) Xeon(R) Silver 4108 CPU @ 1.80GHz server, 32 cores NIC: ConnectX-6 DX. Sender side generates 300 byte packets at full pci bandwidth. Receiver side configuration: Single channel, one cpu processing with one ring allocated. Cpu utilization is ~20% while pci bandwidth is fully utilized. For the generated traffic and interface MTU of 4500B (to activate the non-linear SKB mode), packet rate improvement is about 19% from ~17.6Mpps to ~21Mpps. Without this feature, counters show no CQE compression blocks for this setup, while with the feature, counters show ~20.7Mpps compressed CQEs in ~500K compression blocks. Signed-off-by: Ofer Levi Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 4d3376e20f5e..81ca5989009b 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -816,7 +816,7 @@ struct mlx5_mini_cqe8 { __be32 rx_hash_result; struct { __be16 checksum; - __be16 rsvd; + __be16 stridx; }; struct { __be16 wqe_counter; @@ -836,6 +836,7 @@ enum { enum { MLX5_CQE_FORMAT_CSUM = 0x1, + MLX5_CQE_FORMAT_CSUM_STRIDX = 0x3, }; #define MLX5_MINI_CQE_ARRAY_SIZE 8 -- cgit v1.2.3 From d05e8e68b07cef9d52bbf53e75fa5faea81e1da6 Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Thu, 10 Sep 2020 19:23:48 +0200 Subject: bridge: Add SWITCHDEV_FDB_FLUSH_TO_BRIDGE notifier so the switchdev can notifiy the bridge to flush non-permanent fdb entries for this port. This is useful whenever the hardware fdb of the switchdev is reset, but the netdev and the bridgeport are not deleted. Note that this has the same effect as the IFLA_BRPORT_FLUSH attribute. CC: Jiri Pirko CC: Ivan Vecera CC: Roopa Prabhu CC: Nikolay Aleksandrov Signed-off-by: Alexandra Winter Signed-off-by: Julian Wiedmann Acked-by: Nikolay Aleksandrov Acked-by: Ivan Vecera Signed-off-by: David S. Miller --- include/net/switchdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index ff2246914301..53e8b4994296 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -203,6 +203,7 @@ enum switchdev_notifier_type { SWITCHDEV_FDB_ADD_TO_DEVICE, SWITCHDEV_FDB_DEL_TO_DEVICE, SWITCHDEV_FDB_OFFLOADED, + SWITCHDEV_FDB_FLUSH_TO_BRIDGE, SWITCHDEV_PORT_OBJ_ADD, /* Blocking. */ SWITCHDEV_PORT_OBJ_DEL, /* Blocking. */ -- cgit v1.2.3 From 9a27a33027f22a716ce362be48d70ae0eb012ab7 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 14 Sep 2020 17:11:52 -0700 Subject: ethtool: add standard pause stats Currently drivers have to report their pause frames statistics via ethtool -S, and there is a wide variety of names used for these statistics. Add the two statistics defined in IEEE 802.3x to the standard API. Create a new ethtool request header flag for including statistics in the response to GET commands. Always create the ETHTOOL_A_PAUSE_STATS nest in replies when flag is set. Testing if driver declares the op is not a reliable way of checking if any stats will actually be included and therefore we don't want to give the impression that presence of ETHTOOL_A_PAUSE_STATS indicates driver support. Note that this patch does not include PFC counters, which may fit better in dcbnl? But mostly I don't need them/have a setup to test them so I haven't looked deeply into exposing them :) v3: - add a helper for "uninitializing" stats, rather than a cryptic memset() (Andrew) Signed-off-by: Jakub Kicinski Reviewed-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/ethtool.h | 26 ++++++++++++++++++++++++++ include/uapi/linux/ethtool_netlink.h | 18 +++++++++++++++++- 2 files changed, 43 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 969a80211df6..060b20f0b20f 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -241,6 +241,27 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, ETHTOOL_COALESCE_PKT_RATE_LOW | ETHTOOL_COALESCE_PKT_RATE_HIGH | \ ETHTOOL_COALESCE_RATE_SAMPLE_INTERVAL) +#define ETHTOOL_STAT_NOT_SET (~0ULL) + +/** + * struct ethtool_pause_stats - statistics for IEEE 802.3x pause frames + * @tx_pause_frames: transmitted pause frame count. Reported to user space + * as %ETHTOOL_A_PAUSE_STAT_TX_FRAMES. + * + * Equivalent to `30.3.4.2 aPAUSEMACCtrlFramesTransmitted` + * from the standard. + * + * @rx_pause_frames: received pause frame count. Reported to user space + * as %ETHTOOL_A_PAUSE_STAT_RX_FRAMES. Equivalent to: + * + * Equivalent to `30.3.4.3 aPAUSEMACCtrlFramesReceived` + * from the standard. + */ +struct ethtool_pause_stats { + u64 tx_pause_frames; + u64 rx_pause_frames; +}; + /** * struct ethtool_ops - optional netdev operations * @supported_coalesce_params: supported types of interrupt coalescing. @@ -282,6 +303,9 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, * Returns a negative error code or zero. * @get_ringparam: Report ring sizes * @set_ringparam: Set ring sizes. Returns a negative error code or zero. + * @get_pause_stats: Report pause frame statistics. Drivers must not zero + * statistics which they don't report. The stats structure is initialized + * to ETHTOOL_STAT_NOT_SET indicating driver does not report statistics. * @get_pauseparam: Report pause parameters * @set_pauseparam: Set pause parameters. Returns a negative error code * or zero. @@ -418,6 +442,8 @@ struct ethtool_ops { struct ethtool_ringparam *); int (*set_ringparam)(struct net_device *, struct ethtool_ringparam *); + void (*get_pause_stats)(struct net_device *dev, + struct ethtool_pause_stats *pause_stats); void (*get_pauseparam)(struct net_device *, struct ethtool_pauseparam*); int (*set_pauseparam)(struct net_device *, diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 5dcd24cb33ea..9cee6df01a10 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -91,9 +91,12 @@ enum { #define ETHTOOL_FLAG_COMPACT_BITSETS (1 << 0) /* provide optional reply for SET or ACT requests */ #define ETHTOOL_FLAG_OMIT_REPLY (1 << 1) +/* request statistics, if supported by the driver */ +#define ETHTOOL_FLAG_STATS (1 << 2) #define ETHTOOL_FLAG_ALL (ETHTOOL_FLAG_COMPACT_BITSETS | \ - ETHTOOL_FLAG_OMIT_REPLY) + ETHTOOL_FLAG_OMIT_REPLY | \ + ETHTOOL_FLAG_STATS) enum { ETHTOOL_A_HEADER_UNSPEC, @@ -376,12 +379,25 @@ enum { ETHTOOL_A_PAUSE_AUTONEG, /* u8 */ ETHTOOL_A_PAUSE_RX, /* u8 */ ETHTOOL_A_PAUSE_TX, /* u8 */ + ETHTOOL_A_PAUSE_STATS, /* nest - _PAUSE_STAT_* */ /* add new constants above here */ __ETHTOOL_A_PAUSE_CNT, ETHTOOL_A_PAUSE_MAX = (__ETHTOOL_A_PAUSE_CNT - 1) }; +enum { + ETHTOOL_A_PAUSE_STAT_UNSPEC, + ETHTOOL_A_PAUSE_STAT_PAD, + + ETHTOOL_A_PAUSE_STAT_TX_FRAMES, + ETHTOOL_A_PAUSE_STAT_RX_FRAMES, + + /* add new constants above here */ + __ETHTOOL_A_PAUSE_STAT_CNT, + ETHTOOL_A_PAUSE_STAT_MAX = (__ETHTOOL_A_PAUSE_STAT_CNT - 1) +}; + /* EEE */ enum { -- cgit v1.2.3 From e2ce94dc1d89e0f76ddd202cea72e0f505083d0a Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 15 Sep 2020 11:40:57 +0300 Subject: devlink: introduce the health reporter test command Introduce a test command for health reporters. User might use this command to trigger test event on a reporter if the reporter supports it. Signed-off-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/devlink.h | 3 +++ include/uapi/linux/devlink.h | 2 ++ 2 files changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index eaec0a8cc5ef..48b1c1ef1ebd 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -566,6 +566,7 @@ enum devlink_health_reporter_state { * @dump: callback to dump an object * if priv_ctx is NULL, run a full dump * @diagnose: callback to diagnose the current status + * @test: callback to trigger a test event */ struct devlink_health_reporter_ops { @@ -578,6 +579,8 @@ struct devlink_health_reporter_ops { int (*diagnose)(struct devlink_health_reporter *reporter, struct devlink_fmsg *fmsg, struct netlink_ext_ack *extack); + int (*test)(struct devlink_health_reporter *reporter, + struct netlink_ext_ack *extack); }; /** diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 40d35145c879..631f5bdf1707 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -122,6 +122,8 @@ enum devlink_command { DEVLINK_CMD_TRAP_POLICER_NEW, DEVLINK_CMD_TRAP_POLICER_DEL, + DEVLINK_CMD_HEALTH_REPORTER_TEST, + /* add new commands above here */ __DEVLINK_CMD_MAX, DEVLINK_CMD_MAX = __DEVLINK_CMD_MAX - 1 -- cgit v1.2.3 From 7d61588f690def55ba2885f7f4b03d13ff45b163 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 15 Sep 2020 14:40:59 +0300 Subject: nexthop: Remove unused function declaration from header file Not used or implemented anywhere. Signed-off-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/net/nexthop.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/net/nexthop.h b/include/net/nexthop.h index 3a4f9e3b91a5..2e44efe5709b 100644 --- a/include/net/nexthop.h +++ b/include/net/nexthop.h @@ -109,9 +109,6 @@ enum nexthop_event_type { NEXTHOP_EVENT_DEL }; -int call_nexthop_notifier(struct notifier_block *nb, struct net *net, - enum nexthop_event_type event_type, - struct nexthop *nh); int register_nexthop_notifier(struct net *net, struct notifier_block *nb); int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb); -- cgit v1.2.3 From 52f7232a790a36da30eb64c6de6067a9e4ad194c Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 15 Sep 2020 14:41:00 +0300 Subject: nexthop: Remove NEXTHOP_EVENT_ADD Not used anywhere. Signed-off-by: Ido Schimmel Suggested-by: David Ahern Signed-off-by: David S. Miller --- include/net/nexthop.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/nexthop.h b/include/net/nexthop.h index 2e44efe5709b..2fd76a9b6dc8 100644 --- a/include/net/nexthop.h +++ b/include/net/nexthop.h @@ -105,7 +105,6 @@ struct nexthop { }; enum nexthop_event_type { - NEXTHOP_EVENT_ADD, NEXTHOP_EVENT_DEL }; -- cgit v1.2.3 From 80690ec6b595807db9a52ec5b225a2d88033ddb5 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 15 Sep 2020 14:41:01 +0300 Subject: nexthop: Convert to blocking notification chain Currently, the only listener of the nexthop notification chain is the VXLAN driver. Subsequent patches will add more listeners (e.g., device drivers such as netdevsim) that need to be able to block when processing notifications. Therefore, convert the notification chain to a blocking one. This is safe as notifications are always emitted from process context. Signed-off-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/net/netns/nexthop.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netns/nexthop.h b/include/net/netns/nexthop.h index 1937476c94a0..1849e77eb68a 100644 --- a/include/net/netns/nexthop.h +++ b/include/net/netns/nexthop.h @@ -14,6 +14,6 @@ struct netns_nexthop { unsigned int seq; /* protected by rtnl_mutex */ u32 last_id_allocated; - struct atomic_notifier_head notifier_chain; + struct blocking_notifier_head notifier_chain; }; #endif -- cgit v1.2.3 From 984fe94f94756dacb3c8cc52904a23adf9e04da1 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Tue, 15 Sep 2020 16:45:39 -0700 Subject: bpf: Mutex protect used_maps array and count To support modifying the used_maps array, we use a mutex to protect the use of the counter and the array. The mutex is initialized right after the prog aux is allocated, and destroyed right before prog aux is freed. This way we guarantee it's initialized for both cBPF and eBPF. Signed-off-by: YiFei Zhu Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Cc: YiFei Zhu Link: https://lore.kernel.org/bpf/20200915234543.3220146-2-sdf@google.com --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c6d9f2c444f4..5dcce0364634 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -751,6 +751,7 @@ struct bpf_prog_aux { struct bpf_ksym ksym; const struct bpf_prog_ops *ops; struct bpf_map **used_maps; + struct mutex used_maps_mutex; /* mutex for used_maps and used_map_cnt */ struct bpf_prog *prog; struct user_struct *user; u64 load_time; /* ns since boottime */ -- cgit v1.2.3 From ef15314aa5de955c6afd87d512e8b00f5ac08d06 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Tue, 15 Sep 2020 16:45:40 -0700 Subject: bpf: Add BPF_PROG_BIND_MAP syscall This syscall binds a map to a program. Returns success if the map is already bound to the program. Signed-off-by: YiFei Zhu Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Cc: YiFei Zhu Link: https://lore.kernel.org/bpf/20200915234543.3220146-3-sdf@google.com --- include/uapi/linux/bpf.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7dd314176df7..a22812561064 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -124,6 +124,7 @@ enum bpf_cmd { BPF_ENABLE_STATS, BPF_ITER_CREATE, BPF_LINK_DETACH, + BPF_PROG_BIND_MAP, }; enum bpf_map_type { @@ -658,6 +659,12 @@ union bpf_attr { __u32 flags; } iter_create; + struct { /* struct used by BPF_PROG_BIND_MAP command */ + __u32 prog_fd; + __u32 map_fd; + __u32 flags; /* extra flags */ + } prog_bind_map; + } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF -- cgit v1.2.3 From ba3a86e47232ad9f76160929f33ac9c64e4d0567 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 14 Sep 2020 15:44:37 -0700 Subject: rcu-tasks: Fix grace-period/unlock race in RCU Tasks Trace The more intense grace-period processing resulting from the 50x RCU Tasks Trace grace-period speedups exposed the following race condition: o Task A running on CPU 0 executes rcu_read_lock_trace(), entering a read-side critical section. o When Task A eventually invokes rcu_read_unlock_trace() to exit its read-side critical section, this function notes that the ->trc_reader_special.s flag is zero and and therefore invoke wil set ->trc_reader_nesting to zero using WRITE_ONCE(). But before that happens... o The RCU Tasks Trace grace-period kthread running on some other CPU interrogates Task A, but this fails because this task is currently running. This kthread therefore sends an IPI to CPU 0. o CPU 0 receives the IPI, and thus invokes trc_read_check_handler(). Because Task A has not yet cleared its ->trc_reader_nesting counter, this function sees that Task A is still within its read-side critical section. This function therefore sets the ->trc_reader_nesting.b.need_qs flag, AKA the .need_qs flag. Except that Task A has already checked the .need_qs flag, which is part of the ->trc_reader_special.s flag. The .need_qs flag therefore remains set until Task A's next rcu_read_unlock_trace(). o Task A now invokes synchronize_rcu_tasks_trace(), which cannot start a new grace period until the current grace period completes. And thus cannot return until after that time. But Task A's .need_qs flag is still set, which prevents the current grace period from completing. And because Task A is blocked, it will never execute rcu_read_unlock_trace() until its call to synchronize_rcu_tasks_trace() returns. We are therefore deadlocked. This race is improbable, but 80 hours of rcutorture made it happen twice. The race was possible before the grace-period speedup, but roughly 50x less probable. Several thousand hours of rcutorture would have been necessary to have a reasonable chance of making this happen before this 50x speedup. This commit therefore eliminates this deadlock by setting ->trc_reader_nesting to a large negative number before checking the .need_qs and zeroing (or decrementing with respect to its initial value) ->trc_reader_nesting. For its part, the IPI handler's trc_read_check_handler() function adds a check for negative values, deferring evaluation of the task in this case. Taken together, these changes avoid this deadlock scenario. Fixes: 276c410448db ("rcu-tasks: Split ->trc_reader_need_end") Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Jiri Olsa Cc: Cc: # 5.7.x Signed-off-by: Paul E. McKenney --- include/linux/rcupdate_trace.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/rcupdate_trace.h b/include/linux/rcupdate_trace.h index d9015aac78c6..a6a6a3acab5a 100644 --- a/include/linux/rcupdate_trace.h +++ b/include/linux/rcupdate_trace.h @@ -50,6 +50,7 @@ static inline void rcu_read_lock_trace(void) struct task_struct *t = current; WRITE_ONCE(t->trc_reader_nesting, READ_ONCE(t->trc_reader_nesting) + 1); + barrier(); if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) && t->trc_reader_special.b.need_mb) smp_mb(); // Pairs with update-side barriers @@ -72,6 +73,9 @@ static inline void rcu_read_unlock_trace(void) rcu_lock_release(&rcu_trace_lock_map); nesting = READ_ONCE(t->trc_reader_nesting) - 1; + barrier(); // Critical section before disabling. + // Disable IPI-based setting of .need_qs. + WRITE_ONCE(t->trc_reader_nesting, INT_MIN); if (likely(!READ_ONCE(t->trc_reader_special.s)) || nesting) { WRITE_ONCE(t->trc_reader_nesting, nesting); return; // We assume shallow reader nesting. -- cgit v1.2.3 From a748c6975dea325da540610c2ba9b5f332c603e6 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 16 Sep 2020 23:10:05 +0200 Subject: bpf: propagate poke descriptors to subprograms Previously, there was no need for poke descriptors being present in subprogram's bpf_prog_aux struct since tailcalls were simply not allowed in them. Each subprog is JITed independently so in order to enable JITing subprograms that use tailcalls, do the following: - in fixup_bpf_calls() store the index of tailcall insn onto the generated poke descriptor, - in case when insn patching occurs, adjust the tailcall insn idx from bpf_patch_insn_data, - then in jit_subprogs() check whether the given poke descriptor belongs to the current subprog by checking if that previously stored absolute index of tail call insn is in the scope of the insns of given subprog, - update the insn->imm with new poke descriptor slot so that while JITing the proper poke descriptor will be grabbed This way each of the main program's poke descriptors are distributed across the subprograms poke descriptor array, so main program's descriptors can be untracked out of the prog array map. Add also subprog's aux struct to the BPF map poke_progs list by calling on it map_poke_track(). In case of any error, call the map_poke_untrack() on subprog's aux structs that have already been registered to prog array map. Signed-off-by: Maciej Fijalkowski Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5dcce0364634..a23e5eb728c8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -707,6 +707,7 @@ struct bpf_jit_poke_descriptor { bool ip_stable; u8 adj_off; u16 reason; + u32 insn_idx; }; /* reg_type info for ctx arguments */ -- cgit v1.2.3 From cf71b174d3464c7dc22f86f25d629a8d9d5c3519 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 16 Sep 2020 23:10:06 +0200 Subject: bpf: rename poke descriptor's 'ip' member to 'tailcall_target' Reflect the actual purpose of poke->ip and rename it to poke->tailcall_target so that it will not the be confused with another poke target that will be introduced in next commit. While at it, do the same thing with poke->ip_stable - rename it to poke->tailcall_target_stable. Signed-off-by: Maciej Fijalkowski Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a23e5eb728c8..f3790c9cf542 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -697,14 +697,14 @@ enum bpf_jit_poke_reason { /* Descriptor of pokes pointing /into/ the JITed image. */ struct bpf_jit_poke_descriptor { - void *ip; + void *tailcall_target; union { struct { struct bpf_map *map; u32 key; } tail_call; }; - bool ip_stable; + bool tailcall_target_stable; u8 adj_off; u16 reason; u32 insn_idx; -- cgit v1.2.3 From 2b7ea122a0c437442bf54a5f1c60155757df270c Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 16 Sep 2020 22:16:29 +0800 Subject: net/sched: Remove unused function qdisc_queue_drop_head() It is not used since commit a09ceb0e0814 ("sched: remove qdisc->drop") Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- include/net/sch_generic.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index d60e7c39d60c..6c762457122f 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -1047,12 +1047,6 @@ static inline unsigned int __qdisc_queue_drop_head(struct Qdisc *sch, return 0; } -static inline unsigned int qdisc_queue_drop_head(struct Qdisc *sch, - struct sk_buff **to_free) -{ - return __qdisc_queue_drop_head(sch, &sch->q, to_free); -} - static inline struct sk_buff *qdisc_peek_head(struct Qdisc *sch) { const struct qdisc_skb_head *qh = &sch->q; -- cgit v1.2.3 From 5114b331051981ecbdf144b5ad33387ae8d0f0d5 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 16 Sep 2020 22:17:28 +0800 Subject: genetlink: Remove unused function genl_err_attr() It is never used, so can remove it. Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- include/net/genetlink.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 6e5f1e1aa822..b9eb92f3fe86 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -101,14 +101,6 @@ static inline void genl_info_net_set(struct genl_info *info, struct net *net) #define GENL_SET_ERR_MSG(info, msg) NL_SET_ERR_MSG((info)->extack, msg) -static inline int genl_err_attr(struct genl_info *info, int err, - const struct nlattr *attr) -{ - info->extack->bad_attr = attr; - - return err; -} - enum genl_validate_flags { GENL_DONT_VALIDATE_STRICT = BIT(0), GENL_DONT_VALIDATE_DUMP = BIT(1), -- cgit v1.2.3 From 2492c205d2bbbc01f5c9e49fffe4b2e633c33f38 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 17 Sep 2020 10:19:10 +0800 Subject: netdev: Remove unused functions There is no callers in tree, so can remove it. Signed-off-by: YueHaibing Reviewed-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/netdevice.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 157e0242e9ee..909b1fbb0481 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4677,16 +4677,6 @@ int netdev_class_create_file_ns(const struct class_attribute *class_attr, void netdev_class_remove_file_ns(const struct class_attribute *class_attr, const void *ns); -static inline int netdev_class_create_file(const struct class_attribute *class_attr) -{ - return netdev_class_create_file_ns(class_attr, NULL); -} - -static inline void netdev_class_remove_file(const struct class_attribute *class_attr) -{ - netdev_class_remove_file_ns(class_attr, NULL); -} - extern const struct kobj_ns_type_operations net_ns_type_operations; const char *netdev_drivername(const struct net_device *dev); -- cgit v1.2.3 From 78a3ea5557137b0811f3c5a020afaafa7b61d6aa Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 17 Sep 2020 10:51:32 -0700 Subject: net: remove comments on struct rtnl_link_stats We removed the misleading comments from struct rtnl_link_stats64 when we added proper kdoc. struct rtnl_link_stats has the same inline comments, so remove them, too. Signed-off-by: Jakub Kicinski Reviewed-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index bf4667403cab..c4b23f06f69e 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -7,24 +7,23 @@ /* This struct should be in sync with struct rtnl_link_stats64 */ struct rtnl_link_stats { - __u32 rx_packets; /* total packets received */ - __u32 tx_packets; /* total packets transmitted */ - __u32 rx_bytes; /* total bytes received */ - __u32 tx_bytes; /* total bytes transmitted */ - __u32 rx_errors; /* bad packets received */ - __u32 tx_errors; /* packet transmit problems */ - __u32 rx_dropped; /* no space in linux buffers */ - __u32 tx_dropped; /* no space available in linux */ - __u32 multicast; /* multicast packets received */ + __u32 rx_packets; + __u32 tx_packets; + __u32 rx_bytes; + __u32 tx_bytes; + __u32 rx_errors; + __u32 tx_errors; + __u32 rx_dropped; + __u32 tx_dropped; + __u32 multicast; __u32 collisions; - /* detailed rx_errors: */ __u32 rx_length_errors; - __u32 rx_over_errors; /* receiver ring buff overflow */ - __u32 rx_crc_errors; /* recved pkt with crc error */ - __u32 rx_frame_errors; /* recv'd frame alignment error */ - __u32 rx_fifo_errors; /* recv'r fifo overrun */ - __u32 rx_missed_errors; /* receiver missed packet */ + __u32 rx_over_errors; + __u32 rx_crc_errors; + __u32 rx_frame_errors; + __u32 rx_fifo_errors; + __u32 rx_missed_errors; /* detailed tx_errors */ __u32 tx_aborted_errors; @@ -37,7 +36,7 @@ struct rtnl_link_stats { __u32 rx_compressed; __u32 tx_compressed; - __u32 rx_nohandler; /* dropped, no handler found */ + __u32 rx_nohandler; }; /** -- cgit v1.2.3 From 7f6e4312e15a5c370e84eaa685879b6bdcc717e4 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 16 Sep 2020 23:10:07 +0200 Subject: bpf: Limit caller's stack depth 256 for subprogs with tailcalls Protect against potential stack overflow that might happen when bpf2bpf calls get combined with tailcalls. Limit the caller's stack depth for such case down to 256 so that the worst case scenario would result in 8k stack size (32 which is tailcall limit * 256 = 8k). Suggested-by: Alexei Starovoitov Signed-off-by: Maciej Fijalkowski Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 53c7bd568c5d..5026b75db972 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -358,6 +358,7 @@ struct bpf_subprog_info { u32 start; /* insn idx of function entry point */ u32 linfo_idx; /* The idx to the main_prog->aux->linfo */ u16 stack_depth; /* max. stack depth used by this function */ + bool has_tail_call; }; /* single container for all structs -- cgit v1.2.3 From ebf7d1f508a73871acf3b2bfbfa1323a477acdb3 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Wed, 16 Sep 2020 23:10:08 +0200 Subject: bpf, x64: rework pro/epilogue and tailcall handling in JIT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit serves two things: 1) it optimizes BPF prologue/epilogue generation 2) it makes possible to have tailcalls within BPF subprogram Both points are related to each other since without 1), 2) could not be achieved. In [1], Alexei says: "The prologue will look like: nop5 xor eax,eax  // two new bytes if bpf_tail_call() is used in this // function push rbp mov rbp, rsp sub rsp, rounded_stack_depth push rax // zero init tail_call counter variable number of push rbx,r13,r14,r15 Then bpf_tail_call will pop variable number rbx,.. and final 'pop rax' Then 'add rsp, size_of_current_stack_frame' jmp to next function and skip over 'nop5; xor eax,eax; push rpb; mov rbp, rsp' This way new function will set its own stack size and will init tail call counter with whatever value the parent had. If next function doesn't use bpf_tail_call it won't have 'xor eax,eax'. Instead it would need to have 'nop2' in there." Implement that suggestion. Since the layout of stack is changed, tail call counter handling can not rely anymore on popping it to rbx just like it have been handled for constant prologue case and later overwrite of rbx with actual value of rbx pushed to stack. Therefore, let's use one of the register (%rcx) that is considered to be volatile/caller-saved and pop the value of tail call counter in there in the epilogue. Drop the BUILD_BUG_ON in emit_prologue and in emit_bpf_tail_call_indirect where instruction layout is not constant anymore. Introduce new poke target, 'tailcall_bypass' to poke descriptor that is dedicated for skipping the register pops and stack unwind that are generated right before the actual jump to target program. For case when the target program is not present, BPF program will skip the pop instructions and nop5 dedicated for jmpq $target. An example of such state when only R6 of callee saved registers is used by program: ffffffffc0513aa1: e9 0e 00 00 00 jmpq 0xffffffffc0513ab4 ffffffffc0513aa6: 5b pop %rbx ffffffffc0513aa7: 58 pop %rax ffffffffc0513aa8: 48 81 c4 00 00 00 00 add $0x0,%rsp ffffffffc0513aaf: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1) ffffffffc0513ab4: 48 89 df mov %rbx,%rdi When target program is inserted, the jump that was there to skip pops/nop5 will become the nop5, so CPU will go over pops and do the actual tailcall. One might ask why there simply can not be pushes after the nop5? In the following example snippet: ffffffffc037030c: 48 89 fb mov %rdi,%rbx (...) ffffffffc0370332: 5b pop %rbx ffffffffc0370333: 58 pop %rax ffffffffc0370334: 48 81 c4 00 00 00 00 add $0x0,%rsp ffffffffc037033b: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1) ffffffffc0370340: 48 81 ec 00 00 00 00 sub $0x0,%rsp ffffffffc0370347: 50 push %rax ffffffffc0370348: 53 push %rbx ffffffffc0370349: 48 89 df mov %rbx,%rdi ffffffffc037034c: e8 f7 21 00 00 callq 0xffffffffc0372548 There is the bpf2bpf call (at ffffffffc037034c) right after the tailcall and jump target is not present. ctx is in %rbx register and BPF subprogram that we will call into on ffffffffc037034c is relying on it, e.g. it will pick ctx from there. Such code layout is therefore broken as we would overwrite the content of %rbx with the value that was pushed on the prologue. That is the reason for the 'bypass' approach. Special care needs to be taken during the install/update/remove of tailcall target. In case when target program is not present, the CPU must not execute the pop instructions that precede the tailcall. To address that, the following states can be defined: A nop, unwind, nop B nop, unwind, tail C skip, unwind, nop D skip, unwind, tail A is forbidden (lead to incorrectness). The state transitions between tailcall install/update/remove will work as follows: First install tail call f: C->D->B(f) * poke the tailcall, after that get rid of the skip Update tail call f to f': B(f)->B(f') * poke the tailcall (poke->tailcall_target) and do NOT touch the poke->tailcall_bypass Remove tail call: B(f')->C(f') * poke->tailcall_bypass is poked back to jump, then we wait the RCU grace period so that other programs will finish its execution and after that we are safe to remove the poke->tailcall_target Install new tail call (f''): C(f')->D(f'')->B(f''). * same as first step This way CPU can never be exposed to "unwind, tail" state. Last but not least, when tailcalls get mixed with bpf2bpf calls, it would be possible to encounter the endless loop due to clearing the tailcall counter if for example we would use the tailcall3-like from BPF selftests program that would be subprogram-based, meaning the tailcall would be present within the BPF subprogram. This test, broken down to particular steps, would do: entry -> set tailcall counter to 0, bump it by 1, tailcall to func0 func0 -> call subprog_tail (we are NOT skipping the first 11 bytes of prologue and this subprogram has a tailcall, therefore we clear the counter...) subprog -> do the same thing as entry and then loop forever. To address this, the idea is to go through the call chain of bpf2bpf progs and look for a tailcall presence throughout whole chain. If we saw a single tail call then each node in this call chain needs to be marked as a subprog that can reach the tailcall. We would later feed the JIT with this info and: - set eax to 0 only when tailcall is reachable and this is the entry prog - if tailcall is reachable but there's no tailcall in insns of currently JITed prog then push rax anyway, so that it will be possible to propagate further down the call chain - finally if tailcall is reachable, then we need to precede the 'call' insn with mov rax, [rbp - (stack_depth + 8)] Tail call related cases from test_verifier kselftest are also working fine. Sample BPF programs that utilize tail calls (sockex3, tracex5) work properly as well. [1]: https://lore.kernel.org/bpf/20200517043227.2gpq22ifoq37ogst@ast-mbp.dhcp.thefacebook.com/ Suggested-by: Alexei Starovoitov Signed-off-by: Maciej Fijalkowski Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 +++ include/linux/bpf_verifier.h | 1 + 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f3790c9cf542..d7c5a6ed87e3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -698,6 +698,8 @@ enum bpf_jit_poke_reason { /* Descriptor of pokes pointing /into/ the JITed image. */ struct bpf_jit_poke_descriptor { void *tailcall_target; + void *tailcall_bypass; + void *bypass_addr; union { struct { struct bpf_map *map; @@ -738,6 +740,7 @@ struct bpf_prog_aux { bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */ bool func_proto_unreliable; bool sleepable; + bool tail_call_reachable; enum bpf_tramp_prog_type trampoline_prog_type; struct bpf_trampoline *trampoline; struct hlist_node tramp_hlist; diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 5026b75db972..fbc964526ba3 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -359,6 +359,7 @@ struct bpf_subprog_info { u32 linfo_idx; /* The idx to the main_prog->aux->linfo */ u16 stack_depth; /* max. stack depth used by this function */ bool has_tail_call; + bool tail_call_reachable; }; /* single container for all structs -- cgit v1.2.3 From 09b28d76eac48e922dc293da1aa2b2b85c32aeee Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 17 Sep 2020 19:09:18 -0700 Subject: bpf: Add abnormal return checks. LD_[ABS|IND] instructions may return from the function early. bpf_tail_call pseudo instruction is either fallthrough or return. Allow them in the subprograms only when subprograms are BTF annotated and have scalar return types. Allow ld_abs and tail_call in the main program even if it calls into subprograms. In the past that was not ok to do for ld_abs, since it was JITed with special exit sequence. Since bpf_gen_ld_abs() was introduced the ld_abs looks like normal exit insn from JIT point of view, so it's safe to allow them in the main program. Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index fbc964526ba3..2bb48a2c4d08 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -360,6 +360,7 @@ struct bpf_subprog_info { u16 stack_depth; /* max. stack depth used by this function */ bool has_tail_call; bool tail_call_reachable; + bool has_ld_abs; }; /* single container for all structs -- cgit v1.2.3 From 9d6e371dda7f3294e1b7d2a00d8e77a042b42988 Mon Sep 17 00:00:00 2001 From: Wright Feng Date: Tue, 8 Sep 2020 01:01:57 -0500 Subject: cfg80211: add more comments for ap_isolate in bss_parameters The value of struct bss_parameters::ap_isolate will be -1, 0 or 1. The value -1 means not to change. To prevent developers from thinking ap_isolate is only 0 or 1, I add more comments on it. Signed-off-by: Wright Feng Reviewed-by: Kalle Valo Link: https://lore.kernel.org/r/20200908060157.98846-1-wright.feng@cypress.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index c9bce9bba511..7ad530912b21 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1787,6 +1787,7 @@ struct mpath_info { * (or NULL for no change) * @basic_rates_len: number of basic rates * @ap_isolate: do not forward packets between connected stations + * (0 = no, 1 = yes, -1 = do not change) * @ht_opmode: HT Operation mode * (u16 = opmode, -1 = do not change) * @p2p_ctwindow: P2P CT Window (-1 = no change) -- cgit v1.2.3 From 6aea26ce5a4cf854c1a86f3760753b5e2617578f Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Tue, 8 Sep 2020 14:36:53 +0200 Subject: mac80211: rework tx encapsulation offload API The current API (which lets the driver turn on/off per vif directly) has a number of limitations: - it does not deal with AP_VLAN - conditions for enabling (no tkip, no monitor) are only checked at add_interface time - no way to indicate 4-addr support In order to address this, store offload flags in struct ieee80211_vif (easy to extend for decap offload later). mac80211 initially sets the enable flag, but gives the driver a chance to modify it before its settings are applied. In addition to the .add_interface op, a .update_vif_offload op is introduced, which can be used for runtime changes. If a driver can't disable encap offload at runtime, or if it has some extra limitations, it can simply override the flags within those ops. Support for encap offload with 4-address mode interfaces can be enabled by setting a flag from .add_interface or .update_vif_offload. Signed-off-by: Felix Fietkau Link: https://lore.kernel.org/r/20200908123702.88454-6-nbd@nbd.name [resolved conflict with commit aa2092a9bab3 ("ath11k: add raw mode and software crypto support")] Signed-off-by: Johannes Berg --- include/net/mac80211.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index ec148b3e9c41..01612a82aacf 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1606,6 +1606,21 @@ enum ieee80211_vif_flags { IEEE80211_VIF_GET_NOA_UPDATE = BIT(3), }; + +/** + * enum ieee80211_offload_flags - virtual interface offload flags + * + * @IEEE80211_OFFLOAD_ENCAP_ENABLED: tx encapsulation offload is enabled + * The driver supports sending frames passed as 802.3 frames by mac80211. + * It must also support sending 802.11 packets for the same interface. + * @IEEE80211_OFFLOAD_ENCAP_4ADDR: support 4-address mode encapsulation offload + */ + +enum ieee80211_offload_flags { + IEEE80211_OFFLOAD_ENCAP_ENABLED = BIT(0), + IEEE80211_OFFLOAD_ENCAP_4ADDR = BIT(1), +}; + /** * struct ieee80211_vif - per-interface data * @@ -1626,6 +1641,11 @@ enum ieee80211_vif_flags { * these need to be set (or cleared) when the interface is added * or, if supported by the driver, the interface type is changed * at runtime, mac80211 will never touch this field + * @offloaad_flags: hardware offload capabilities/flags for this interface. + * These are initialized by mac80211 before calling .add_interface, + * .change_interface or .update_vif_offload and updated by the driver + * within these ops, based on supported features or runtime change + * restrictions. * @hw_queue: hardware queue for each AC * @cab_queue: content-after-beacon (DTIM beacon really) queue, AP mode only * @chanctx_conf: The channel context this interface is assigned to, or %NULL @@ -1662,6 +1682,7 @@ struct ieee80211_vif { struct ieee80211_chanctx_conf __rcu *chanctx_conf; u32 driver_flags; + u32 offload_flags; #ifdef CONFIG_MAC80211_DEBUGFS struct dentry *debugfs_dir; @@ -2328,6 +2349,9 @@ struct ieee80211_txq { * aggregating MPDUs with the same keyid, allowing mac80211 to keep Tx * A-MPDU sessions active while rekeying with Extended Key ID. * + * @IEEE80211_HW_SUPPORTS_TX_ENCAP_OFFLOAD: Hardware supports tx encapsulation + * offload + * * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays */ enum ieee80211_hw_flags { @@ -2380,6 +2404,7 @@ enum ieee80211_hw_flags { IEEE80211_HW_SUPPORTS_MULTI_BSSID, IEEE80211_HW_SUPPORTS_ONLY_HE_MULTI_BSSID, IEEE80211_HW_AMPDU_KEYBORDER_SUPPORT, + IEEE80211_HW_SUPPORTS_TX_ENCAP_OFFLOAD, /* keep last, obviously */ NUM_IEEE80211_HW_FLAGS @@ -3814,6 +3839,8 @@ enum ieee80211_reconfig_type { * @set_tid_config: Apply TID specific configurations. This callback may sleep. * @reset_tid_config: Reset TID specific configuration for the peer. * This callback may sleep. + * @update_vif_config: Update virtual interface offload flags + * This callback may sleep. */ struct ieee80211_ops { void (*tx)(struct ieee80211_hw *hw, @@ -4125,6 +4152,8 @@ struct ieee80211_ops { int (*reset_tid_config)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, u8 tids); + void (*update_vif_offload)(struct ieee80211_hw *hw, + struct ieee80211_vif *vif); }; /** -- cgit v1.2.3 From c74114d7d51521bd785bf6aa0f90ee87d99bee8a Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Tue, 8 Sep 2020 14:36:55 +0200 Subject: mac80211: remove tx status call to ieee80211_sta_register_airtime All drivers using airtime fairness are calling ieee80211_sta_register_airtime directly, now they must. Document this as well. Signed-off-by: Felix Fietkau Link: https://lore.kernel.org/r/20200908123702.88454-8-nbd@nbd.name [johannes: update the documentation to suit] Signed-off-by: Johannes Berg --- include/net/mac80211.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 01612a82aacf..ca270f7d82b9 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1002,7 +1002,8 @@ ieee80211_rate_get_vht_nss(const struct ieee80211_tx_rate *rate) * @status.ampdu_ack_len: AMPDU ack length * @status.ampdu_len: AMPDU length * @status.antenna: (legacy, kept only for iwlegacy) - * @status.tx_time: airtime consumed for transmission + * @status.tx_time: airtime consumed for transmission; note this is only + * used for WMM AC, not for airtime fairness * @status.is_valid_ack_signal: ACK signal is valid * @status.status_driver_data: driver use area * @ack: union part for pure ACK data @@ -5676,7 +5677,7 @@ void ieee80211_send_eosp_nullfunc(struct ieee80211_sta *pubsta, int tid); /** * ieee80211_sta_register_airtime - register airtime usage for a sta/tid * - * Register airtime usage for a given sta on a given tid. The driver can call + * Register airtime usage for a given sta on a given tid. The driver must call * this function to notify mac80211 that a station used a certain amount of * airtime. This information will be used by the TXQ scheduler to schedule * stations in a way that ensures airtime fairness. -- cgit v1.2.3 From cc20ff2c6b5d3e28747c6d30ecd097ea1a4d2502 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Tue, 8 Sep 2020 14:36:57 +0200 Subject: mac80211: swap NEED_TXPROCESSING and HW_80211_ENCAP tx flags In order to unify the tx status path, the hw 802.11 encapsulation flag needs to survive the trip to the tx status call. Since we don't have any free bits in info->flags, we need to move one. IEEE80211_TX_INTFL_NEED_TXPROCESSING is only used internally in mac80211, and only before the call into the driver. Signed-off-by: Felix Fietkau Link: https://lore.kernel.org/r/20200908123702.88454-10-nbd@nbd.name Signed-off-by: Johannes Berg --- include/net/mac80211.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index ca270f7d82b9..3a9ab3c10050 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -720,9 +720,8 @@ struct ieee80211_bss_conf { * @IEEE80211_TX_INTFL_OFFCHAN_TX_OK: Internal to mac80211. Used to indicate * that a frame can be transmitted while the queues are stopped for * off-channel operation. - * @IEEE80211_TX_INTFL_NEED_TXPROCESSING: completely internal to mac80211, - * used to indicate that a pending frame requires TX processing before - * it can be sent out. + * @IEEE80211_TX_CTL_HW_80211_ENCAP: This frame uses hardware encapsulation + * (header conversion) * @IEEE80211_TX_INTFL_RETRIED: completely internal to mac80211, * used to indicate that a frame was already retried due to PS * @IEEE80211_TX_INTFL_DONT_ENCRYPT: completely internal to mac80211, @@ -791,7 +790,7 @@ enum mac80211_tx_info_flags { IEEE80211_TX_STAT_AMPDU_NO_BACK = BIT(11), IEEE80211_TX_CTL_RATE_CTRL_PROBE = BIT(12), IEEE80211_TX_INTFL_OFFCHAN_TX_OK = BIT(13), - IEEE80211_TX_INTFL_NEED_TXPROCESSING = BIT(14), + IEEE80211_TX_CTL_HW_80211_ENCAP = BIT(14), IEEE80211_TX_INTFL_RETRIED = BIT(15), IEEE80211_TX_INTFL_DONT_ENCRYPT = BIT(16), IEEE80211_TX_CTL_NO_PS_BUFFER = BIT(17), @@ -823,8 +822,9 @@ enum mac80211_tx_info_flags { * @IEEE80211_TX_CTRL_AMSDU: This frame is an A-MSDU frame * @IEEE80211_TX_CTRL_FAST_XMIT: This frame is going through the fast_xmit path * @IEEE80211_TX_CTRL_SKIP_MPATH_LOOKUP: This frame skips mesh path lookup - * @IEEE80211_TX_CTRL_HW_80211_ENCAP: This frame uses hardware encapsulation - * (header conversion) + * @IEEE80211_TX_INTCFL_NEED_TXPROCESSING: completely internal to mac80211, + * used to indicate that a pending frame requires TX processing before + * it can be sent out. * @IEEE80211_TX_CTRL_NO_SEQNO: Do not overwrite the sequence number that * has already been assigned to this frame. * @@ -837,7 +837,7 @@ enum mac80211_tx_control_flags { IEEE80211_TX_CTRL_AMSDU = BIT(3), IEEE80211_TX_CTRL_FAST_XMIT = BIT(4), IEEE80211_TX_CTRL_SKIP_MPATH_LOOKUP = BIT(5), - IEEE80211_TX_CTRL_HW_80211_ENCAP = BIT(6), + IEEE80211_TX_INTCFL_NEED_TXPROCESSING = BIT(6), IEEE80211_TX_CTRL_NO_SEQNO = BIT(7), }; -- cgit v1.2.3 From 1ff4e8f2dec8b145b451f05320e4f9e01d254ae2 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Tue, 8 Sep 2020 14:37:01 +0200 Subject: mac80211: notify the driver when a sta uses 4-address mode This is needed for encapsulation offload of 4-address mode packets Signed-off-by: Felix Fietkau Link: https://lore.kernel.org/r/20200908123702.88454-14-nbd@nbd.name Signed-off-by: Johannes Berg --- include/net/mac80211.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 3a9ab3c10050..07c4dd7ab55f 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -3842,6 +3842,8 @@ enum ieee80211_reconfig_type { * This callback may sleep. * @update_vif_config: Update virtual interface offload flags * This callback may sleep. + * @sta_set_4addr: Called to notify the driver when a station starts/stops using + * 4-address mode */ struct ieee80211_ops { void (*tx)(struct ieee80211_hw *hw, @@ -4155,6 +4157,8 @@ struct ieee80211_ops { struct ieee80211_sta *sta, u8 tids); void (*update_vif_offload)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); + void (*sta_set_4addr)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, + struct ieee80211_sta *sta, bool enabled); }; /** -- cgit v1.2.3 From f02dff93e26bef46f5511f1e8229061bd23c3074 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Tue, 8 Sep 2020 14:37:00 +0200 Subject: mac80211: extend ieee80211_tx_status_ext to support bulk free Store processed skbs ready to be freed in a list so the driver bulk free them Signed-off-by: Felix Fietkau Link: https://lore.kernel.org/r/20200908123702.88454-13-nbd@nbd.name Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 07c4dd7ab55f..d3c43420779c 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1096,12 +1096,14 @@ ieee80211_info_get_tx_time_est(struct ieee80211_tx_info *info) * @info: Basic tx status information * @skb: Packet skb (can be NULL if not provided by the driver) * @rate: The TX rate that was used when sending the packet + * @free_list: list where processed skbs are stored to be free'd by the driver */ struct ieee80211_tx_status { struct ieee80211_sta *sta; struct ieee80211_tx_info *info; struct sk_buff *skb; struct rate_info *rate; + struct list_head *free_list; }; /** -- cgit v1.2.3 From 37050e3ab0b3f02819e3d70ab01d97addb810b28 Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Tue, 8 Sep 2020 12:03:02 -0700 Subject: ieee80211: redefine S1G bits with GENMASK The S1G capability fields were defined by ORing BITS() together, and expecting a custom macro to use the _SHIFT definitions. Use the Linux kernel GENMASK for the definitions now, and FIELD_{GET,PREP} to access the fields in the future. Take the chance to rename eg. S1G_CAPAB_B0 to the more compact S1G_CAP0. Signed-off-by: Thomas Pedersen Link: https://lore.kernel.org/r/20200908190323.15814-2-thomas@adapt-ip.com Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 156 +++++++++++++++++++++++----------------------- 1 file changed, 78 insertions(+), 78 deletions(-) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index c47f43e65a2f..53fba39d4ba6 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2330,84 +2330,84 @@ ieee80211_he_spr_size(const u8 *he_spr_ie) } /* S1G Capabilities Information field */ -#define S1G_CAPAB_B0_S1G_LONG BIT(0) -#define S1G_CAPAB_B0_SGI_1MHZ BIT(1) -#define S1G_CAPAB_B0_SGI_2MHZ BIT(2) -#define S1G_CAPAB_B0_SGI_4MHZ BIT(3) -#define S1G_CAPAB_B0_SGI_8MHZ BIT(4) -#define S1G_CAPAB_B0_SGI_16MHZ BIT(5) -#define S1G_CAPAB_B0_SUPP_CH_WIDTH_MASK (BIT(6) | BIT(7)) -#define S1G_CAPAB_B0_SUPP_CH_WIDTH_SHIFT 6 - -#define S1G_CAPAB_B1_RX_LDPC BIT(0) -#define S1G_CAPAB_B1_TX_STBC BIT(1) -#define S1G_CAPAB_B1_RX_STBC BIT(2) -#define S1G_CAPAB_B1_SU_BFER BIT(3) -#define S1G_CAPAB_B1_SU_BFEE BIT(4) -#define S1G_CAPAB_B1_BFEE_STS_MASK (BIT(5) | BIT(6) | BIT(7)) -#define S1G_CAPAB_B1_BFEE_STS_SHIFT 5 - -#define S1G_CAPAB_B2_SOUNDING_DIMENSIONS_MASK (BIT(0) | BIT(1) | BIT(2)) -#define S1G_CAPAB_B2_SOUNDING_DIMENSIONS_SHIFT 0 -#define S1G_CAPAB_B2_MU_BFER BIT(3) -#define S1G_CAPAB_B2_MU_BFEE BIT(4) -#define S1G_CAPAB_B2_PLUS_HTC_VHT BIT(5) -#define S1G_CAPAB_B2_TRAVELING_PILOT_MASK (BIT(6) | BIT(7)) -#define S1G_CAPAB_B2_TRAVELING_PILOT_SHIFT 6 - -#define S1G_CAPAB_B3_RD_RESPONDER BIT(0) -#define S1G_CAPAB_B3_HT_DELAYED_BA BIT(1) -#define S1G_CAPAB_B3_MAX_MPDU_LEN BIT(2) -#define S1G_CAPAB_B3_MAX_AMPDU_LEN_EXP_MASK (BIT(3) | BIT(4)) -#define S1G_CAPAB_B3_MAX_AMPDU_LEN_EXP_SHIFT 3 -#define S1G_CAPAB_B3_MIN_MPDU_START_MASK (BIT(5) | BIT(6) | BIT(7)) -#define S1G_CAPAB_B3_MIN_MPDU_START_SHIFT 5 - -#define S1G_CAPAB_B4_UPLINK_SYNC BIT(0) -#define S1G_CAPAB_B4_DYNAMIC_AID BIT(1) -#define S1G_CAPAB_B4_BAT BIT(2) -#define S1G_CAPAB_B4_TIME_ADE BIT(3) -#define S1G_CAPAB_B4_NON_TIM BIT(4) -#define S1G_CAPAB_B4_GROUP_AID BIT(5) -#define S1G_CAPAB_B4_STA_TYPE_MASK (BIT(6) | BIT(7)) -#define S1G_CAPAB_B4_STA_TYPE_SHIFT 6 - -#define S1G_CAPAB_B5_CENT_AUTH_CONTROL BIT(0) -#define S1G_CAPAB_B5_DIST_AUTH_CONTROL BIT(1) -#define S1G_CAPAB_B5_AMSDU BIT(2) -#define S1G_CAPAB_B5_AMPDU BIT(3) -#define S1G_CAPAB_B5_ASYMMETRIC_BA BIT(4) -#define S1G_CAPAB_B5_FLOW_CONTROL BIT(5) -#define S1G_CAPAB_B5_SECTORIZED_BEAM_MASK (BIT(6) | BIT(7)) -#define S1G_CAPAB_B5_SECTORIZED_BEAM_SHIFT 6 - -#define S1G_CAPAB_B6_OBSS_MITIGATION BIT(0) -#define S1G_CAPAB_B6_FRAGMENT_BA BIT(1) -#define S1G_CAPAB_B6_NDP_PS_POLL BIT(2) -#define S1G_CAPAB_B6_RAW_OPERATION BIT(3) -#define S1G_CAPAB_B6_PAGE_SLICING BIT(4) -#define S1G_CAPAB_B6_TXOP_SHARING_IMP_ACK BIT(5) -#define S1G_CAPAB_B6_VHT_LINK_ADAPT_MASK (BIT(6) | BIT(7)) -#define S1G_CAPAB_B6_VHT_LINK_ADAPT_SHIFT 6 - -#define S1G_CAPAB_B7_TACK_AS_PS_POLL BIT(0) -#define S1G_CAPAB_B7_DUP_1MHZ BIT(1) -#define S1G_CAPAB_B7_MCS_NEGOTIATION BIT(2) -#define S1G_CAPAB_B7_1MHZ_CTL_RESPONSE_PREAMBLE BIT(3) -#define S1G_CAPAB_B7_NDP_BFING_REPORT_POLL BIT(4) -#define S1G_CAPAB_B7_UNSOLICITED_DYN_AID BIT(5) -#define S1G_CAPAB_B7_SECTOR_TRAINING_OPERATION BIT(6) -#define S1G_CAPAB_B7_TEMP_PS_MODE_SWITCH BIT(7) - -#define S1G_CAPAB_B8_TWT_GROUPING BIT(0) -#define S1G_CAPAB_B8_BDT BIT(1) -#define S1G_CAPAB_B8_COLOR_MASK (BIT(2) | BIT(3) | BIT(4)) -#define S1G_CAPAB_B8_COLOR_SHIFT 2 -#define S1G_CAPAB_B8_TWT_REQUEST BIT(5) -#define S1G_CAPAB_B8_TWT_RESPOND BIT(6) -#define S1G_CAPAB_B8_PV1_FRAME BIT(7) - -#define S1G_CAPAB_B9_LINK_ADAPT_PER_CONTROL_RESPONSE BIT(0) +#define S1G_CAP0_S1G_LONG BIT(0) +#define S1G_CAP0_SGI_1MHZ BIT(1) +#define S1G_CAP0_SGI_2MHZ BIT(2) +#define S1G_CAP0_SGI_4MHZ BIT(3) +#define S1G_CAP0_SGI_8MHZ BIT(4) +#define S1G_CAP0_SGI_16MHZ BIT(5) +#define S1G_CAP0_SUPP_CH_WIDTH GENMASK(7, 6) + +#define S1G_SUPP_CH_WIDTH_2 0 +#define S1G_SUPP_CH_WIDTH_4 1 +#define S1G_SUPP_CH_WIDTH_8 2 +#define S1G_SUPP_CH_WIDTH_16 3 +#define S1G_SUPP_CH_WIDTH_MAX(cap) ((1 << FIELD_GET(S1G_CAP0_SUPP_CH_WIDTH, \ + cap[0])) << 1) + +#define S1G_CAP1_RX_LDPC BIT(0) +#define S1G_CAP1_TX_STBC BIT(1) +#define S1G_CAP1_RX_STBC BIT(2) +#define S1G_CAP1_SU_BFER BIT(3) +#define S1G_CAP1_SU_BFEE BIT(4) +#define S1G_CAP1_BFEE_STS GENMASK(7, 5) + +#define S1G_CAP2_SOUNDING_DIMENSIONS GENMASK(2, 0) +#define S1G_CAP2_MU_BFER BIT(3) +#define S1G_CAP2_MU_BFEE BIT(4) +#define S1G_CAP2_PLUS_HTC_VHT BIT(5) +#define S1G_CAP2_TRAVELING_PILOT GENMASK(7, 6) + +#define S1G_CAP3_RD_RESPONDER BIT(0) +#define S1G_CAP3_HT_DELAYED_BA BIT(1) +#define S1G_CAP3_MAX_MPDU_LEN BIT(2) +#define S1G_CAP3_MAX_AMPDU_LEN_EXP GENMASK(4, 3) +#define S1G_CAP3_MIN_MPDU_START GENMASK(7, 5) + +#define S1G_CAP4_UPLINK_SYNC BIT(0) +#define S1G_CAP4_DYNAMIC_AID BIT(1) +#define S1G_CAP4_BAT BIT(2) +#define S1G_CAP4_TIME_ADE BIT(3) +#define S1G_CAP4_NON_TIM BIT(4) +#define S1G_CAP4_GROUP_AID BIT(5) +#define S1G_CAP4_STA_TYPE GENMASK(7, 6) + +#define S1G_CAP5_CENT_AUTH_CONTROL BIT(0) +#define S1G_CAP5_DIST_AUTH_CONTROL BIT(1) +#define S1G_CAP5_AMSDU BIT(2) +#define S1G_CAP5_AMPDU BIT(3) +#define S1G_CAP5_ASYMMETRIC_BA BIT(4) +#define S1G_CAP5_FLOW_CONTROL BIT(5) +#define S1G_CAP5_SECTORIZED_BEAM GENMASK(7, 6) + +#define S1G_CAP6_OBSS_MITIGATION BIT(0) +#define S1G_CAP6_FRAGMENT_BA BIT(1) +#define S1G_CAP6_NDP_PS_POLL BIT(2) +#define S1G_CAP6_RAW_OPERATION BIT(3) +#define S1G_CAP6_PAGE_SLICING BIT(4) +#define S1G_CAP6_TXOP_SHARING_IMP_ACK BIT(5) +#define S1G_CAP6_VHT_LINK_ADAPT GENMASK(7, 6) + +#define S1G_CAP7_TACK_AS_PS_POLL BIT(0) +#define S1G_CAP7_DUP_1MHZ BIT(1) +#define S1G_CAP7_MCS_NEGOTIATION BIT(2) +#define S1G_CAP7_1MHZ_CTL_RESPONSE_PREAMBLE BIT(3) +#define S1G_CAP7_NDP_BFING_REPORT_POLL BIT(4) +#define S1G_CAP7_UNSOLICITED_DYN_AID BIT(5) +#define S1G_CAP7_SECTOR_TRAINING_OPERATION BIT(6) +#define S1G_CAP7_TEMP_PS_MODE_SWITCH BIT(7) + +#define S1G_CAP8_TWT_GROUPING BIT(0) +#define S1G_CAP8_BDT BIT(1) +#define S1G_CAP8_COLOR GENMASK(4, 2) +#define S1G_CAP8_TWT_REQUEST BIT(5) +#define S1G_CAP8_TWT_RESPOND BIT(6) +#define S1G_CAP8_PV1_FRAME BIT(7) + +#define S1G_CAP9_LINK_ADAPT_PER_CONTROL_RESPONSE BIT(0) + +#define S1G_OPER_CH_WIDTH_PRIMARY_1MHZ BIT(0) +#define S1G_OPER_CH_WIDTH_OPER GENMASK(4, 1) /* Authentication algorithms */ #define WLAN_AUTH_OPEN 0 -- cgit v1.2.3 From d65a977087f94f3bb97f351798d864556063109a Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Tue, 8 Sep 2020 12:03:03 -0700 Subject: nl80211: advertise supported channel width in S1G S1G supports 5 channel widths: 1, 2, 4, 8, and 16. One channel width is allowed per frequency in each operating class, so it makes more sense to advertise the specific channel width allowed. Signed-off-by: Thomas Pedersen Link: https://lore.kernel.org/r/20200908190323.15814-3-thomas@adapt-ip.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 15 +++++++++++++++ include/uapi/linux/nl80211.h | 15 +++++++++++++++ 2 files changed, 30 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 7ad530912b21..2a7561743717 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -96,6 +96,16 @@ struct wiphy; * @IEEE80211_CHAN_NO_10MHZ: 10 MHz bandwidth is not permitted * on this channel. * @IEEE80211_CHAN_NO_HE: HE operation is not permitted on this channel. + * @IEEE80211_CHAN_1MHZ: 1 MHz bandwidth is permitted + * on this channel. + * @IEEE80211_CHAN_2MHZ: 2 MHz bandwidth is permitted + * on this channel. + * @IEEE80211_CHAN_4MHZ: 4 MHz bandwidth is permitted + * on this channel. + * @IEEE80211_CHAN_8MHZ: 8 MHz bandwidth is permitted + * on this channel. + * @IEEE80211_CHAN_16MHZ: 16 MHz bandwidth is permitted + * on this channel. * */ enum ieee80211_channel_flags { @@ -113,6 +123,11 @@ enum ieee80211_channel_flags { IEEE80211_CHAN_NO_20MHZ = 1<<11, IEEE80211_CHAN_NO_10MHZ = 1<<12, IEEE80211_CHAN_NO_HE = 1<<13, + IEEE80211_CHAN_1MHZ = 1<<14, + IEEE80211_CHAN_2MHZ = 1<<15, + IEEE80211_CHAN_4MHZ = 1<<16, + IEEE80211_CHAN_8MHZ = 1<<17, + IEEE80211_CHAN_16MHZ = 1<<18, }; #define IEEE80211_CHAN_NO_HT40 \ diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 0584e0d349f0..4e119c6afa31 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3737,6 +3737,16 @@ enum nl80211_wmm_rule { * @NL80211_FREQUENCY_ATTR_NO_HE: HE operation is not allowed on this channel * in current regulatory domain. * @NL80211_FREQUENCY_ATTR_OFFSET: frequency offset in KHz + * @NL80211_FREQUENCY_ATTR_1MHZ: 1 MHz operation is allowed + * on this channel in current regulatory domain. + * @NL80211_FREQUENCY_ATTR_2MHZ: 2 MHz operation is allowed + * on this channel in current regulatory domain. + * @NL80211_FREQUENCY_ATTR_4MHZ: 4 MHz operation is allowed + * on this channel in current regulatory domain. + * @NL80211_FREQUENCY_ATTR_8MHZ: 8 MHz operation is allowed + * on this channel in current regulatory domain. + * @NL80211_FREQUENCY_ATTR_16MHZ: 16 MHz operation is allowed + * on this channel in current regulatory domain. * @NL80211_FREQUENCY_ATTR_MAX: highest frequency attribute number * currently defined * @__NL80211_FREQUENCY_ATTR_AFTER_LAST: internal use @@ -3768,6 +3778,11 @@ enum nl80211_frequency_attr { NL80211_FREQUENCY_ATTR_WMM, NL80211_FREQUENCY_ATTR_NO_HE, NL80211_FREQUENCY_ATTR_OFFSET, + NL80211_FREQUENCY_ATTR_1MHZ, + NL80211_FREQUENCY_ATTR_2MHZ, + NL80211_FREQUENCY_ATTR_4MHZ, + NL80211_FREQUENCY_ATTR_8MHZ, + NL80211_FREQUENCY_ATTR_16MHZ, /* keep last */ __NL80211_FREQUENCY_ATTR_AFTER_LAST, -- cgit v1.2.3 From 11b34737b18a70c74d5cf13ee58d36e95879013c Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Tue, 8 Sep 2020 12:03:06 -0700 Subject: nl80211: support setting S1G channels S1G channels have a single width defined per frequency, so derive it from the channel flags with ieee80211_s1g_channel_width(). Also support setting an S1G channel where control frequency may differ from operating, and add some basic validation to ensure the control channel is with the operating. Signed-off-by: Thomas Pedersen Link: https://lore.kernel.org/r/20200908190323.15814-6-thomas@adapt-ip.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 2a7561743717..44db9f80e495 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5294,6 +5294,16 @@ ieee80211_channel_to_khz(const struct ieee80211_channel *chan) return MHZ_TO_KHZ(chan->center_freq) + chan->freq_offset; } +/** + * ieee80211_s1g_channel_width - get allowed channel width from @chan + * + * Only allowed for band NL80211_BAND_S1GHZ + * @chan: channel + * Return: The allowed channel width for this center_freq + */ +enum nl80211_chan_width +ieee80211_s1g_channel_width(const struct ieee80211_channel *chan); + /** * ieee80211_channel_to_freq_khz - convert channel number to frequency * @chan: channel number -- cgit v1.2.3 From 291c49ded2fda1fd0d7bd6056de99fe47d2332e6 Mon Sep 17 00:00:00 2001 From: Aloka Dixit Date: Fri, 11 Sep 2020 00:05:29 +0000 Subject: nl80211: Add FILS discovery support FILS discovery attribute, NL80211_ATTR_FILS_DISCOVERY, is nested which supports following parameters as given in IEEE Std 802.11ai-2016, Annex C.3 MIB detail: (1) NL80211_FILS_DISCOVERY_ATTR_INT_MIN - Minimum packet interval (2) NL80211_FILS_DISCOVERY_ATTR_INT_MAX - Maximum packet interval (3) NL80211_FILS_DISCOVERY_ATTR_TMPL - Template data Signed-off-by: Aloka Dixit Link: https://lore.kernel.org/r/20200805011838.28166-2-alokad@codeaurora.org [fix attribute and other names, use NLA_RANGE(), use policy only once] Link: https://lore.kernel.org/r/010101747a7b38a8-306f06b2-9061-4baf-81c1-054a42a18e22-000000@us-west-2.amazonses.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 19 +++++++++++++++++++ include/uapi/linux/nl80211.h | 44 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 44db9f80e495..c90700727945 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1082,6 +1082,23 @@ struct cfg80211_acl_data { struct mac_address mac_addrs[]; }; +/** + * struct cfg80211_fils_discovery - FILS discovery parameters from + * IEEE Std 802.11ai-2016, Annex C.3 MIB detail. + * + * @min_interval: Minimum packet interval in TUs (0 - 10000) + * @max_interval: Maximum packet interval in TUs (0 - 10000) + * @tmpl_len: Template length + * @tmpl: Template data for FILS discovery frame including the action + * frame headers. + */ +struct cfg80211_fils_discovery { + u32 min_interval; + u32 max_interval; + size_t tmpl_len; + const u8 *tmpl; +}; + /** * enum cfg80211_ap_settings_flags - AP settings flags * @@ -1129,6 +1146,7 @@ enum cfg80211_ap_settings_flags { * @he_obss_pd: OBSS Packet Detection settings * @he_bss_color: BSS Color settings * @he_oper: HE operation IE (or %NULL if HE isn't enabled) + * @fils_discovery: FILS discovery transmission parameters */ struct cfg80211_ap_settings { struct cfg80211_chan_def chandef; @@ -1159,6 +1177,7 @@ struct cfg80211_ap_settings { u32 flags; struct ieee80211_he_obss_pd he_obss_pd; struct cfg80211_he_bss_color he_bss_color; + struct cfg80211_fils_discovery fils_discovery; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 4e119c6afa31..ad2bea3b07e3 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2513,6 +2513,10 @@ enum nl80211_commands { * @NL80211_ATTR_HE_6GHZ_CAPABILITY: HE 6 GHz Band Capability element (from * association request when used with NL80211_CMD_NEW_STATION). * + * @NL80211_ATTR_FILS_DISCOVERY: Optional parameter to configure FILS + * discovery. It is a nested attribute, see + * &enum nl80211_fils_discovery_attributes. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -2995,6 +2999,8 @@ enum nl80211_attrs { NL80211_ATTR_HE_6GHZ_CAPABILITY, + NL80211_ATTR_FILS_DISCOVERY, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -5867,6 +5873,9 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_SAE_OFFLOAD_AP: Device wants to do SAE authentication * in AP mode (SAE password is passed as part of the start AP command). * + * @NL80211_EXT_FEATURE_FILS_DISCOVERY: Driver/device supports FILS discovery + * frames transmission + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -5925,6 +5934,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_OPERATING_CHANNEL_VALIDATION, NL80211_EXT_FEATURE_4WAY_HANDSHAKE_AP_PSK, NL80211_EXT_FEATURE_SAE_OFFLOAD_AP, + NL80211_EXT_FEATURE_FILS_DISCOVERY, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, @@ -7019,4 +7029,38 @@ enum nl80211_iftype_akm_attributes { NL80211_IFTYPE_AKM_ATTR_MAX = __NL80211_IFTYPE_AKM_ATTR_LAST - 1, }; +/** + * enum nl80211_fils_discovery_attributes - FILS discovery configuration + * from IEEE Std 802.11ai-2016, Annex C.3 MIB detail. + * + * @__NL80211_FILS_DISCOVERY_ATTR_INVALID: Invalid + * + * @NL80211_FILS_DISCOVERY_ATTR_INT_MIN: Minimum packet interval (u32, TU). + * Allowed range: 0..10000 (TU = Time Unit) + * @NL80211_FILS_DISCOVERY_ATTR_INT_MAX: Maximum packet interval (u32, TU). + * Allowed range: 0..10000 (TU = Time Unit) + * @NL80211_FILS_DISCOVERY_ATTR_TMPL: Template data for FILS discovery action + * frame including the headers. + * + * @__NL80211_FILS_DISCOVERY_ATTR_LAST: Internal + * @NL80211_FILS_DISCOVERY_ATTR_MAX: highest attribute + */ +enum nl80211_fils_discovery_attributes { + __NL80211_FILS_DISCOVERY_ATTR_INVALID, + + NL80211_FILS_DISCOVERY_ATTR_INT_MIN, + NL80211_FILS_DISCOVERY_ATTR_INT_MAX, + NL80211_FILS_DISCOVERY_ATTR_TMPL, + + /* keep last */ + __NL80211_FILS_DISCOVERY_ATTR_LAST, + NL80211_FILS_DISCOVERY_ATTR_MAX = __NL80211_FILS_DISCOVERY_ATTR_LAST - 1 +}; + +/* + * FILS discovery template minimum length with action frame headers and + * mandatory fields. + */ +#define NL80211_FILS_DISCOVERY_TMPL_MIN_LEN 42 + #endif /* __LINUX_NL80211_H */ -- cgit v1.2.3 From 295b02c4be74bebf988593b8322369513fcecf68 Mon Sep 17 00:00:00 2001 From: Aloka Dixit Date: Fri, 11 Sep 2020 00:05:31 +0000 Subject: mac80211: Add FILS discovery support This patch adds mac80211 support to configure FILS discovery transmission. Changes include functions to store and retrieve FILS discovery template, minimum and maximum packet intervals. Signed-off-by: Aloka Dixit Link: https://lore.kernel.org/r/20200805011838.28166-3-alokad@codeaurora.org [remove SUPPORTS_FILS_DISCOVERY, driver can just set wiphy info] Link: https://lore.kernel.org/r/010101747a7b3cbb-6edaa89c-436d-4391-8765-61456d7f5f4e-000000@us-west-2.amazonses.com Signed-off-by: Johannes Berg --- include/net/mac80211.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index d3c43420779c..9381f00d0942 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -317,6 +317,7 @@ struct ieee80211_vif_chanctx_switch { * @BSS_CHANGED_TWT: TWT status changed * @BSS_CHANGED_HE_OBSS_PD: OBSS Packet Detection status changed. * @BSS_CHANGED_HE_BSS_COLOR: BSS Color has changed + * @BSS_CHANGED_FILS_DISCOVERY: FILS discovery status changed. * */ enum ieee80211_bss_change { @@ -350,6 +351,7 @@ enum ieee80211_bss_change { BSS_CHANGED_TWT = 1<<27, BSS_CHANGED_HE_OBSS_PD = 1<<28, BSS_CHANGED_HE_BSS_COLOR = 1<<29, + BSS_CHANGED_FILS_DISCOVERY = 1<<30, /* when adding here, make sure to change ieee80211_reconfig */ }; @@ -490,6 +492,18 @@ struct ieee80211_ftm_responder_params { size_t civicloc_len; }; +/** + * struct ieee80211_fils_discovery - FILS discovery parameters from + * IEEE Std 802.11ai-2016, Annex C.3 MIB detail. + * + * @min_interval: Minimum packet interval in TUs (0 - 10000) + * @max_interval: Maximum packet interval in TUs (0 - 10000) + */ +struct ieee80211_fils_discovery { + u32 min_interval; + u32 max_interval; +}; + /** * struct ieee80211_bss_conf - holds the BSS's changing parameters * @@ -607,6 +621,7 @@ struct ieee80211_ftm_responder_params { * @he_oper: HE operation information of the AP we are connected to * @he_obss_pd: OBSS Packet Detection parameters. * @he_bss_color: BSS coloring settings, if BSS supports HE + * @fils_discovery: FILS discovery configuration */ struct ieee80211_bss_conf { const u8 *bssid; @@ -674,6 +689,7 @@ struct ieee80211_bss_conf { } he_oper; struct ieee80211_he_obss_pd he_obss_pd; struct cfg80211_he_bss_color he_bss_color; + struct ieee80211_fils_discovery fils_discovery; }; /** @@ -6629,4 +6645,15 @@ u32 ieee80211_calc_tx_airtime(struct ieee80211_hw *hw, */ bool ieee80211_set_hw_80211_encap(struct ieee80211_vif *vif, bool enable); +/** + * ieee80211_get_fils_discovery_tmpl - Get FILS discovery template. + * @hw: pointer obtained from ieee80211_alloc_hw(). + * @vif: &struct ieee80211_vif pointer from the add_interface callback. + * + * The driver is responsible for freeing the returned skb. + * + * Return: FILS discovery template. %NULL on error. + */ +struct sk_buff *ieee80211_get_fils_discovery_tmpl(struct ieee80211_hw *hw, + struct ieee80211_vif *vif); #endif /* MAC80211_H */ -- cgit v1.2.3 From 7443dcd1f1718a355e9c4ebeb7e95c3f9f27bb5f Mon Sep 17 00:00:00 2001 From: Aloka Dixit Date: Fri, 11 Sep 2020 00:33:00 +0000 Subject: nl80211: Unsolicited broadcast probe response support This patch adds new attributes to support unsolicited broadcast probe response transmission used for in-band discovery in 6GHz band (IEEE P802.11ax/D6.0 26.17.2.3.2, AP behavior for fast passive scanning). The new attribute, NL80211_ATTR_UNSOL_BCAST_PROBE_RESP, is nested which supports following parameters: (1) NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_INT - Packet interval (2) NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_TMPL - Template data Signed-off-by: Aloka Dixit Link: https://lore.kernel.org/r/010101747a946698-aac263ae-2ed3-4dab-9590-0bc7131214e1-000000@us-west-2.amazonses.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 18 ++++++++++++++++++ include/uapi/linux/nl80211.h | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index c90700727945..93d666a571da 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1099,6 +1099,22 @@ struct cfg80211_fils_discovery { const u8 *tmpl; }; +/** + * struct cfg80211_unsol_bcast_probe_resp - Unsolicited broadcast probe + * response parameters in 6GHz. + * + * @interval: Packet interval in TUs. Maximum allowed is 20 TU, as mentioned + * in IEEE P802.11ax/D6.0 26.17.2.3.2 - AP behavior for fast passive + * scanning + * @tmpl_len: Template length + * @tmpl: Template data for probe response + */ +struct cfg80211_unsol_bcast_probe_resp { + u32 interval; + size_t tmpl_len; + const u8 *tmpl; +}; + /** * enum cfg80211_ap_settings_flags - AP settings flags * @@ -1147,6 +1163,7 @@ enum cfg80211_ap_settings_flags { * @he_bss_color: BSS Color settings * @he_oper: HE operation IE (or %NULL if HE isn't enabled) * @fils_discovery: FILS discovery transmission parameters + * @unsol_bcast_probe_resp: Unsolicited broadcast probe response parameters */ struct cfg80211_ap_settings { struct cfg80211_chan_def chandef; @@ -1178,6 +1195,7 @@ struct cfg80211_ap_settings { struct ieee80211_he_obss_pd he_obss_pd; struct cfg80211_he_bss_color he_bss_color; struct cfg80211_fils_discovery fils_discovery; + struct cfg80211_unsol_bcast_probe_resp unsol_bcast_probe_resp; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index ad2bea3b07e3..bdc90b8dfd24 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2517,6 +2517,10 @@ enum nl80211_commands { * discovery. It is a nested attribute, see * &enum nl80211_fils_discovery_attributes. * + * @NL80211_ATTR_UNSOL_BCAST_PROBE_RESP: Optional parameter to configure + * unsolicited broadcast probe response. It is a nested attribute, see + * &enum nl80211_unsol_bcast_probe_resp_attributes. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3001,6 +3005,8 @@ enum nl80211_attrs { NL80211_ATTR_FILS_DISCOVERY, + NL80211_ATTR_UNSOL_BCAST_PROBE_RESP, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -5876,6 +5882,9 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_FILS_DISCOVERY: Driver/device supports FILS discovery * frames transmission * + * @NL80211_EXT_FEATURE_UNSOL_BCAST_PROBE_RESP: Driver/device supports + * unsolicited broadcast probe response transmission + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -5935,6 +5944,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_4WAY_HANDSHAKE_AP_PSK, NL80211_EXT_FEATURE_SAE_OFFLOAD_AP, NL80211_EXT_FEATURE_FILS_DISCOVERY, + NL80211_EXT_FEATURE_UNSOL_BCAST_PROBE_RESP, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, @@ -7063,4 +7073,30 @@ enum nl80211_fils_discovery_attributes { */ #define NL80211_FILS_DISCOVERY_TMPL_MIN_LEN 42 +/** + * enum nl80211_unsol_bcast_probe_resp_attributes - Unsolicited broadcast probe + * response configuration. Applicable only in 6GHz. + * + * @__NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_INVALID: Invalid + * + * @NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_INT: Maximum packet interval (u32, TU). + * Allowed range: 0..20 (TU = Time Unit). IEEE P802.11ax/D6.0 + * 26.17.2.3.2 (AP behavior for fast passive scanning). + * @NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_TMPL: Unsolicited broadcast probe response + * frame template (binary). + * + * @__NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_LAST: Internal + * @NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_MAX: highest attribute + */ +enum nl80211_unsol_bcast_probe_resp_attributes { + __NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_INVALID, + + NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_INT, + NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_TMPL, + + /* keep last */ + __NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_LAST, + NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_MAX = + __NL80211_UNSOL_BCAST_PROBE_RESP_ATTR_LAST - 1 +}; #endif /* __LINUX_NL80211_H */ -- cgit v1.2.3 From 632189a0180fdaae6715c83c68cc5c8998d6c841 Mon Sep 17 00:00:00 2001 From: Aloka Dixit Date: Fri, 11 Sep 2020 00:33:01 +0000 Subject: mac80211: Unsolicited broadcast probe response support This patch adds mac80211 support to configure unsolicited broadcast probe response transmission for in-band discovery in 6GHz. Changes include functions to store and retrieve probe response template, and packet interval (0 - 20 TUs). Setting interval to 0 disables the unsolicited broadcast probe response transmission. Signed-off-by: Aloka Dixit Link: https://lore.kernel.org/r/010101747a946b35-ad25858a-1f1f-48df-909e-dc7bf26d9169-000000@us-west-2.amazonses.com Signed-off-by: Johannes Berg --- include/net/mac80211.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 9381f00d0942..f0908b567d65 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -318,6 +318,8 @@ struct ieee80211_vif_chanctx_switch { * @BSS_CHANGED_HE_OBSS_PD: OBSS Packet Detection status changed. * @BSS_CHANGED_HE_BSS_COLOR: BSS Color has changed * @BSS_CHANGED_FILS_DISCOVERY: FILS discovery status changed. + * @BSS_CHANGED_UNSOL_BCAST_PROBE_RESP: Unsolicited broadcast probe response + * status changed. * */ enum ieee80211_bss_change { @@ -352,6 +354,7 @@ enum ieee80211_bss_change { BSS_CHANGED_HE_OBSS_PD = 1<<28, BSS_CHANGED_HE_BSS_COLOR = 1<<29, BSS_CHANGED_FILS_DISCOVERY = 1<<30, + BSS_CHANGED_UNSOL_BCAST_PROBE_RESP = 1<<31, /* when adding here, make sure to change ieee80211_reconfig */ }; @@ -622,6 +625,8 @@ struct ieee80211_fils_discovery { * @he_obss_pd: OBSS Packet Detection parameters. * @he_bss_color: BSS coloring settings, if BSS supports HE * @fils_discovery: FILS discovery configuration + * @unsol_bcast_probe_resp_interval: Unsolicited broadcast probe response + * interval. */ struct ieee80211_bss_conf { const u8 *bssid; @@ -690,6 +695,7 @@ struct ieee80211_bss_conf { struct ieee80211_he_obss_pd he_obss_pd; struct cfg80211_he_bss_color he_bss_color; struct ieee80211_fils_discovery fils_discovery; + u32 unsol_bcast_probe_resp_interval; }; /** @@ -6656,4 +6662,18 @@ bool ieee80211_set_hw_80211_encap(struct ieee80211_vif *vif, bool enable); */ struct sk_buff *ieee80211_get_fils_discovery_tmpl(struct ieee80211_hw *hw, struct ieee80211_vif *vif); + +/** + * ieee80211_get_unsol_bcast_probe_resp_tmpl - Get unsolicited broadcast + * probe response template. + * @hw: pointer obtained from ieee80211_alloc_hw(). + * @vif: &struct ieee80211_vif pointer from the add_interface callback. + * + * The driver is responsible for freeing the returned skb. + * + * Return: Unsolicited broadcast probe response template. %NULL on error. + */ +struct sk_buff * +ieee80211_get_unsol_bcast_probe_resp_tmpl(struct ieee80211_hw *hw, + struct ieee80211_vif *vif); #endif /* MAC80211_H */ -- cgit v1.2.3 From 9ff167e178224069221a5771c12dfea9737bf3a3 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 18 Sep 2020 13:19:22 +0200 Subject: cfg80211: add missing kernel-doc for S1G band capabilities Add missing kernel-doc for the S1G band capabilities in the per band data. Link: https://lore.kernel.org/r/20200918131921.08c893cd73a1.Id71583c37baca8a9a3329426e02b66d9ab65ac03@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 93d666a571da..10c2cc8f0efc 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -465,6 +465,7 @@ struct ieee80211_sta_s1g_cap { * @ht_cap: HT capabilities in this band * @vht_cap: VHT capabilities in this band * @edmg_cap: EDMG capabilities in this band + * @s1g_cap: S1G capabilities in this band (S1B band only, of course) * @n_iftype_data: number of iftype data entries * @iftype_data: interface type data entries. Note that the bits in * @types_mask inside this structure cannot overlap (i.e. only -- cgit v1.2.3 From 7fba53ebb5b2d89d95b697f4c42c73c6fb7ba0c6 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 18 Sep 2020 13:21:16 +0200 Subject: mac80211: fix some encapsulation offload kernel-doc Add a missing kernel-doc entry for the offload_flags, and correct the name of the update_vif_offload method. Link: https://lore.kernel.org/r/20200918132115.d46a0915ba8a.Ibba536d04a5a5fb655f8ef6e51b247457bfda4ca@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index f0908b567d65..e90089d104b0 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1690,6 +1690,8 @@ enum ieee80211_offload_flags { * @txq: the multicast data TX queue (if driver uses the TXQ abstraction) * @txqs_stopped: per AC flag to indicate that intermediate TXQs are stopped, * protected by fq->lock. + * @offload_flags: 802.3 -> 802.11 enapsulation offload flags, see + * &enum ieee80211_offload_flags. */ struct ieee80211_vif { enum nl80211_iftype type; @@ -3864,7 +3866,7 @@ enum ieee80211_reconfig_type { * @set_tid_config: Apply TID specific configurations. This callback may sleep. * @reset_tid_config: Reset TID specific configuration for the peer. * This callback may sleep. - * @update_vif_config: Update virtual interface offload flags + * @update_vif_offload: Update virtual interface offload flags * This callback may sleep. * @sta_set_4addr: Called to notify the driver when a station starts/stops using * 4-address mode -- cgit v1.2.3 From f92970c694b36a4dbac2b650b173c78c0f0954cc Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Thu, 17 Sep 2020 18:13:23 -0700 Subject: devlink: add timeout information to status_notify Add a timeout element to the DEVLINK_CMD_FLASH_UPDATE_STATUS netlink message for use by a userland utility to show that a particular firmware flash activity may take a long but bounded time to finish. Also add a handy helper for drivers to make use of the new timeout value. UI usage hints: - if non-zero, add timeout display to the end of the status line [component] status_msg ( Xm Ys : Am Bs ) using the timeout value for Am Bs and updating the Xm Ys every second - if the timeout expires while awaiting the next update, display something like [component] status_msg ( timeout reached : Am Bs ) - if new status notify messages are received, remove the timeout and start over Signed-off-by: Shannon Nelson Reviewed-by: Jakub Kicinski Reviewed-by: Jacob Keller Signed-off-by: David S. Miller --- include/net/devlink.h | 4 ++++ include/uapi/linux/devlink.h | 3 +++ 2 files changed, 7 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 48b1c1ef1ebd..be132c17fbcc 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1403,6 +1403,10 @@ void devlink_flash_update_status_notify(struct devlink *devlink, const char *component, unsigned long done, unsigned long total); +void devlink_flash_update_timeout_notify(struct devlink *devlink, + const char *status_msg, + const char *component, + unsigned long timeout); int devlink_traps_register(struct devlink *devlink, const struct devlink_trap *traps, diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 631f5bdf1707..a2ecc8b00611 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -462,6 +462,9 @@ enum devlink_attr { DEVLINK_ATTR_PORT_EXTERNAL, /* u8 */ DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, /* u32 */ + + DEVLINK_ATTR_FLASH_UPDATE_STATUS_TIMEOUT, /* u64 */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, -- cgit v1.2.3 From 6700acc5f1fe97b5705832f2678cba9e9756a0dc Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Thu, 17 Sep 2020 18:13:24 -0700 Subject: devlink: collect flash notify params into a struct The dev flash status notify function parameter lists are getting rather long, so add a struct to be filled and passed rather than continuously changing the function signatures. Signed-off-by: Shannon Nelson Reviewed-by: Jacob Keller Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/devlink.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index be132c17fbcc..73065f07bf17 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -391,6 +391,25 @@ struct devlink_param_gset_ctx { enum devlink_param_cmode cmode; }; +/** + * struct devlink_flash_notify - devlink dev flash notify data + * @status_msg: current status string + * @component: firmware component being updated + * @done: amount of work completed of total amount + * @total: amount of work expected to be done + * @timeout: expected max timeout in seconds + * + * These are values to be given to userland to be displayed in order + * to show current activity in a firmware update process. + */ +struct devlink_flash_notify { + const char *status_msg; + const char *component; + unsigned long done; + unsigned long total; + unsigned long timeout; +}; + /** * struct devlink_param - devlink configuration parameter data * @name: name of the parameter -- cgit v1.2.3 From daef1ee3798b25e8464b8eb618eaa74b8f423ac7 Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Fri, 18 Sep 2020 08:17:27 +0700 Subject: tipc: introduce encryption master key In addition to the supported cluster & per-node encryption keys for the en/decryption of TIPC messages, we now introduce one option for user to set a cluster key as 'master key', which is simply a symmetric key like the former but has a longer life cycle. It has two purposes: - Authentication of new member nodes in the cluster. New nodes, having no knowledge of current session keys in the cluster will still be able to join the cluster as long as they know the master key. This is because all neighbor discovery (LINK_CONFIG) messages must be encrypted with this key. - Encryption of session encryption keys during automatic exchange and update of those.This is a feature we will introduce in a later commit in this series. We insert the new key into the currently unused slot 0 in the key array and start using it immediately once the user has set it. After joining, a node only knowing the master key should be fully communicable to existing nodes in the cluster, although those nodes may have their own session keys activated (i.e. not the master one). To support this, we define a 'grace period', starting from the time a node itself reports having no RX keys, so the existing nodes will use the master key for encryption instead. The grace period can be extended but will automatically stop after e.g. 5 seconds without a new report. This is also the basis for later key exchanging feature as the new node will be impossible to decrypt anything without the support from master key. For user to set a master key, we define a new netlink flag - 'TIPC_NLA_NODE_KEY_MASTER', so it can be added to the current 'set key' netlink command to specify the setting key to be a master key. Above all, the traditional cluster/per-node key mechanism is guaranteed to work when user comes not to use this master key option. This is also compatible to legacy nodes without the feature supported. Even this master key can be updated without any interruption of cluster connectivity but is so is needed, this has to be coordinated and set by the user. Acked-by: Jon Maloy Signed-off-by: Tuong Lien Signed-off-by: David S. Miller --- include/uapi/linux/tipc_netlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h index dc0d23a50e69..d484baa9d365 100644 --- a/include/uapi/linux/tipc_netlink.h +++ b/include/uapi/linux/tipc_netlink.h @@ -165,6 +165,7 @@ enum { TIPC_NLA_NODE_UP, /* flag */ TIPC_NLA_NODE_ID, /* data */ TIPC_NLA_NODE_KEY, /* data */ + TIPC_NLA_NODE_KEY_MASTER, /* flag */ __TIPC_NLA_NODE_MAX, TIPC_NLA_NODE_MAX = __TIPC_NLA_NODE_MAX - 1 -- cgit v1.2.3 From 23700da29b83e859a8c3727fddd33ba74c4f3a39 Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Fri, 18 Sep 2020 08:17:29 +0700 Subject: tipc: add automatic rekeying for encryption key Rekeying is required for security since a key is less secure when using for a long time. Also, key will be detached when its nonce value (or seqno ...) is exhausted. We now make the rekeying process automatic and configurable by user. Basically, TIPC will at a specific interval generate a new key by using the kernel 'Random Number Generator' cipher, then attach it as the node TX key and securely distribute to others in the cluster as RX keys (- the key exchange). The automatic key switching will then take over, and make the new key active shortly. Afterwards, the traffic from this node will be encrypted with the new session key. The same can happen in peer nodes but not necessarily at the same time. For simplicity, the automatically generated key will be initiated as a per node key. It is not too hard to also support a cluster key rekeying (e.g. a given node will generate a unique cluster key and update to the others in the cluster...), but that doesn't bring much benefit, while a per-node key is even more secure. We also enable user to force a rekeying or change the rekeying interval via netlink, the new 'set key' command option: 'TIPC_NLA_NODE_REKEYING' is added for these purposes as follows: - A value >= 1 will be set as the rekeying interval (in minutes); - A value of 0 will disable the rekeying; - A value of 'TIPC_REKEYING_NOW' (~0) will force an immediate rekeying; The default rekeying interval is (60 * 24) minutes i.e. done every day. There isn't any restriction for the value but user shouldn't set it too small or too large which results in an "ineffective" rekeying (thats ok for testing though). Acked-by: Jon Maloy Signed-off-by: Tuong Lien Signed-off-by: David S. Miller --- include/uapi/linux/tipc.h | 2 ++ include/uapi/linux/tipc_netlink.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h index add01db1daef..80ea15e12113 100644 --- a/include/uapi/linux/tipc.h +++ b/include/uapi/linux/tipc.h @@ -254,6 +254,8 @@ static inline int tipc_aead_key_size(struct tipc_aead_key *key) return sizeof(*key) + key->keylen; } +#define TIPC_REKEYING_NOW (~0U) + /* The macros and functions below are deprecated: */ diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h index d484baa9d365..d847dd671d79 100644 --- a/include/uapi/linux/tipc_netlink.h +++ b/include/uapi/linux/tipc_netlink.h @@ -166,6 +166,7 @@ enum { TIPC_NLA_NODE_ID, /* data */ TIPC_NLA_NODE_KEY, /* data */ TIPC_NLA_NODE_KEY_MASTER, /* flag */ + TIPC_NLA_NODE_REKEYING, /* u32 */ __TIPC_NLA_NODE_MAX, TIPC_NLA_NODE_MAX = __TIPC_NLA_NODE_MAX - 1 -- cgit v1.2.3 From 3753d9779038ab011e01b949253492aaa37bf57a Mon Sep 17 00:00:00 2001 From: Mahesh Bandewar Date: Thu, 17 Sep 2020 22:08:32 -0700 Subject: net: fix build without CONFIG_SYSCTL definition Earlier commit 316cdaa1158a ("net: add option to not create fall-back tunnels in root-ns as well") removed the CONFIG_SYSCTL to enable the kernel-commandline to work. However, this variable gets defined only when CONFIG_SYSCTL option is selected. With this change the behavior would default to creating fall-back tunnels in all namespaces when CONFIG_SYSCTL is not selected and the kernel commandline option will be ignored. Fixes: 316cdaa1158a ("net: add option to not create fall-back tunnels in root-ns as well") Signed-off-by: Mahesh Bandewar Reported-by: Randy Dunlap Reported-by: kernel test robot Acked-by: Randy Dunlap # build-tested Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 909b1fbb0481..fef0eb96cf69 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -634,8 +634,9 @@ extern int sysctl_devconf_inherit_init_net; */ static inline bool net_has_fallback_tunnels(const struct net *net) { - return (net == &init_net && sysctl_fb_tunnels_only_for_init_net == 1) || - !sysctl_fb_tunnels_only_for_init_net; + return !IS_ENABLED(CONFIG_SYSCTL) || + !sysctl_fb_tunnels_only_for_init_net || + (net == &init_net && sysctl_fb_tunnels_only_for_init_net == 1); } static inline int netdev_queue_numa_node_read(const struct netdev_queue *q) -- cgit v1.2.3 From 6d23d831e9bd0b1d2bcd9a1ecdc6ac8e6d162c36 Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Fri, 18 Sep 2020 17:48:01 +0800 Subject: ptp_qoriq: support FIPER3 The FIPER3 (fixed interval period pulse generator) is supported on DPAA2 and ENETC network controller hardware. This patch is to support it in ptp_qoriq driver. Signed-off-by: Yangbo Lu Acked-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/linux/fsl/ptp_qoriq.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/fsl/ptp_qoriq.h b/include/linux/fsl/ptp_qoriq.h index 884b8f8ca06d..01acebe37fab 100644 --- a/include/linux/fsl/ptp_qoriq.h +++ b/include/linux/fsl/ptp_qoriq.h @@ -136,6 +136,7 @@ struct ptp_qoriq_registers { #define DEFAULT_TMR_PRSC 2 #define DEFAULT_FIPER1_PERIOD 1000000000 #define DEFAULT_FIPER2_PERIOD 1000000000 +#define DEFAULT_FIPER3_PERIOD 1000000000 struct ptp_qoriq { void __iomem *base; @@ -147,6 +148,7 @@ struct ptp_qoriq { struct dentry *debugfs_root; struct device *dev; bool extts_fifo_support; + bool fiper3_support; int irq; int phc_index; u32 tclk_period; /* nanoseconds */ @@ -155,6 +157,7 @@ struct ptp_qoriq { u32 cksel; u32 tmr_fiper1; u32 tmr_fiper2; + u32 tmr_fiper3; u32 (*read)(unsigned __iomem *addr); void (*write)(unsigned __iomem *addr, u32 val); }; -- cgit v1.2.3 From 881321b6ed9e983ad733268404245fcf0be0b23c Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 18 Sep 2020 13:57:51 +0300 Subject: net: mscc: ocelot: make ocelot_init_timestamp take a const struct ptp_clock_info It is a good measure to ensure correctness if the structures that are meant to remain constant are only processed by functions that thake constant arguments. Signed-off-by: Vladimir Oltean Reviewed-by: Alexandre Belloni Signed-off-by: David S. Miller --- include/soc/mscc/ocelot_ptp.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/soc/mscc/ocelot_ptp.h b/include/soc/mscc/ocelot_ptp.h index 4a6b2f71b6b2..6a7388fa7cc5 100644 --- a/include/soc/mscc/ocelot_ptp.h +++ b/include/soc/mscc/ocelot_ptp.h @@ -53,6 +53,7 @@ int ocelot_ptp_verify(struct ptp_clock_info *ptp, unsigned int pin, enum ptp_pin_function func, unsigned int chan); int ocelot_ptp_enable(struct ptp_clock_info *ptp, struct ptp_clock_request *rq, int on); -int ocelot_init_timestamp(struct ocelot *ocelot, struct ptp_clock_info *info); +int ocelot_init_timestamp(struct ocelot *ocelot, + const struct ptp_clock_info *info); int ocelot_deinit_timestamp(struct ocelot *ocelot); #endif -- cgit v1.2.3 From e14e05e71d106aef973e2cf100e540d911703a6e Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Fri, 18 Sep 2020 21:11:01 +0200 Subject: net: devlink: regions: Add a priv member to the regions ops struct The driver may have multiple regions which can be dumped using one function. However, for this to work, additional information is needed. Add a priv member to the ops structure for the driver to use however it likes. Reviewed-by: Florian Fainelli Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/net/devlink.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 73065f07bf17..b68e483d9267 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -561,12 +561,14 @@ struct devlink_info_req; * the data variable must be updated to point to the snapshot data. * The function will be called while the devlink instance lock is * held. + * @priv: Pointer to driver private data for the region operation */ struct devlink_region_ops { const char *name; void (*destructor)(const void *data); int (*snapshot)(struct devlink *devlink, struct netlink_ext_ack *extack, u8 **data); + void *priv; }; struct devlink_fmsg; -- cgit v1.2.3 From d4602a9f47196dd62deba66ec361b5897f1ae62b Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Fri, 18 Sep 2020 21:11:02 +0200 Subject: net: devlink: region: Pass the region ops to the snapshot function Pass the region to be snapshotted to the function performing the snapshot. This allows one function to operate on numerous regions. v4: Add missing kerneldoc for ICE Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/devlink.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index b68e483d9267..4883dbae7faf 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -566,7 +566,9 @@ struct devlink_info_req; struct devlink_region_ops { const char *name; void (*destructor)(const void *data); - int (*snapshot)(struct devlink *devlink, struct netlink_ext_ack *extack, + int (*snapshot)(struct devlink *devlink, + const struct devlink_region_ops *ops, + struct netlink_ext_ack *extack, u8 **data); void *priv; }; -- cgit v1.2.3 From ccc3e6b0191c58784c4b4ffc735f81970df33a11 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Fri, 18 Sep 2020 21:11:03 +0200 Subject: net: dsa: Add helper to convert from devlink to ds Given a devlink instance, return the dsa switch it is associated to. Reviewed-by: Florian Fainelli Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/net/dsa.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 75c8fac82017..42ae6d4d9d43 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -664,6 +664,13 @@ struct dsa_devlink_priv { struct dsa_switch *ds; }; +static inline struct dsa_switch *dsa_devlink_to_ds(struct devlink *dl) +{ + struct dsa_devlink_priv *dl_priv = devlink_priv(dl); + + return dl_priv->ds; +} + struct dsa_switch_driver { struct list_head list; const struct dsa_switch_ops *ops; -- cgit v1.2.3 From 97c82c23135187878acea76bc5f0b03007e17ac7 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Fri, 18 Sep 2020 21:11:04 +0200 Subject: net: dsa: Add devlink regions support to DSA Allow DSA drivers to make use of devlink regions, via simple wrappers. Reviewed-by: Florian Fainelli Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/net/dsa.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 42ae6d4d9d43..431efb5098be 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -658,6 +658,12 @@ void dsa_devlink_resource_occ_get_register(struct dsa_switch *ds, void *occ_get_priv); void dsa_devlink_resource_occ_get_unregister(struct dsa_switch *ds, u64 resource_id); +struct devlink_region * +dsa_devlink_region_create(struct dsa_switch *ds, + const struct devlink_region_ops *ops, + u32 region_max_snapshots, u64 region_size); +void dsa_devlink_region_destroy(struct devlink_region *region); + struct dsa_port *dsa_port_from_netdev(struct net_device *netdev); struct dsa_devlink_priv { -- cgit v1.2.3 From 0f06b855a93c3b449253b91abc94c4d483af0a44 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Fri, 18 Sep 2020 21:11:08 +0200 Subject: net: dsa: wire up devlink info get Allow the DSA drivers to implement the devlink call to get info info, e.g. driver name, firmware version, ASIC ID, etc. v2: Combine declaration and the assignment on a single line. Reviewed-by: Florian Fainelli Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/net/dsa.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 431efb5098be..d16057c5987a 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -612,11 +612,14 @@ struct dsa_switch_ops { bool (*port_rxtstamp)(struct dsa_switch *ds, int port, struct sk_buff *skb, unsigned int type); - /* Devlink parameters */ + /* Devlink parameters, etc */ int (*devlink_param_get)(struct dsa_switch *ds, u32 id, struct devlink_param_gset_ctx *ctx); int (*devlink_param_set)(struct dsa_switch *ds, u32 id, struct devlink_param_gset_ctx *ctx); + int (*devlink_info_get)(struct dsa_switch *ds, + struct devlink_info_req *req, + struct netlink_ext_ack *extack); /* * MTU change functionality. Switches can also adjust their MRU through -- cgit v1.2.3 From 55f13311785cebd60b9bab9ca7fd64205436c462 Mon Sep 17 00:00:00 2001 From: Dan Murphy Date: Fri, 18 Sep 2020 14:14:51 -0500 Subject: ethtool: Add 100base-FX link mode entries Add entries for the 100base-FX full and half duplex supported modes. $ ethtool eth0 Supported ports: [ FIBRE ] Supported link modes: 100baseFX/Half 100baseFX/Full Supported pause frame use: Symmetric Receive-only Supports auto-negotiation: No Supported FEC modes: Not reported Advertised link modes: 100baseFX/Half 100baseFX/Full Advertised pause frame use: No Advertised auto-negotiation: No Advertised FEC modes: Not reported Speed: 100Mb/s Duplex: Full Auto-negotiation: off Port: MII PHYAD: 1 Transceiver: external Supports Wake-on: gs Wake-on: d SecureOn password: 00:00:00:00:00:00 Current message level: 0x00000000 (0) Link detected: yes Signed-off-by: Dan Murphy Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/uapi/linux/ethtool.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index b4f2d134e713..9ca87bc73c44 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1617,6 +1617,8 @@ enum ethtool_link_mode_bit_indices { ETHTOOL_LINK_MODE_400000baseLR4_ER4_FR4_Full_BIT = 87, ETHTOOL_LINK_MODE_400000baseDR4_Full_BIT = 88, ETHTOOL_LINK_MODE_400000baseCR4_Full_BIT = 89, + ETHTOOL_LINK_MODE_100baseFX_Half_BIT = 90, + ETHTOOL_LINK_MODE_100baseFX_Full_BIT = 91, /* must be last entry */ __ETHTOOL_LINK_MODE_MASK_NBITS }; -- cgit v1.2.3 From bbed0bbdddaf46260aeb1a8910a3b32941e321a2 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 21 Sep 2020 03:10:30 +0300 Subject: net: dsa: tag_8021q: add VLANs to the master interface too The whole purpose of tag_8021q is to send VLAN-tagged traffic to the CPU, from which the driver can decode the source port and switch id. Currently this only works if the VLAN filtering on the master is disabled. Change that by explicitly adding code to tag_8021q.c to add the VLANs corresponding to the tags to the filter of the master interface. Because we now need to call vlan_vid_add, then we also need to hold the RTNL mutex. Propagate that requirement to the callers of dsa_8021q_setup and modify the existing call sites as appropriate. Note that one call path, sja1105_best_effort_vlan_filtering_set -> sja1105_vlan_filtering -> sja1105_setup_8021q_tagging, was already holding this lock. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/dsa/8021q.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index 2b003ae9fb38..88cd72dfa4e0 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -31,6 +31,8 @@ struct dsa_8021q_context { const struct dsa_8021q_ops *ops; struct dsa_switch *ds; struct list_head crosschip_links; + /* EtherType of RX VID, used for filtering on master interface */ + __be16 proto; }; #define DSA_8021Q_N_SUBVLAN 8 -- cgit v1.2.3 From 49347755a84052fd1317c3897c7cea954c8f89fe Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Wed, 16 Sep 2020 00:34:53 +0200 Subject: can: include: fix spelling mistakes This patch fixes spelling erros found by "codespell" in the include/linux/can subtree. Link: https://lore.kernel.org/r/20200915223527.1417033-4-mkl@pengutronix.de Signed-off-by: Marc Kleine-Budde --- include/linux/can/core.h | 2 +- include/linux/can/dev.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/can/core.h b/include/linux/can/core.h index e20a0cd09ba5..7da9f1f82e8e 100644 --- a/include/linux/can/core.h +++ b/include/linux/can/core.h @@ -2,7 +2,7 @@ /* * linux/can/core.h * - * Protoypes and definitions for CAN protocol modules using the PF_CAN core + * Prototypes and definitions for CAN protocol modules using the PF_CAN core * * Authors: Oliver Hartkopp * Urs Thuermann diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index 5e3d45525bd3..516892566ac9 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -108,7 +108,7 @@ static inline bool can_skb_headroom_valid(struct net_device *dev, skb->ip_summed = CHECKSUM_UNNECESSARY; - /* preform proper loopback on capable devices */ + /* perform proper loopback on capable devices */ if (dev->flags & IFF_ECHO) skb->pkt_type = PACKET_LOOPBACK; else -- cgit v1.2.3 From 80a71815d8cd1be6481ad16fad3167f095045a06 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Wed, 16 Sep 2020 00:35:01 +0200 Subject: can: dev: can_put_echo_skb(): propagate error in case of errors The function can_put_echo_skb() can fail for several reasons. It may fail due to OOM, but when it fails it's usually due to locking problems in the driver. In order to help developing and debugging of new drivers propagate error value in case of errors. Link: https://lore.kernel.org/r/20200915223527.1417033-12-mkl@pengutronix.de Signed-off-by: Marc Kleine-Budde --- include/linux/can/dev.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index 516892566ac9..ed0482b2f4b2 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -201,8 +201,8 @@ void can_bus_off(struct net_device *dev); void can_change_state(struct net_device *dev, struct can_frame *cf, enum can_state tx_state, enum can_state rx_state); -void can_put_echo_skb(struct sk_buff *skb, struct net_device *dev, - unsigned int idx); +int can_put_echo_skb(struct sk_buff *skb, struct net_device *dev, + unsigned int idx); struct sk_buff *__can_get_echo_skb(struct net_device *dev, unsigned int idx, u8 *len_ptr); unsigned int can_get_echo_skb(struct net_device *dev, unsigned int idx); -- cgit v1.2.3 From 728fc9ff73d3f25220f6b8a52aaf063ec51ef294 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Wed, 16 Sep 2020 00:35:22 +0200 Subject: can: rx-offload: can_rx_offload_add_manual(): add new initialization function This patch adds a new initialization function: can_rx_offload_add_manual() It should be used to add support rx-offload to a driver, if the callback mechanism should not be used. Use e.g. can_rx_offload_queue_sorted() to queue skbs into rx-offload. Link: https://lore.kernel.org/r/20200915223527.1417033-33-mkl@pengutronix.de Signed-off-by: Marc Kleine-Budde --- include/linux/can/rx-offload.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/can/rx-offload.h b/include/linux/can/rx-offload.h index 1b78a0cfb615..f1b38088b765 100644 --- a/include/linux/can/rx-offload.h +++ b/include/linux/can/rx-offload.h @@ -35,6 +35,9 @@ int can_rx_offload_add_timestamp(struct net_device *dev, int can_rx_offload_add_fifo(struct net_device *dev, struct can_rx_offload *offload, unsigned int weight); +int can_rx_offload_add_manual(struct net_device *dev, + struct can_rx_offload *offload, + unsigned int weight); int can_rx_offload_irq_offload_timestamp(struct can_rx_offload *offload, u64 reg); int can_rx_offload_irq_offload_fifo(struct can_rx_offload *offload); -- cgit v1.2.3 From 2af30f115d6957f372ce3096c7198763ff253d97 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 21 Sep 2020 13:12:17 +0100 Subject: btf: Make btf_set_contains take a const pointer bsearch doesn't modify the contents of the array, so we can take a const pointer. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200921121227.255763-2-lmb@cloudflare.com --- include/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d7c5a6ed87e3..0478b20d335b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1905,6 +1905,6 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, void *addr1, void *addr2); struct btf_id_set; -bool btf_id_set_contains(struct btf_id_set *set, u32 id); +bool btf_id_set_contains(const struct btf_id_set *set, u32 id); #endif /* _LINUX_BPF_H */ -- cgit v1.2.3 From 27774b7073b5d520c80f1fcb8e9993fc139f21bd Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 21 Sep 2020 13:12:19 +0100 Subject: btf: Add BTF_ID_LIST_SINGLE macro Add a convenience macro that allows defining a BTF ID list with a single item. This lets us cut down on repetitive macros. Suggested-by: Andrii Nakryiko Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200921121227.255763-4-lmb@cloudflare.com --- include/linux/btf_ids.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index 210b086188a3..57890b357f85 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -76,6 +76,13 @@ extern u32 name[]; #define BTF_ID_LIST_GLOBAL(name) \ __BTF_ID_LIST(name, globl) +/* The BTF_ID_LIST_SINGLE macro defines a BTF_ID_LIST with + * a single entry. + */ +#define BTF_ID_LIST_SINGLE(name, prefix, typename) \ + BTF_ID_LIST(name) \ + BTF_ID(prefix, typename) + /* * The BTF_ID_UNUSED macro defines 4 zero bytes. * It's used when we want to define 'unused' entry @@ -140,6 +147,7 @@ extern struct btf_id_set name; #define BTF_ID(prefix, name) #define BTF_ID_UNUSED #define BTF_ID_LIST_GLOBAL(name) u32 name[1]; +#define BTF_ID_LIST_SINGLE(name, prefix, typename) static u32 name[1]; #define BTF_SET_START(name) static struct btf_id_set name = { 0 }; #define BTF_SET_START_GLOBAL(name) static struct btf_id_set name = { 0 }; #define BTF_SET_END(name) -- cgit v1.2.3 From 9436ef6e862b9ca22e5b12f87b106e07d5af4cae Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 21 Sep 2020 13:12:20 +0100 Subject: bpf: Allow specifying a BTF ID per argument in function protos Function prototypes using ARG_PTR_TO_BTF_ID currently use two ways to signal which BTF IDs are acceptable. First, bpf_func_proto.btf_id is an array of IDs, one for each argument. This array is only accessed up to the highest numbered argument that uses ARG_PTR_TO_BTF_ID and may therefore be less than five arguments long. It usually points at a BTF_ID_LIST. Second, check_btf_id is a function pointer that is called by the verifier if present. It gets the actual BTF ID of the register, and the argument number we're currently checking. It turns out that the only user check_arg_btf_id ignores the argument, and is simply used to check whether the BTF ID has a struct sock_common at it's start. Replace both of these mechanisms with an explicit BTF ID for each argument in a function proto. Thanks to btf_struct_ids_match this is very flexible: check_arg_btf_id can be replaced by requiring struct sock_common. Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200921121227.255763-5-lmb@cloudflare.com --- include/linux/bpf.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0478b20d335b..87b0d5dcc1ff 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -326,12 +326,16 @@ struct bpf_func_proto { }; enum bpf_arg_type arg_type[5]; }; - int *btf_id; /* BTF ids of arguments */ - bool (*check_btf_id)(u32 btf_id, u32 arg); /* if the argument btf_id is - * valid. Often used if more - * than one btf id is permitted - * for this argument. - */ + union { + struct { + u32 *arg1_btf_id; + u32 *arg2_btf_id; + u32 *arg3_btf_id; + u32 *arg4_btf_id; + u32 *arg5_btf_id; + }; + u32 *arg_btf_id[5]; + }; int *ret_btf_id; /* return value btf_id */ bool (*allowed)(const struct bpf_prog *prog); }; @@ -1385,8 +1389,6 @@ int btf_struct_access(struct bpf_verifier_log *log, u32 *next_btf_id); bool btf_struct_ids_match(struct bpf_verifier_log *log, int off, u32 id, u32 need_type_id); -int btf_resolve_helper_id(struct bpf_verifier_log *log, - const struct bpf_func_proto *fn, int); int btf_distill_func_proto(struct bpf_verifier_log *log, struct btf *btf, -- cgit v1.2.3 From f79e7ea571732a6e16f15c6e2f000c347e2d7431 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 21 Sep 2020 13:12:27 +0100 Subject: bpf: Use a table to drive helper arg type checks The mapping between bpf_arg_type and bpf_reg_type is encoded in a big hairy if statement that is hard to follow. The debug output also leaves to be desired: if a reg_type doesn't match we only print one of the options, instead printing all the valid ones. Convert the if statement into a table which is then used to drive type checking. If none of the reg_types match we print all options, e.g.: R2 type=rdonly_buf expected=fp, pkt, pkt_meta, map_value Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200921121227.255763-12-lmb@cloudflare.com --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 87b0d5dcc1ff..fc5c901c7542 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -292,6 +292,7 @@ enum bpf_arg_type { ARG_PTR_TO_ALLOC_MEM, /* pointer to dynamically allocated memory */ ARG_PTR_TO_ALLOC_MEM_OR_NULL, /* pointer to dynamically allocated memory or NULL */ ARG_CONST_ALLOC_SIZE_OR_ZERO, /* number of allocated bytes requested */ + __BPF_ARG_TYPE_MAX, }; /* type of values returned from helper functions */ -- cgit v1.2.3 From 8a8b9047a8975ad4bdee34b7affad66edfbe626d Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 16 Sep 2020 22:16:56 +0800 Subject: netfilter: nf_tables: Remove ununsed function nft_data_debug It is never used, so can be removed. Signed-off-by: YueHaibing Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 8ceca0e419b3..c4c526507ddb 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -148,13 +148,6 @@ static inline void nft_data_copy(u32 *dst, const struct nft_data *src, memcpy(dst, src, len); } -static inline void nft_data_debug(const struct nft_data *data) -{ - pr_debug("data[0]=%x data[1]=%x data[2]=%x data[3]=%x\n", - data->data[0], data->data[1], - data->data[2], data->data[3]); -} - /** * struct nft_ctx - nf_tables rule/set context * -- cgit v1.2.3 From 92ec804f3dbf0d986f8e10850bfff14f316d7aaf Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 21 Sep 2020 15:10:53 -0700 Subject: net: phy: bcm7xxx: Add an entry for BCM72113 BCM72113 features a 28nm integrated EPHY, add an entry to the driver for it. Signed-off-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/brcmphy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index 6ad4c000661a..d0bd226d6bd9 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -30,6 +30,7 @@ #define PHY_ID_BCM57780 0x03625d90 #define PHY_ID_BCM89610 0x03625cd0 +#define PHY_ID_BCM72113 0x35905310 #define PHY_ID_BCM7250 0xae025280 #define PHY_ID_BCM7255 0xae025120 #define PHY_ID_BCM7260 0xae025190 -- cgit v1.2.3 From e1ac11859a057ddcf7d6219bd090c7483541767d Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 22 Sep 2020 10:30:15 +0300 Subject: net: bridge: add src field to br_ip Add a new src field to struct br_ip which will be used to lookup S, G entries. When SSM option is added we will enable full br_ip lookups. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/if_bridge.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 6479a38e52fa..4fb9c4954f3a 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -18,6 +18,12 @@ struct br_ip { __be32 ip4; #if IS_ENABLED(CONFIG_IPV6) struct in6_addr ip6; +#endif + } src; + union { + __be32 ip4; +#if IS_ENABLED(CONFIG_IPV6) + struct in6_addr ip6; #endif } u; __be16 proto; -- cgit v1.2.3 From eab3227b1240bdcc06c0a01a3fc5bfd2bc12f406 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 22 Sep 2020 10:30:17 +0300 Subject: net: bridge: mcast: rename br_ip's u member to dst Since now we have src in br_ip, u no longer makes sense so rename it to dst. No functional changes. v2: fix build with CONFIG_BATMAN_ADV_MCAST CC: Marek Lindner CC: Simon Wunderlich CC: Antonio Quartulli CC: Sven Eckelmann CC: b.a.t.m.a.n@lists.open-mesh.org Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/if_bridge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 4fb9c4954f3a..556caed00258 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -25,7 +25,7 @@ struct br_ip { #if IS_ENABLED(CONFIG_IPV6) struct in6_addr ip6; #endif - } u; + } dst; __be16 proto; __u16 vid; }; -- cgit v1.2.3 From 9c4258c78a2a7624c79b797f40ae2dbfd2555e26 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 22 Sep 2020 10:30:18 +0300 Subject: net: bridge: mdb: add support to extend add/del commands Since the MDB add/del code expects an exact struct br_mdb_entry we can't really add any extensions, thus add a new nested attribute at the level of MDBA_SET_ENTRY called MDBA_SET_ENTRY_ATTRS which will be used to pass all new options via netlink attributes. This patch doesn't change anything functionally since the new attribute is not used yet, only parsed. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 75a2ac479247..dc52f8cffa0d 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -530,10 +530,22 @@ struct br_mdb_entry { enum { MDBA_SET_ENTRY_UNSPEC, MDBA_SET_ENTRY, + MDBA_SET_ENTRY_ATTRS, __MDBA_SET_ENTRY_MAX, }; #define MDBA_SET_ENTRY_MAX (__MDBA_SET_ENTRY_MAX - 1) +/* [MDBA_SET_ENTRY_ATTRS] = { + * [MDBE_ATTR_xxx] + * ... + * } + */ +enum { + MDBE_ATTR_UNSPEC, + __MDBE_ATTR_MAX, +}; +#define MDBE_ATTR_MAX (__MDBE_ATTR_MAX - 1) + /* Embedded inside LINK_XSTATS_TYPE_BRIDGE */ enum { BRIDGE_XSTATS_UNSPEC, -- cgit v1.2.3 From 88d4bd180419a7cde3947f191dc4e26fbb19f80b Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 22 Sep 2020 10:30:19 +0300 Subject: net: bridge: mdb: add support for add/del/dump of entries with source Add new mdb attributes (MDBE_ATTR_SOURCE for setting, MDBA_MDB_EATTR_SOURCE for dumping) to allow add/del and dump of mdb entries with a source address (S,G). New S,G entries are created with filter mode of MCAST_INCLUDE. The same attributes are used for IPv4 and IPv6, they're validated and parsed based on their protocol. S,G host joined entries which are added by user are not allowed yet. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index dc52f8cffa0d..3e6377c865eb 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -457,6 +457,7 @@ enum { MDBA_MDB_EATTR_TIMER, MDBA_MDB_EATTR_SRC_LIST, MDBA_MDB_EATTR_GROUP_MODE, + MDBA_MDB_EATTR_SOURCE, __MDBA_MDB_EATTR_MAX }; #define MDBA_MDB_EATTR_MAX (__MDBA_MDB_EATTR_MAX - 1) @@ -542,6 +543,7 @@ enum { */ enum { MDBE_ATTR_UNSPEC, + MDBE_ATTR_SOURCE, __MDBE_ATTR_MAX, }; #define MDBE_ATTR_MAX (__MDBE_ATTR_MAX - 1) -- cgit v1.2.3 From 8f8cb77e0b22d9044d8d57ab3bb18ea8d0474752 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 22 Sep 2020 10:30:21 +0300 Subject: net: bridge: mcast: add rt_protocol field to the port group struct We need to be able to differentiate between pg entries created by user-space and the kernel when we start generating S,G entries for IGMPv3/MLDv2's fast path. User-space entries are created by default as RTPROT_STATIC and the kernel entries are RTPROT_KERNEL. Later we can allow user-space to provide the entry rt_protocol so we can differentiate between who added the entries specifically (e.g. clag, admin, frr etc). Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 3e6377c865eb..1054f151078d 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -458,6 +458,7 @@ enum { MDBA_MDB_EATTR_SRC_LIST, MDBA_MDB_EATTR_GROUP_MODE, MDBA_MDB_EATTR_SOURCE, + MDBA_MDB_EATTR_RTPROT, __MDBA_MDB_EATTR_MAX }; #define MDBA_MDB_EATTR_MAX (__MDBA_MDB_EATTR_MAX - 1) -- cgit v1.2.3 From 8266a0491e92d39dc9af739e8380a0daa9b8836b Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 22 Sep 2020 10:30:24 +0300 Subject: net: bridge: mcast: handle port group filter modes We need to handle group filter mode transitions and initial state. To change a port group's INCLUDE -> EXCLUDE mode (or when we have added a new port group in EXCLUDE mode) we need to add that port to all of *,G ports' S,G entries for proper replication. When the EXCLUDE state is changed from IGMPv3 report, br_multicast_fwd_filter_exclude() must be called after the source list processing because the assumption is that all of the group's S,G entries will be created before transitioning to EXCLUDE mode, i.e. most importantly its blocked entries will already be added so it will not get automatically added to them. The transition EXCLUDE -> INCLUDE happens only when a port group timer expires, it requires us to remove that port from all of *,G ports' S,G entries where it was automatically added previously. Finally when we are adding a new S,G entry we must add all of *,G's EXCLUDE ports to it. In order to distinguish automatically added *,G EXCLUDE ports we have a new port group flag - MDB_PG_FLAGS_STAR_EXCL. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 1054f151078d..e4bd30a25f6b 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -518,6 +518,7 @@ struct br_mdb_entry { __u8 state; #define MDB_FLAGS_OFFLOAD (1 << 0) #define MDB_FLAGS_FAST_LEAVE (1 << 1) +#define MDB_FLAGS_STAR_EXCL (1 << 2) __u8 flags; __u16 vid; struct { -- cgit v1.2.3 From 9116ffbf1dd71f953ffda4198d01f82d3ca16df8 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 22 Sep 2020 10:30:25 +0300 Subject: net: bridge: mcast: add support for blocked port groups When excluding S,G entries we need a way to block a particular S,G,port. The new port group flag is managed based on the source's timer as per RFCs 3376 and 3810. When a source expires and its port group is in EXCLUDE mode, it will be blocked. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index e4bd30a25f6b..4c687686aa8f 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -519,6 +519,7 @@ struct br_mdb_entry { #define MDB_FLAGS_OFFLOAD (1 << 0) #define MDB_FLAGS_FAST_LEAVE (1 << 1) #define MDB_FLAGS_STAR_EXCL (1 << 2) +#define MDB_FLAGS_BLOCKED (1 << 3) __u8 flags; __u16 vid; struct { -- cgit v1.2.3 From 39097ab66dbe89b16438dd6d178d49e75cb922ed Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 23 Sep 2020 00:29:02 +0200 Subject: net: phy: Fixup kernel doc Add missing parameter documentation, or fixup wrong parameter names. Reviewed-by: Florian Fainelli Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/mdio.h | 3 ++- include/linux/phy.h | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 3a88b699b758..dbd69b3d170b 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -306,7 +306,7 @@ static inline u32 linkmode_adv_to_mii_10gbt_adv_t(unsigned long *advertising) /** * mii_10gbt_stat_mod_linkmode_lpa_t * @advertising: target the linkmode advertisement settings - * @adv: value of the C45 10GBASE-T AN STATUS register + * @lpa: value of the C45 10GBASE-T AN STATUS register * * A small helper function that translates C45 10GBASE-T AN STATUS register bits * to linkmode advertisement settings. Other bits in advertising aren't changed. @@ -371,6 +371,7 @@ struct phy_device *mdiobus_get_phy(struct mii_bus *bus, int addr); /** * mdio_module_driver() - Helper macro for registering mdio drivers + * @_mdio_driver: driver to register * * Helper macro for MDIO drivers which do not do anything special in module * init/exit. Each module may only use this macro once, and calling it diff --git a/include/linux/phy.h b/include/linux/phy.h index 3a09d2bf69ea..4901b2637059 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1566,8 +1566,9 @@ static inline int mdiobus_register_board_info(const struct mdio_board_info *i, /** - * module_phy_driver() - Helper macro for registering PHY drivers + * phy_module_driver() - Helper macro for registering PHY drivers * @__phy_drivers: array of PHY drivers to register + * @__count: Numbers of members in array * * Helper macro for PHY drivers which do not do anything special in module * init/exit. Each module may only use this macro once, and calling it -- cgit v1.2.3 From 4069a572d423b73919ae40a500dfc4b10f8a6f32 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Wed, 23 Sep 2020 00:29:03 +0200 Subject: net: phy: Document core PHY structures Add kerneldoc for the core PHY data structures, a few inline functions and exported functions which are not already documented. v2 Typos g/phy/PHY/s Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/phy.h | 423 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 292 insertions(+), 131 deletions(-) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 4901b2637059..eb3cb1a98b45 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -82,7 +82,39 @@ extern const int phy_10gbit_features_array[1]; #define PHY_POLL_CABLE_TEST 0x00000004 #define MDIO_DEVICE_IS_PHY 0x80000000 -/* Interface Mode definitions */ +/** + * enum phy_interface_t - Interface Mode definitions + * + * @PHY_INTERFACE_MODE_NA: Not Applicable - don't touch + * @PHY_INTERFACE_MODE_INTERNAL: No interface, MAC and PHY combined + * @PHY_INTERFACE_MODE_MII: Median-independent interface + * @PHY_INTERFACE_MODE_GMII: Gigabit median-independent interface + * @PHY_INTERFACE_MODE_SGMII: Serial gigabit media-independent interface + * @PHY_INTERFACE_MODE_TBI: Ten Bit Interface + * @PHY_INTERFACE_MODE_REVMII: Reverse Media Independent Interface + * @PHY_INTERFACE_MODE_RMII: Reduced Media Independent Interface + * @PHY_INTERFACE_MODE_RGMII: Reduced gigabit media-independent interface + * @PHY_INTERFACE_MODE_RGMII_ID: RGMII with Internal RX+TX delay + * @PHY_INTERFACE_MODE_RGMII_RXID: RGMII with Internal RX delay + * @PHY_INTERFACE_MODE_RGMII_TXID: RGMII with Internal RX delay + * @PHY_INTERFACE_MODE_RTBI: Reduced TBI + * @PHY_INTERFACE_MODE_SMII: ??? MII + * @PHY_INTERFACE_MODE_XGMII: 10 gigabit media-independent interface + * @PHY_INTERFACE_MODE_XLGMII:40 gigabit media-independent interface + * @PHY_INTERFACE_MODE_MOCA: Multimedia over Coax + * @PHY_INTERFACE_MODE_QSGMII: Quad SGMII + * @PHY_INTERFACE_MODE_TRGMII: Turbo RGMII + * @PHY_INTERFACE_MODE_1000BASEX: 1000 BaseX + * @PHY_INTERFACE_MODE_2500BASEX: 2500 BaseX + * @PHY_INTERFACE_MODE_RXAUI: Reduced XAUI + * @PHY_INTERFACE_MODE_XAUI: 10 Gigabit Attachment Unit Interface + * @PHY_INTERFACE_MODE_10GBASER: 10G BaseR + * @PHY_INTERFACE_MODE_USXGMII: Universal Serial 10GE MII + * @PHY_INTERFACE_MODE_10GKR: 10GBASE-KR - with Clause 73 AN + * @PHY_INTERFACE_MODE_MAX: Book keeping + * + * Describes the interface between the MAC and PHY. + */ typedef enum { PHY_INTERFACE_MODE_NA, PHY_INTERFACE_MODE_INTERNAL, @@ -116,8 +148,8 @@ typedef enum { } phy_interface_t; /** - * phy_supported_speeds - return all speeds currently supported by a phy device - * @phy: The phy device to return supported speeds of. + * phy_supported_speeds - return all speeds currently supported by a PHY device + * @phy: The PHY device to return supported speeds of. * @speeds: buffer to store supported speeds in. * @size: size of speeds buffer. * @@ -134,9 +166,9 @@ unsigned int phy_supported_speeds(struct phy_device *phy, * phy_modes - map phy_interface_t enum to device tree binding of phy-mode * @interface: enum phy_interface_t value * - * Description: maps 'enum phy_interface_t' defined in this file + * Description: maps enum &phy_interface_t defined in this file * into the device tree binding of 'phy-mode', so that Ethernet - * device driver can get phy interface from device tree. + * device driver can get PHY interface from device tree. */ static inline const char *phy_modes(phy_interface_t interface) { @@ -215,6 +247,14 @@ struct sfp_bus; struct sfp_upstream_ops; struct sk_buff; +/** + * struct mdio_bus_stats - Statistics counters for MDIO busses + * @transfers: Total number of transfers, i.e. @writes + @reads + * @errors: Number of MDIO transfers that returned an error + * @writes: Number of write transfers + * @reads: Number of read transfers + * @syncp: Synchronisation for incrementing statistics + */ struct mdio_bus_stats { u64_stats_t transfers; u64_stats_t errors; @@ -224,7 +264,15 @@ struct mdio_bus_stats { struct u64_stats_sync syncp; }; -/* Represents a shared structure between different phydev's in the same +/** + * struct phy_package_shared - Shared information in PHY packages + * @addr: Common PHY address used to combine PHYs in one package + * @refcnt: Number of PHYs connected to this shared data + * @flags: Initialization of PHY package + * @priv_size: Size of the shared private data @priv + * @priv: Driver private data shared across a PHY package + * + * Represents a shared structure between different phydev's in the same * package, for example a quad PHY. See phy_package_join() and * phy_package_leave(). */ @@ -247,7 +295,14 @@ struct phy_package_shared { #define PHY_SHARED_F_INIT_DONE 0 #define PHY_SHARED_F_PROBE_DONE 1 -/* +/** + * struct mii_bus - Represents an MDIO bus + * + * @owner: Who owns this device + * @name: User friendly name for this MDIO device, or driver name + * @id: Unique identifier for this bus, typical from bus hierarchy + * @priv: Driver private data + * * The Bus class for PHYs. Devices which provide access to * PHYs should register using this structure */ @@ -256,49 +311,58 @@ struct mii_bus { const char *name; char id[MII_BUS_ID_SIZE]; void *priv; + /** @read: Perform a read transfer on the bus */ int (*read)(struct mii_bus *bus, int addr, int regnum); + /** @write: Perform a write transfer on the bus */ int (*write)(struct mii_bus *bus, int addr, int regnum, u16 val); + /** @reset: Perform a reset of the bus */ int (*reset)(struct mii_bus *bus); + + /** @stats: Statistic counters per device on the bus */ struct mdio_bus_stats stats[PHY_MAX_ADDR]; - /* - * A lock to ensure that only one thing can read/write + /** + * @mdio_lock: A lock to ensure that only one thing can read/write * the MDIO bus at a time */ struct mutex mdio_lock; + /** @parent: Parent device of this bus */ struct device *parent; + /** @state: State of bus structure */ enum { MDIOBUS_ALLOCATED = 1, MDIOBUS_REGISTERED, MDIOBUS_UNREGISTERED, MDIOBUS_RELEASED, } state; + + /** @dev: Kernel device representation */ struct device dev; - /* list of all PHYs on bus */ + /** @mdio_map: list of all MDIO devices on bus */ struct mdio_device *mdio_map[PHY_MAX_ADDR]; - /* PHY addresses to be ignored when probing */ + /** @phy_mask: PHY addresses to be ignored when probing */ u32 phy_mask; - /* PHY addresses to ignore the TA/read failure */ + /** @phy_ignore_ta_mask: PHY addresses to ignore the TA/read failure */ u32 phy_ignore_ta_mask; - /* - * An array of interrupts, each PHY's interrupt at the index + /** + * @irq: An array of interrupts, each PHY's interrupt at the index * matching its address */ int irq[PHY_MAX_ADDR]; - /* GPIO reset pulse width in microseconds */ + /** @reset_delay_us: GPIO reset pulse width in microseconds */ int reset_delay_us; - /* GPIO reset deassert delay in microseconds */ + /** @reset_post_delay_us: GPIO reset deassert delay in microseconds */ int reset_post_delay_us; - /* RESET GPIO descriptor pointer */ + /** @reset_gpiod: Reset GPIO descriptor pointer */ struct gpio_desc *reset_gpiod; - /* bus capabilities, used for probing */ + /** @probe_capabilities: bus capabilities, used for probing */ enum { MDIOBUS_NO_CAP = 0, MDIOBUS_C22, @@ -306,15 +370,22 @@ struct mii_bus { MDIOBUS_C22_C45, } probe_capabilities; - /* protect access to the shared element */ + /** @shared_lock: protect access to the shared element */ struct mutex shared_lock; - /* shared state across different PHYs */ + /** @shared: shared state across different PHYs */ struct phy_package_shared *shared[PHY_MAX_ADDR]; }; #define to_mii_bus(d) container_of(d, struct mii_bus, dev) -struct mii_bus *mdiobus_alloc_size(size_t); +struct mii_bus *mdiobus_alloc_size(size_t size); + +/** + * mdiobus_alloc - Allocate an MDIO bus structure + * + * The internal state of the MDIO bus will be set of MDIOBUS_ALLOCATED ready + * for the driver to register the bus. + */ static inline struct mii_bus *mdiobus_alloc(void) { return mdiobus_alloc_size(0); @@ -341,40 +412,41 @@ struct phy_device *mdiobus_scan(struct mii_bus *bus, int addr); #define PHY_INTERRUPT_DISABLED false #define PHY_INTERRUPT_ENABLED true -/* PHY state machine states: +/** + * enum phy_state - PHY state machine states: * - * DOWN: PHY device and driver are not ready for anything. probe + * @PHY_DOWN: PHY device and driver are not ready for anything. probe * should be called if and only if the PHY is in this state, * given that the PHY device exists. - * - PHY driver probe function will set the state to READY + * - PHY driver probe function will set the state to @PHY_READY * - * READY: PHY is ready to send and receive packets, but the + * @PHY_READY: PHY is ready to send and receive packets, but the * controller is not. By default, PHYs which do not implement * probe will be set to this state by phy_probe(). * - start will set the state to UP * - * UP: The PHY and attached device are ready to do work. + * @PHY_UP: The PHY and attached device are ready to do work. * Interrupts should be started here. - * - timer moves to NOLINK or RUNNING + * - timer moves to @PHY_NOLINK or @PHY_RUNNING * - * NOLINK: PHY is up, but not currently plugged in. - * - irq or timer will set RUNNING if link comes back - * - phy_stop moves to HALTED + * @PHY_NOLINK: PHY is up, but not currently plugged in. + * - irq or timer will set @PHY_RUNNING if link comes back + * - phy_stop moves to @PHY_HALTED * - * RUNNING: PHY is currently up, running, and possibly sending + * @PHY_RUNNING: PHY is currently up, running, and possibly sending * and/or receiving packets - * - irq or timer will set NOLINK if link goes down - * - phy_stop moves to HALTED + * - irq or timer will set @PHY_NOLINK if link goes down + * - phy_stop moves to @PHY_HALTED * - * CABLETEST: PHY is performing a cable test. Packet reception/sending + * @PHY_CABLETEST: PHY is performing a cable test. Packet reception/sending * is not expected to work, carrier will be indicated as down. PHY will be * poll once per second, or on interrupt for it current state. * Once complete, move to UP to restart the PHY. - * - phy_stop aborts the running test and moves to HALTED + * - phy_stop aborts the running test and moves to @PHY_HALTED * - * HALTED: PHY is up, but no polling or interrupts are done. Or + * @PHY_HALTED: PHY is up, but no polling or interrupts are done. Or * PHY is in an error state. - * - phy_start moves to UP + * - phy_start moves to @PHY_UP */ enum phy_state { PHY_DOWN = 0, @@ -403,34 +475,67 @@ struct phy_c45_device_ids { struct macsec_context; struct macsec_ops; -/* phy_device: An instance of a PHY +/** + * struct phy_device - An instance of a PHY * - * drv: Pointer to the driver for this PHY instance - * phy_id: UID for this device found during discovery - * c45_ids: 802.3-c45 Device Identifers if is_c45. - * is_c45: Set to true if this phy uses clause 45 addressing. - * is_internal: Set to true if this phy is internal to a MAC. - * is_pseudo_fixed_link: Set to true if this phy is an Ethernet switch, etc. - * is_gigabit_capable: Set to true if PHY supports 1000Mbps - * has_fixups: Set to true if this phy has fixups/quirks. - * suspended: Set to true if this phy has been suspended successfully. - * suspended_by_mdio_bus: Set to true if this phy was suspended by MDIO bus. - * sysfs_links: Internal boolean tracking sysfs symbolic links setup/removal. - * loopback_enabled: Set true if this phy has been loopbacked successfully. - * downshifted_rate: Set true if link speed has been downshifted. - * state: state of the PHY for management purposes - * dev_flags: Device-specific flags used by the PHY driver. - * irq: IRQ number of the PHY's interrupt (-1 if none) - * phy_timer: The timer for handling the state machine - * sfp_bus_attached: flag indicating whether the SFP bus has been attached - * sfp_bus: SFP bus attached to this PHY's fiber port - * attached_dev: The attached enet driver's device instance ptr - * adjust_link: Callback for the enet controller to respond to - * changes in the link state. - * macsec_ops: MACsec offloading ops. + * @mdio: MDIO bus this PHY is on + * @drv: Pointer to the driver for this PHY instance + * @phy_id: UID for this device found during discovery + * @c45_ids: 802.3-c45 Device Identifiers if is_c45. + * @is_c45: Set to true if this PHY uses clause 45 addressing. + * @is_internal: Set to true if this PHY is internal to a MAC. + * @is_pseudo_fixed_link: Set to true if this PHY is an Ethernet switch, etc. + * @is_gigabit_capable: Set to true if PHY supports 1000Mbps + * @has_fixups: Set to true if this PHY has fixups/quirks. + * @suspended: Set to true if this PHY has been suspended successfully. + * @suspended_by_mdio_bus: Set to true if this PHY was suspended by MDIO bus. + * @sysfs_links: Internal boolean tracking sysfs symbolic links setup/removal. + * @loopback_enabled: Set true if this PHY has been loopbacked successfully. + * @downshifted_rate: Set true if link speed has been downshifted. + * @state: State of the PHY for management purposes + * @dev_flags: Device-specific flags used by the PHY driver. + * @irq: IRQ number of the PHY's interrupt (-1 if none) + * @phy_timer: The timer for handling the state machine + * @phylink: Pointer to phylink instance for this PHY + * @sfp_bus_attached: Flag indicating whether the SFP bus has been attached + * @sfp_bus: SFP bus attached to this PHY's fiber port + * @attached_dev: The attached enet driver's device instance ptr + * @adjust_link: Callback for the enet controller to respond to changes: in the + * link state. + * @phy_link_change: Callback for phylink for notification of link change + * @macsec_ops: MACsec offloading ops. * - * speed, duplex, pause, supported, advertising, lp_advertising, - * and autoneg are used like in mii_if_info + * @speed: Current link speed + * @duplex: Current duplex + * @pause: Current pause + * @asym_pause: Current asymmetric pause + * @supported: Combined MAC/PHY supported linkmodes + * @advertising: Currently advertised linkmodes + * @adv_old: Saved advertised while power saving for WoL + * @lp_advertising: Current link partner advertised linkmodes + * @eee_broken_modes: Energy efficient ethernet modes which should be prohibited + * @autoneg: Flag autoneg being used + * @link: Current link state + * @autoneg_complete: Flag auto negotiation of the link has completed + * @mdix: Current crossover + * @mdix_ctrl: User setting of crossover + * @interrupts: Flag interrupts have been enabled + * @interface: enum phy_interface_t value + * @skb: Netlink message for cable diagnostics + * @nest: Netlink nest used for cable diagnostics + * @ehdr: nNtlink header for cable diagnostics + * @phy_led_triggers: Array of LED triggers + * @phy_num_led_triggers: Number of triggers in @phy_led_triggers + * @led_link_trigger: LED trigger for link up/down + * @last_triggered: last LED trigger for link speed + * @master_slave_set: User requested master/slave configuration + * @master_slave_get: Current master/slave advertisement + * @master_slave_state: Current master/slave configuration + * @mii_ts: Pointer to time stamper callbacks + * @lock: Mutex for serialization access to PHY + * @state_queue: Work queue for state machine + * @shared: Pointer to private data shared by phys in one package + * @priv: Pointer to driver private data * * interrupts currently only supports enabled or disabled, * but could be changed in the future to support enabling @@ -550,9 +655,18 @@ struct phy_device { #define to_phy_device(d) container_of(to_mdio_device(d), \ struct phy_device, mdio) -/* A structure containing possible configuration parameters +/** + * struct phy_tdr_config - Configuration of a TDR raw test + * + * @first: Distance for first data collection point + * @last: Distance for last data collection point + * @step: Step between data collection points + * @pair: Bitmap of cable pairs to collect data for + * + * A structure containing possible configuration parameters * for a TDR cable test. The driver does not need to implement * all the parameters, but should report what is actually used. + * All distances are in centimeters. */ struct phy_tdr_config { u32 first; @@ -562,18 +676,20 @@ struct phy_tdr_config { }; #define PHY_PAIR_ALL -1 -/* struct phy_driver: Driver structure for a particular PHY type +/** + * struct phy_driver - Driver structure for a particular PHY type * - * driver_data: static driver data - * phy_id: The result of reading the UID registers of this PHY + * @mdiodrv: Data common to all MDIO devices + * @phy_id: The result of reading the UID registers of this PHY * type, and ANDing them with the phy_id_mask. This driver * only works for PHYs with IDs which match this field - * name: The friendly name of this PHY type - * phy_id_mask: Defines the important bits of the phy_id - * features: A mandatory list of features (speed, duplex, etc) + * @name: The friendly name of this PHY type + * @phy_id_mask: Defines the important bits of the phy_id + * @features: A mandatory list of features (speed, duplex, etc) * supported by this PHY - * flags: A bitfield defining certain other features this PHY + * @flags: A bitfield defining certain other features this PHY * supports (like interrupts) + * @driver_data: Static driver data * * All functions are optional. If config_aneg or read_status * are not implemented, the phy core uses the genphy versions. @@ -592,151 +708,178 @@ struct phy_driver { u32 flags; const void *driver_data; - /* - * Called to issue a PHY software reset + /** + * @soft_reset: Called to issue a PHY software reset */ int (*soft_reset)(struct phy_device *phydev); - /* - * Called to initialize the PHY, + /** + * @config_init: Called to initialize the PHY, * including after a reset */ int (*config_init)(struct phy_device *phydev); - /* - * Called during discovery. Used to set + /** + * @probe: Called during discovery. Used to set * up device-specific structures, if any */ int (*probe)(struct phy_device *phydev); - /* - * Probe the hardware to determine what abilities it has. - * Should only set phydev->supported. + /** + * @get_features: Probe the hardware to determine what + * abilities it has. Should only set phydev->supported. */ int (*get_features)(struct phy_device *phydev); /* PHY Power Management */ + /** @suspend: Suspend the hardware, saving state if needed */ int (*suspend)(struct phy_device *phydev); + /** @resume: Resume the hardware, restoring state if needed */ int (*resume)(struct phy_device *phydev); - /* - * Configures the advertisement and resets + /** + * @config_aneg: Configures the advertisement and resets * autonegotiation if phydev->autoneg is on, * forces the speed to the current settings in phydev * if phydev->autoneg is off */ int (*config_aneg)(struct phy_device *phydev); - /* Determines the auto negotiation result */ + /** @aneg_done: Determines the auto negotiation result */ int (*aneg_done)(struct phy_device *phydev); - /* Determines the negotiated speed and duplex */ + /** @read_status: Determines the negotiated speed and duplex */ int (*read_status)(struct phy_device *phydev); - /* Clears any pending interrupts */ + /** @ack_interrupt: Clears any pending interrupts */ int (*ack_interrupt)(struct phy_device *phydev); - /* Enables or disables interrupts */ + /** @config_intr: Enables or disables interrupts */ int (*config_intr)(struct phy_device *phydev); - /* - * Checks if the PHY generated an interrupt. + /** + * @did_interrupt: Checks if the PHY generated an interrupt. * For multi-PHY devices with shared PHY interrupt pin * Set interrupt bits have to be cleared. */ int (*did_interrupt)(struct phy_device *phydev); - /* Override default interrupt handling */ + /** @handle_interrupt: Override default interrupt handling */ irqreturn_t (*handle_interrupt)(struct phy_device *phydev); - /* Clears up any memory if needed */ + /** @remove: Clears up any memory if needed */ void (*remove)(struct phy_device *phydev); - /* Returns true if this is a suitable driver for the given - * phydev. If NULL, matching is based on phy_id and - * phy_id_mask. + /** + * @match_phy_device: Returns true if this is a suitable + * driver for the given phydev. If NULL, matching is based on + * phy_id and phy_id_mask. */ int (*match_phy_device)(struct phy_device *phydev); - /* Some devices (e.g. qnap TS-119P II) require PHY register changes to - * enable Wake on LAN, so set_wol is provided to be called in the - * ethernet driver's set_wol function. */ + /** + * @set_wol: Some devices (e.g. qnap TS-119P II) require PHY + * register changes to enable Wake on LAN, so set_wol is + * provided to be called in the ethernet driver's set_wol + * function. + */ int (*set_wol)(struct phy_device *dev, struct ethtool_wolinfo *wol); - /* See set_wol, but for checking whether Wake on LAN is enabled. */ + /** + * @get_wol: See set_wol, but for checking whether Wake on LAN + * is enabled. + */ void (*get_wol)(struct phy_device *dev, struct ethtool_wolinfo *wol); - /* - * Called to inform a PHY device driver when the core is about to - * change the link state. This callback is supposed to be used as - * fixup hook for drivers that need to take action when the link - * state changes. Drivers are by no means allowed to mess with the + /** + * @link_change_notify: Called to inform a PHY device driver + * when the core is about to change the link state. This + * callback is supposed to be used as fixup hook for drivers + * that need to take action when the link state + * changes. Drivers are by no means allowed to mess with the * PHY device structure in their implementations. */ void (*link_change_notify)(struct phy_device *dev); - /* - * Phy specific driver override for reading a MMD register. - * This function is optional for PHY specific drivers. When - * not provided, the default MMD read function will be used - * by phy_read_mmd(), which will use either a direct read for - * Clause 45 PHYs or an indirect read for Clause 22 PHYs. - * devnum is the MMD device number within the PHY device, - * regnum is the register within the selected MMD device. + /** + * @read_mmd: PHY specific driver override for reading a MMD + * register. This function is optional for PHY specific + * drivers. When not provided, the default MMD read function + * will be used by phy_read_mmd(), which will use either a + * direct read for Clause 45 PHYs or an indirect read for + * Clause 22 PHYs. devnum is the MMD device number within the + * PHY device, regnum is the register within the selected MMD + * device. */ int (*read_mmd)(struct phy_device *dev, int devnum, u16 regnum); - /* - * Phy specific driver override for writing a MMD register. - * This function is optional for PHY specific drivers. When - * not provided, the default MMD write function will be used - * by phy_write_mmd(), which will use either a direct write for - * Clause 45 PHYs, or an indirect write for Clause 22 PHYs. - * devnum is the MMD device number within the PHY device, - * regnum is the register within the selected MMD device. - * val is the value to be written. + /** + * @write_mmd: PHY specific driver override for writing a MMD + * register. This function is optional for PHY specific + * drivers. When not provided, the default MMD write function + * will be used by phy_write_mmd(), which will use either a + * direct write for Clause 45 PHYs, or an indirect write for + * Clause 22 PHYs. devnum is the MMD device number within the + * PHY device, regnum is the register within the selected MMD + * device. val is the value to be written. */ int (*write_mmd)(struct phy_device *dev, int devnum, u16 regnum, u16 val); + /** @read_page: Return the current PHY register page number */ int (*read_page)(struct phy_device *dev); + /** @write_page: Set the current PHY register page number */ int (*write_page)(struct phy_device *dev, int page); - /* Get the size and type of the eeprom contained within a plug-in - * module */ + /** + * @module_info: Get the size and type of the eeprom contained + * within a plug-in module + */ int (*module_info)(struct phy_device *dev, struct ethtool_modinfo *modinfo); - /* Get the eeprom information from the plug-in module */ + /** + * @module_eeprom: Get the eeprom information from the plug-in + * module + */ int (*module_eeprom)(struct phy_device *dev, struct ethtool_eeprom *ee, u8 *data); - /* Start a cable test */ + /** @cable_test_start: Start a cable test */ int (*cable_test_start)(struct phy_device *dev); - /* Start a raw TDR cable test */ + /** @cable_test_tdr_start: Start a raw TDR cable test */ int (*cable_test_tdr_start)(struct phy_device *dev, const struct phy_tdr_config *config); - /* Once per second, or on interrupt, request the status of the - * test. + /** + * @cable_test_get_status: Once per second, or on interrupt, + * request the status of the test. */ int (*cable_test_get_status)(struct phy_device *dev, bool *finished); - /* Get statistics from the phy using ethtool */ + /* Get statistics from the PHY using ethtool */ + /** @get_sset_count: Number of statistic counters */ int (*get_sset_count)(struct phy_device *dev); + /** @get_strings: Names of the statistic counters */ void (*get_strings)(struct phy_device *dev, u8 *data); + /** @get_stats: Return the statistic counter values */ void (*get_stats)(struct phy_device *dev, struct ethtool_stats *stats, u64 *data); /* Get and Set PHY tunables */ + /** @get_tunable: Return the value of a tunable */ int (*get_tunable)(struct phy_device *dev, struct ethtool_tunable *tuna, void *data); + /** @set_tunable: Set the value of a tunable */ int (*set_tunable)(struct phy_device *dev, struct ethtool_tunable *tuna, const void *data); + /** @set_loopback: Set the loopback mood of the PHY */ int (*set_loopback)(struct phy_device *dev, bool enable); + /** @get_sqi: Get the signal quality indication */ int (*get_sqi)(struct phy_device *dev); + /** @get_sqi_max: Get the maximum signal quality indication */ int (*get_sqi_max)(struct phy_device *dev); }; #define to_phy_driver(d) container_of(to_mdio_common_driver(d), \ @@ -890,6 +1033,24 @@ static inline int __phy_modify_changed(struct phy_device *phydev, u32 regnum, */ int phy_read_mmd(struct phy_device *phydev, int devad, u32 regnum); +/** + * phy_read_mmd_poll_timeout - Periodically poll a PHY register until a + * condition is met or a timeout occurs + * + * @phydev: The phy_device struct + * @devaddr: The MMD to read from + * @regnum: The register on the MMD to read + * @val: Variable to read the register into + * @cond: Break condition (usually involving @val) + * @sleep_us: Maximum time to sleep between reads in us (0 + * tight-loops). Should be less than ~20ms since usleep_range + * is used (see Documentation/timers/timers-howto.rst). + * @timeout_us: Timeout in us, 0 means never timeout + * @sleep_before_read: if it is true, sleep @sleep_us before read. + * Returns 0 on success and -ETIMEDOUT upon a timeout. In either + * case, the last read value at @args is stored in @val. Must not + * be called from atomic context if sleep_us or timeout_us are used. + */ #define phy_read_mmd_poll_timeout(phydev, devaddr, regnum, val, cond, \ sleep_us, timeout_us, sleep_before_read) \ ({ \ @@ -1161,7 +1322,7 @@ static inline bool phy_is_internal(struct phy_device *phydev) /** * phy_interface_mode_is_rgmii - Convenience function for testing if a * PHY interface mode is RGMII (all variants) - * @mode: the phy_interface_t enum + * @mode: the &phy_interface_t enum */ static inline bool phy_interface_mode_is_rgmii(phy_interface_t mode) { @@ -1170,11 +1331,11 @@ static inline bool phy_interface_mode_is_rgmii(phy_interface_t mode) }; /** - * phy_interface_mode_is_8023z() - does the phy interface mode use 802.3z + * phy_interface_mode_is_8023z() - does the PHY interface mode use 802.3z * negotiation * @mode: one of &enum phy_interface_t * - * Returns true if the phy interface mode uses the 16-bit negotiation + * Returns true if the PHY interface mode uses the 16-bit negotiation * word as defined in 802.3z. (See 802.3-2015 37.2.1 Config_Reg encoding) */ static inline bool phy_interface_mode_is_8023z(phy_interface_t mode) @@ -1193,7 +1354,7 @@ static inline bool phy_interface_is_rgmii(struct phy_device *phydev) return phy_interface_mode_is_rgmii(phydev->interface); }; -/* +/** * phy_is_pseudo_fixed_link - Convenience function for testing if this * PHY is the CPU port facing side of an Ethernet switch, or similar. * @phydev: the phy_device struct -- cgit v1.2.3 From c9e7c76d70fa50582ca96759829c93d0dd024662 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 21 Sep 2020 15:36:51 +0100 Subject: xfrm: Provide API to register translator module Add a skeleton for xfrm_compat module and provide API to register it in xfrm_state.ko. struct xfrm_translator will have function pointers to translate messages received from 32-bit userspace or to be sent to it from 64-bit kernel. module_get()/module_put() are used instead of rcu_read_lock() as the module will vmalloc() memory for translation. The new API is registered with xfrm_state module, not with xfrm_user as the former needs translator for user_policy set by setsockopt() and xfrm_user already uses functions from xfrm_state. Signed-off-by: Dmitry Safonov Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 2737d24ec244..fe2e3717da14 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -2000,6 +2000,25 @@ static inline int xfrm_tunnel_check(struct sk_buff *skb, struct xfrm_state *x, return 0; } +struct xfrm_translator { + struct module *owner; +}; + +#if IS_ENABLED(CONFIG_XFRM_USER_COMPAT) +extern int xfrm_register_translator(struct xfrm_translator *xtr); +extern int xfrm_unregister_translator(struct xfrm_translator *xtr); +extern struct xfrm_translator *xfrm_get_translator(void); +extern void xfrm_put_translator(struct xfrm_translator *xtr); +#else +static inline struct xfrm_translator *xfrm_get_translator(void) +{ + return NULL; +} +static inline void xfrm_put_translator(struct xfrm_translator *xtr) +{ +} +#endif + #if IS_ENABLED(CONFIG_IPV6) static inline bool xfrm6_local_dontfrag(const struct sock *sk) { -- cgit v1.2.3 From 5461fc0c8d9f23956b99f5907f69726a293ccb67 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 21 Sep 2020 15:36:52 +0100 Subject: xfrm/compat: Add 64=>32-bit messages translator Provide the kernel-to-user translator under XFRM_USER_COMPAT, that creates for 64-bit xfrm-user message a 32-bit translation and puts it in skb's frag_list. net/compat.c layer provides MSG_CMSG_COMPAT to decide if the message should be taken from skb or frag_list. (used by wext-core which has also an ABI difference) Kernel sends 64-bit xfrm messages to the userspace for: - multicast (monitor events) - netlink dumps Wire up the translator to xfrm_nlmsg_multicast(). Signed-off-by: Dmitry Safonov Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index fe2e3717da14..5b6cc62c9354 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -2000,7 +2000,12 @@ static inline int xfrm_tunnel_check(struct sk_buff *skb, struct xfrm_state *x, return 0; } +extern const int xfrm_msg_min[XFRM_NR_MSGTYPES]; + struct xfrm_translator { + /* Allocate frag_list and put compat translation there */ + int (*alloc_compat)(struct sk_buff *skb, const struct nlmsghdr *src); + struct module *owner; }; -- cgit v1.2.3 From 5106f4a8acff480e244300bc5097c0ad7048c3a2 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 21 Sep 2020 15:36:55 +0100 Subject: xfrm/compat: Add 32=>64-bit messages translator Provide the user-to-kernel translator under XFRM_USER_COMPAT, that creates for 32-bit xfrm-user message a 64-bit translation. The translation is afterwards reused by xfrm_user code just as if userspace had sent 64-bit message. Signed-off-by: Dmitry Safonov Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 5b6cc62c9354..fa18cb6bb3f7 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -2001,11 +2001,17 @@ static inline int xfrm_tunnel_check(struct sk_buff *skb, struct xfrm_state *x, } extern const int xfrm_msg_min[XFRM_NR_MSGTYPES]; +extern const struct nla_policy xfrma_policy[XFRMA_MAX+1]; struct xfrm_translator { /* Allocate frag_list and put compat translation there */ int (*alloc_compat)(struct sk_buff *skb, const struct nlmsghdr *src); + /* Allocate nlmsg with 64-bit translaton of received 32-bit message */ + struct nlmsghdr *(*rcv_msg_compat)(const struct nlmsghdr *nlh, + int maxtype, const struct nla_policy *policy, + struct netlink_ext_ack *extack); + struct module *owner; }; -- cgit v1.2.3 From 96392ee5a13b992563cfe07d23ee30d333b89126 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 21 Sep 2020 15:36:56 +0100 Subject: xfrm/compat: Translate 32-bit user_policy from sockptr Provide compat_xfrm_userpolicy_info translation for xfrm setsocketopt(). Reallocate buffer and put the missing padding for 64-bit message. Signed-off-by: Dmitry Safonov Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index fa18cb6bb3f7..53618a31634b 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -2012,6 +2012,9 @@ struct xfrm_translator { int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack); + /* Translate 32-bit user_policy from sockptr */ + int (*xlate_user_policy_sockptr)(u8 **pdata32, int optlen); + struct module *owner; }; -- cgit v1.2.3 From e2f9a8fe73d3a29edfdb4215e7596c95b6db362d Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 23 Sep 2020 14:24:20 +0300 Subject: net: mscc: ocelot: always pass skb clone to ocelot_port_add_txtstamp_skb Currently, ocelot switchdev passes the skb directly to the function that enqueues it to the list of skb's awaiting a TX timestamp. Whereas the felix DSA driver first clones the skb, then passes the clone to this queue. This matters because in the case of felix, the common IRQ handler, which is ocelot_get_txtstamp(), currently clones the clone, and frees the original clone. This is useless and can be simplified by using skb_complete_tx_timestamp() instead of skb_tstamp_tx(). Signed-off-by: Vladimir Oltean Acked-by: Richard Cochran Signed-off-by: David S. Miller --- include/soc/mscc/ocelot.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 0ac4e7fba086..3105bbb6cdcf 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -710,8 +710,8 @@ int ocelot_vlan_add(struct ocelot *ocelot, int port, u16 vid, bool pvid, int ocelot_vlan_del(struct ocelot *ocelot, int port, u16 vid); int ocelot_hwstamp_get(struct ocelot *ocelot, int port, struct ifreq *ifr); int ocelot_hwstamp_set(struct ocelot *ocelot, int port, struct ifreq *ifr); -int ocelot_port_add_txtstamp_skb(struct ocelot_port *ocelot_port, - struct sk_buff *skb); +void ocelot_port_add_txtstamp_skb(struct ocelot *ocelot, int port, + struct sk_buff *clone); void ocelot_get_txtstamp(struct ocelot *ocelot); void ocelot_port_set_maxlen(struct ocelot *ocelot, int port, size_t sdu); int ocelot_get_max_mtu(struct ocelot *ocelot, int port); -- cgit v1.2.3 From b5b6775d72e8662b5ea7f892bf06db4831e2c1aa Mon Sep 17 00:00:00 2001 From: Russell King Date: Wed, 23 Sep 2020 18:41:22 +0300 Subject: of: add of_mdio_find_device() api Add a helper function which finds the mdio_device structure given a device tree node. This is helpful for finding the PCS device based on a DTS node but managing it as a mdio_device instead of a phy_device. Signed-off-by: Russell King Signed-off-by: Ioana Ciornei Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/of_mdio.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/of_mdio.h b/include/linux/of_mdio.h index 1efb88d9f892..cfe8c607a628 100644 --- a/include/linux/of_mdio.h +++ b/include/linux/of_mdio.h @@ -17,6 +17,7 @@ bool of_mdiobus_child_is_phy(struct device_node *child); int of_mdiobus_register(struct mii_bus *mdio, struct device_node *np); int devm_of_mdiobus_register(struct device *dev, struct mii_bus *mdio, struct device_node *np); +struct mdio_device *of_mdio_find_device(struct device_node *np); struct phy_device *of_phy_find_device(struct device_node *phy_np); struct phy_device * of_phy_connect(struct net_device *dev, struct device_node *phy_np, @@ -74,6 +75,11 @@ static inline int of_mdiobus_register(struct mii_bus *mdio, struct device_node * return mdiobus_register(mdio); } +static inline struct mdio_device *of_mdio_find_device(struct device_node *np) +{ + return NULL; +} + static inline struct phy_device *of_phy_find_device(struct device_node *phy_np) { return NULL; -- cgit v1.2.3 From 08b81d873126b413cda511b1ea1cbb0e99938bbd Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 24 Sep 2020 08:30:01 +0800 Subject: mptcp: add sk_stop_timer_sync helper This patch added a new helper sk_stop_timer_sync, it deactivates a timer like sk_stop_timer, but waits for the handler to finish. Acked-by: Paolo Abeni Signed-off-by: Geliang Tang Reviewed-by: Mat Martineau Signed-off-by: David S. Miller --- include/net/sock.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index eaa5cac5e836..a5c6ae78df77 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2195,6 +2195,8 @@ void sk_reset_timer(struct sock *sk, struct timer_list *timer, void sk_stop_timer(struct sock *sk, struct timer_list *timer); +void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer); + int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue, struct sk_buff *skb, unsigned int flags, void (*destructor)(struct sock *sk, -- cgit v1.2.3 From 77d0cab93909edf1e5740643d788ac2e458b8187 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 25 Sep 2020 01:23:02 +0200 Subject: net: tcp: drop unused function argument from mptcp_incoming_options Since commit cfde141ea3faa30e ("mptcp: move option parsing into mptcp_incoming_options()"), the 3rd function argument is no longer used. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/net/mptcp.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 3525d2822abe..753ba7e755d6 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -85,8 +85,7 @@ bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts); -void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb, - struct tcp_options_received *opt_rx); +void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb); void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts); @@ -185,8 +184,7 @@ static inline bool mptcp_established_options(struct sock *sk, } static inline void mptcp_incoming_options(struct sock *sk, - struct sk_buff *skb, - struct tcp_options_received *opt_rx) + struct sk_buff *skb) { } -- cgit v1.2.3 From f19425641cb2572a33cb074d5e30283720bd4d22 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 6 Aug 2020 11:17:12 -0700 Subject: Bluetooth: L2CAP: Fix calling sk_filter on non-socket based channel Only sockets will have the chan->data set to an actual sk, channels like A2MP would have its own data which would likely cause a crash when calling sk_filter, in order to fix this a new callback has been introduced so channels can implement their own filtering if necessary. Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- include/net/bluetooth/l2cap.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index 8f1e6a7a2df8..1d1232917de7 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -665,6 +665,8 @@ struct l2cap_ops { struct sk_buff *(*alloc_skb) (struct l2cap_chan *chan, unsigned long hdr_len, unsigned long len, int nb); + int (*filter) (struct l2cap_chan * chan, + struct sk_buff *skb); }; struct l2cap_conn { -- cgit v1.2.3 From 1df8f55a37bd286a3d40192980050bc3d7d78887 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 24 Sep 2020 17:03:50 -0700 Subject: bpf: Enable bpf_skc_to_* sock casting helper to networking prog type There is a constant need to add more fields into the bpf_tcp_sock for the bpf programs running at tc, sock_ops...etc. A current workaround could be to use bpf_probe_read_kernel(). However, other than making another helper call for reading each field and missing CO-RE, it is also not as intuitive to use as directly reading "tp->lsndtime" for example. While already having perfmon cap to do bpf_probe_read_kernel(), it will be much easier if the bpf prog can directly read from the tcp_sock. This patch tries to do that by using the existing casting-helpers bpf_skc_to_*() whose func_proto returns a btf_id. For example, the func_proto of bpf_skc_to_tcp_sock returns the btf_id of the kernel "struct tcp_sock". These helpers are also added to is_ptr_cast_function(). It ensures the returning reg (BPF_REF_0) will also carries the ref_obj_id. That will keep the ref-tracking works properly. The bpf_skc_to_* helpers are made available to most of the bpf prog types in filter.c. The bpf_skc_to_* helpers will be limited by perfmon cap. This patch adds a ARG_PTR_TO_BTF_ID_SOCK_COMMON. The helper accepting this arg can accept a btf-id-ptr (PTR_TO_BTF_ID + &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON]) or a legacy-ctx-convert-skc-ptr (PTR_TO_SOCK_COMMON). The bpf_skc_to_*() helpers are changed to take ARG_PTR_TO_BTF_ID_SOCK_COMMON such that they will accept pointer obtained from skb->sk. Instead of specifying both arg_type and arg_btf_id in the same func_proto which is how the current ARG_PTR_TO_BTF_ID does, the arg_btf_id of the new ARG_PTR_TO_BTF_ID_SOCK_COMMON is specified in the compatible_reg_types[] in verifier.c. The reason is the arg_btf_id is always the same. Discussion in this thread: https://lore.kernel.org/bpf/20200922070422.1917351-1-kafai@fb.com/ The ARG_PTR_TO_BTF_ID_ part gives a clear expectation that the helper is expecting a PTR_TO_BTF_ID which could be NULL. This is the same behavior as the existing helper taking ARG_PTR_TO_BTF_ID. The _SOCK_COMMON part means the helper is also expecting the legacy SOCK_COMMON pointer. By excluding the _OR_NULL part, the bpf prog cannot call helper with a literal NULL which doesn't make sense in most cases. e.g. bpf_skc_to_tcp_sock(NULL) will be rejected. All PTR_TO_*_OR_NULL reg has to do a NULL check first before passing into the helper or else the bpf prog will be rejected. This behavior is nothing new and consistent with the current expectation during bpf-prog-load. [ ARG_PTR_TO_BTF_ID_SOCK_COMMON will be used to replace ARG_PTR_TO_SOCK* of other existing helpers later such that those existing helpers can take the PTR_TO_BTF_ID returned by the bpf_skc_to_*() helpers. The only special case is bpf_sk_lookup_assign() which can accept a literal NULL ptr. It has to be handled specially in another follow up patch if there is a need (e.g. by renaming ARG_PTR_TO_SOCKET_OR_NULL to ARG_PTR_TO_BTF_ID_SOCK_COMMON_OR_NULL). ] [ When converting the older helpers that take ARG_PTR_TO_SOCK* in the later patch, if the kernel does not support BTF, ARG_PTR_TO_BTF_ID_SOCK_COMMON will behave like ARG_PTR_TO_SOCK_COMMON because no reg->type could have PTR_TO_BTF_ID in this case. It is not a concern for the newer-btf-only helper like the bpf_skc_to_*() here though because these helpers must require BTF vmlinux to begin with. ] Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200925000350.3855720-1-kafai@fb.com --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index fc5c901c7542..d0937f1d2980 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -292,6 +292,7 @@ enum bpf_arg_type { ARG_PTR_TO_ALLOC_MEM, /* pointer to dynamically allocated memory */ ARG_PTR_TO_ALLOC_MEM_OR_NULL, /* pointer to dynamically allocated memory or NULL */ ARG_CONST_ALLOC_SIZE_OR_ZERO, /* number of allocated bytes requested */ + ARG_PTR_TO_BTF_ID_SOCK_COMMON, /* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */ __BPF_ARG_TYPE_MAX, }; -- cgit v1.2.3 From a5fa25adf03d4b063aece74ba70ccbb3a71af122 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 24 Sep 2020 17:03:56 -0700 Subject: bpf: Change bpf_sk_release and bpf_sk_*cgroup_id to accept ARG_PTR_TO_BTF_ID_SOCK_COMMON The previous patch allows the networking bpf prog to use the bpf_skc_to_*() helpers to get a PTR_TO_BTF_ID socket pointer, e.g. "struct tcp_sock *". It allows the bpf prog to read all the fields of the tcp_sock. This patch changes the bpf_sk_release() and bpf_sk_*cgroup_id() to take ARG_PTR_TO_BTF_ID_SOCK_COMMON such that they will work with the pointer returned by the bpf_skc_to_*() helpers also. For example, the following will work: sk = bpf_skc_lookup_tcp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0); if (!sk) return; tp = bpf_skc_to_tcp_sock(sk); if (!tp) { bpf_sk_release(sk); return; } lsndtime = tp->lsndtime; /* Pass tp to bpf_sk_release() will also work */ bpf_sk_release(tp); Since PTR_TO_BTF_ID could be NULL, the helper taking ARG_PTR_TO_BTF_ID_SOCK_COMMON has to check for NULL at runtime. A btf_id of "struct sock" may not always mean a fullsock. Regardless the helper's running context may get a non-fullsock or not, considering fullsock check/handling is pretty cheap, it is better to keep the same verifier expectation on helper that takes ARG_PTR_TO_BTF_ID* will be able to handle the minisock situation. In the bpf_sk_*cgroup_id() case, it will try to get a fullsock by using sk_to_full_sk() as its skb variant bpf_sk"b"_*cgroup_id() has already been doing. bpf_sk_release can already handle minisock, so nothing special has to be done. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200925000356.3856047-1-kafai@fb.com --- include/uapi/linux/bpf.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a22812561064..c96a56d9c3be 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2512,7 +2512,7 @@ union bpf_attr { * result is from *reuse*\ **->socks**\ [] using the hash of the * tuple. * - * long bpf_sk_release(struct bpf_sock *sock) + * long bpf_sk_release(void *sock) * Description * Release the reference held by *sock*. *sock* must be a * non-**NULL** pointer that was returned from @@ -3234,11 +3234,11 @@ union bpf_attr { * * **-EOVERFLOW** if an overflow happened: The same object will be tried again. * - * u64 bpf_sk_cgroup_id(struct bpf_sock *sk) + * u64 bpf_sk_cgroup_id(void *sk) * Description * Return the cgroup v2 id of the socket *sk*. * - * *sk* must be a non-**NULL** pointer to a full socket, e.g. one + * *sk* must be a non-**NULL** pointer to a socket, e.g. one * returned from **bpf_sk_lookup_xxx**\ (), * **bpf_sk_fullsock**\ (), etc. The format of returned id is * same as in **bpf_skb_cgroup_id**\ (). @@ -3248,7 +3248,7 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * - * u64 bpf_sk_ancestor_cgroup_id(struct bpf_sock *sk, int ancestor_level) + * u64 bpf_sk_ancestor_cgroup_id(void *sk, int ancestor_level) * Description * Return id of cgroup v2 that is ancestor of cgroup associated * with the *sk* at the *ancestor_level*. The root cgroup is at -- cgit v1.2.3 From 592a3498648af000e93dff2d36229ab11cd8c7f6 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 24 Sep 2020 17:04:02 -0700 Subject: bpf: Change bpf_sk_storage_*() to accept ARG_PTR_TO_BTF_ID_SOCK_COMMON This patch changes the bpf_sk_storage_*() to take ARG_PTR_TO_BTF_ID_SOCK_COMMON such that they will work with the pointer returned by the bpf_skc_to_*() helpers also. A micro benchmark has been done on a "cgroup_skb/egress" bpf program which does a bpf_sk_storage_get(). It was driven by netperf doing a 4096 connected UDP_STREAM test with 64bytes packet. The stats from "kernel.bpf_stats_enabled" shows no meaningful difference. The sk_storage_get_btf_proto, sk_storage_delete_btf_proto, btf_sk_storage_get_proto, and btf_sk_storage_delete_proto are no longer needed, so they are removed. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/20200925000402.3856307-1-kafai@fb.com --- include/net/bpf_sk_storage.h | 2 -- include/uapi/linux/bpf.h | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h index 119f4c9c3a9c..3c516dd07caf 100644 --- a/include/net/bpf_sk_storage.h +++ b/include/net/bpf_sk_storage.h @@ -20,8 +20,6 @@ void bpf_sk_storage_free(struct sock *sk); extern const struct bpf_func_proto bpf_sk_storage_get_proto; extern const struct bpf_func_proto bpf_sk_storage_delete_proto; -extern const struct bpf_func_proto sk_storage_get_btf_proto; -extern const struct bpf_func_proto sk_storage_delete_btf_proto; struct bpf_local_storage_elem; struct bpf_sk_storage_diag; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c96a56d9c3be..0ec6dbeb17a5 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2861,6 +2861,7 @@ union bpf_attr { * 0 on success. * * **-ENOENT** if the bpf-local-storage cannot be found. + * **-EINVAL** if sk is not a fullsock (e.g. a request_sock). * * long bpf_send_signal(u32 sig) * Description -- cgit v1.2.3 From c0df236e1394970f3503a8fb103de95d000014ca Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 24 Sep 2020 17:04:09 -0700 Subject: bpf: Change bpf_tcp_*_syncookie to accept ARG_PTR_TO_BTF_ID_SOCK_COMMON This patch changes the bpf_tcp_*_syncookie() to take ARG_PTR_TO_BTF_ID_SOCK_COMMON such that they will work with the pointer returned by the bpf_skc_to_*() helpers also. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/20200925000409.3856725-1-kafai@fb.com --- include/uapi/linux/bpf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0ec6dbeb17a5..69b9e30375bc 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2692,7 +2692,7 @@ union bpf_attr { * result is from *reuse*\ **->socks**\ [] using the hash of the * tuple. * - * long bpf_tcp_check_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) + * long bpf_tcp_check_syncookie(void *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) * Description * Check whether *iph* and *th* contain a valid SYN cookie ACK for * the listening socket in *sk*. @@ -2878,7 +2878,7 @@ union bpf_attr { * * **-EAGAIN** if bpf program can try again. * - * s64 bpf_tcp_gen_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) + * s64 bpf_tcp_gen_syncookie(void *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) * Description * Try to issue a SYN cookie for the packet with corresponding * IP/TCP headers, *iph* and *th*, on the listening socket in *sk*. -- cgit v1.2.3 From 27e5203bd9c5cc6d54dcac48c3027f3f04522b8b Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 24 Sep 2020 17:04:15 -0700 Subject: bpf: Change bpf_sk_assign to accept ARG_PTR_TO_BTF_ID_SOCK_COMMON This patch changes the bpf_sk_assign() to take ARG_PTR_TO_BTF_ID_SOCK_COMMON such that they will work with the pointer returned by the bpf_skc_to_*() helpers also. The bpf_sk_lookup_assign() is taking ARG_PTR_TO_SOCKET_"OR_NULL". Meaning it specifically takes a literal NULL. ARG_PTR_TO_BTF_ID_SOCK_COMMON does not allow a literal NULL, so another ARG type is required for this purpose and another follow-up patch can be used if there is such need. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200925000415.3857374-1-kafai@fb.com --- include/uapi/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 69b9e30375bc..2d6519a2ed77 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3107,7 +3107,7 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * - * long bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags) + * long bpf_sk_assign(struct sk_buff *skb, void *sk, u64 flags) * Description * Helper is overloaded depending on BPF program type. This * description applies to **BPF_PROG_TYPE_SCHED_CLS** and -- cgit v1.2.3 From e0f9956a3862b32ad73869a8e52a33c84aafa46f Mon Sep 17 00:00:00 2001 From: "Chuah, Kim Tatt" Date: Fri, 25 Sep 2020 17:40:41 +0800 Subject: net: stmmac: Add option for VLAN filter fail queue enable Add option in plat_stmmacenet_data struct to enable VLAN Filter Fail Queuing. This option allows packets that fail VLAN filter to be routed to a specific Rx queue when Receive All is also set. When this option is enabled: - Enable VFFQ only when entering promiscuous mode, because Receive All will pass up all rx packets that failed address filtering (similar to promiscuous mode). - VLAN-promiscuous mode is never entered to allow rx packet to fail VLAN filters and get routed to selected VFFQ Rx queue. Reviewed-by: Voon Weifeng Reviewed-by: Ong Boon Leong Signed-off-by: Chuah, Kim Tatt Signed-off-by: Ong Boon Leong Signed-off-by: David S. Miller --- include/linux/stmmac.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index bd964c31d333..00e83c877496 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -198,5 +198,7 @@ struct plat_stmmacenet_data { int mac_port_sel_speed; bool en_tx_lpi_clockgating; int has_xgmac; + bool vlan_fail_q_en; + u8 vlan_fail_q; }; #endif -- cgit v1.2.3 From ba5f4cfeac77fca981b199ec7f2396a3616e5216 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 24 Sep 2020 12:58:40 -0700 Subject: bpf: Add comment to document BTF type PTR_TO_BTF_ID_OR_NULL The meaning of PTR_TO_BTF_ID_OR_NULL differs slightly from other types denoted with the *_OR_NULL type. For example the types PTR_TO_SOCKET and PTR_TO_SOCKET_OR_NULL can be used for branch analysis because the type PTR_TO_SOCKET is guaranteed to _not_ have a null value. In contrast PTR_TO_BTF_ID and BTF_TO_BTF_ID_OR_NULL have slightly different meanings. A PTR_TO_BTF_TO_ID may be a pointer to NULL value, but it is safe to read this pointer in the program context because the program context will handle any faults. The fallout is for PTR_TO_BTF_ID the verifier can assume reads are safe, but can not use the type in branch analysis. Additionally, authors need to be extra careful when passing PTR_TO_BTF_ID into helpers. In general helpers consuming type PTR_TO_BTF_ID will need to assume it may be null. Seeing the above is not obvious to readers without the back knowledge lets add a comment in the type definition. Editorial comment, as networking and tracing programs get closer and more tightly merged we may need to consider a new type that we can ensure is non-null for branch analysis and also passing into helpers. Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Acked-by: Lorenz Bauer --- include/linux/bpf.h | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d0937f1d2980..79902325bef8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -383,8 +383,22 @@ enum bpf_reg_type { PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct tcp_sock or NULL */ PTR_TO_TP_BUFFER, /* reg points to a writable raw tp's buffer */ PTR_TO_XDP_SOCK, /* reg points to struct xdp_sock */ - PTR_TO_BTF_ID, /* reg points to kernel struct */ - PTR_TO_BTF_ID_OR_NULL, /* reg points to kernel struct or NULL */ + /* PTR_TO_BTF_ID points to a kernel struct that does not need + * to be null checked by the BPF program. This does not imply the + * pointer is _not_ null and in practice this can easily be a null + * pointer when reading pointer chains. The assumption is program + * context will handle null pointer dereference typically via fault + * handling. The verifier must keep this in mind and can make no + * assumptions about null or non-null when doing branch analysis. + * Further, when passed into helpers the helpers can not, without + * additional context, assume the value is non-null. + */ + PTR_TO_BTF_ID, + /* PTR_TO_BTF_ID_OR_NULL points to a kernel struct that has not + * been checked for null. Used primarily to inform the verifier + * an explicit null check is required for this struct. + */ + PTR_TO_BTF_ID_OR_NULL, PTR_TO_MEM, /* reg points to valid memory region */ PTR_TO_MEM_OR_NULL, /* reg points to valid memory region or NULL */ PTR_TO_RDONLY_BUF, /* reg points to a readonly buffer */ -- cgit v1.2.3 From 22ec3d232f8511b21355fcdb6fb2a4eced3decd8 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Fri, 25 Sep 2020 13:46:05 -0700 Subject: devlink: check flash_update parameter support in net core When implementing .flash_update, drivers which do not support per-component update are manually checking the component parameter to verify that it is NULL. Without this check, the driver might accept an update request with a component specified even though it will not honor such a request. Instead of having each driver check this, move the logic into net/core/devlink.c, and use a new `supported_flash_update_params` field in the devlink_ops. Drivers which will support per-component update must now specify this by setting DEVLINK_SUPPORT_FLASH_UPDATE_COMPONENT in the supported_flash_update_params in their devlink_ops. This helps ensure that drivers do not forget to check for a NULL component if they do not support per-component update. This also enables a slightly better error message by enabling the core stack to set the netlink bad attribute message to indicate precisely the unsupported attribute in the message. Going forward, any new additional parameter to flash update will require a bit in the supported_flash_update_params bitfield. Signed-off-by: Jacob Keller Reviewed-by: Jakub Kicinski Cc: Jiri Pirko Cc: Jonathan Corbet Cc: Michael Chan Cc: Bin Luo Cc: Saeed Mahameed Cc: Leon Romanovsky Cc: Ido Schimmel Cc: Danielle Ratson Cc: Shannon Nelson Signed-off-by: David S. Miller --- include/net/devlink.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 4883dbae7faf..cec6b4f109fa 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -550,6 +550,8 @@ enum devlink_param_generic_id { /* Firmware bundle identifier */ #define DEVLINK_INFO_VERSION_GENERIC_FW_BUNDLE_ID "fw.bundle_id" +#define DEVLINK_SUPPORT_FLASH_UPDATE_COMPONENT BIT(0) + struct devlink_region; struct devlink_info_req; @@ -1037,6 +1039,12 @@ enum devlink_trap_group_generic_id { } struct devlink_ops { + /** + * @supported_flash_update_params: + * mask of parameters supported by the driver's .flash_update + * implemementation. + */ + u32 supported_flash_update_params; int (*reload_down)(struct devlink *devlink, bool netns_change, struct netlink_ext_ack *extack); int (*reload_up)(struct devlink *devlink, @@ -1097,6 +1105,13 @@ struct devlink_ops { struct netlink_ext_ack *extack); int (*info_get)(struct devlink *devlink, struct devlink_info_req *req, struct netlink_ext_ack *extack); + /** + * @flash_update: Device flash update function + * + * Used to perform a flash update for the device. The set of + * parameters supported by the driver should be set in + * supported_flash_update_params. + */ int (*flash_update)(struct devlink *devlink, const char *file_name, const char *component, struct netlink_ext_ack *extack); -- cgit v1.2.3 From bc75c054f04048517e0b153ab38d973bbcdcef59 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Fri, 25 Sep 2020 13:46:06 -0700 Subject: devlink: convert flash_update to use params structure The devlink core recently gained support for checking whether the driver supports a flash_update parameter, via `supported_flash_update_params`. However, parameters are specified as function arguments. Adding a new parameter still requires modifying the signature of the .flash_update callback in all drivers. Convert the .flash_update function to take a new `struct devlink_flash_update_params` instead. By using this structure, and the `supported_flash_update_params` bit field, a new parameter to flash_update can be added without requiring modification to existing drivers. As before, all parameters except file_name will require driver opt-in. Because file_name is a necessary field to for the flash_update to make sense, no "SUPPORTED" bitflag is provided and it is always considered valid. All future additional parameters will require a new bit in the supported_flash_update_params bitfield. Signed-off-by: Jacob Keller Reviewed-by: Jakub Kicinski Cc: Jiri Pirko Cc: Jakub Kicinski Cc: Jonathan Corbet Cc: Michael Chan Cc: Bin Luo Cc: Saeed Mahameed Cc: Leon Romanovsky Cc: Ido Schimmel Cc: Danielle Ratson Signed-off-by: David S. Miller --- include/net/devlink.h | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index cec6b4f109fa..7794e1601772 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -550,6 +550,20 @@ enum devlink_param_generic_id { /* Firmware bundle identifier */ #define DEVLINK_INFO_VERSION_GENERIC_FW_BUNDLE_ID "fw.bundle_id" +/** + * struct devlink_flash_update_params - Flash Update parameters + * @file_name: the name of the flash firmware file to update from + * @component: the flash component to update + * + * With the exception of file_name, drivers must opt-in to parameters by + * setting the appropriate bit in the supported_flash_update_params field in + * their devlink_ops structure. + */ +struct devlink_flash_update_params { + const char *file_name; + const char *component; +}; + #define DEVLINK_SUPPORT_FLASH_UPDATE_COMPONENT BIT(0) struct devlink_region; @@ -1112,8 +1126,8 @@ struct devlink_ops { * parameters supported by the driver should be set in * supported_flash_update_params. */ - int (*flash_update)(struct devlink *devlink, const char *file_name, - const char *component, + int (*flash_update)(struct devlink *devlink, + struct devlink_flash_update_params *params, struct netlink_ext_ack *extack); /** * @trap_init: Trap initialization function. -- cgit v1.2.3 From 5d5b4128c4caae34ddcd9b2dc30ac4d6155617a3 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Fri, 25 Sep 2020 13:46:07 -0700 Subject: devlink: introduce flash update overwrite mask Sections of device flash may contain settings or device identifying information. When performing a flash update, it is generally expected that these settings and identifiers are not overwritten. However, it may sometimes be useful to allow overwriting these fields when performing a flash update. Some examples include, 1) customizing the initial device config on first programming, such as overwriting default device identifying information, or 2) reverting a device configuration to known good state provided in the new firmware image, or 3) in case it is suspected that current firmware logic for managing the preservation of fields during an update is broken. Although some devices are able to completely separate these types of settings and fields into separate components, this is not true for all hardware. To support controlling this behavior, a new DEVLINK_ATTR_FLASH_UPDATE_OVERWRITE_MASK is defined. This is an nla_bitfield32 which will define what subset of fields in a component should be overwritten during an update. If no bits are specified, or of the overwrite mask is not provided, then an update should not overwrite anything, and should maintain the settings and identifiers as they are in the previous image. If the overwrite mask has the DEVLINK_FLASH_OVERWRITE_SETTINGS bit set, then the device should be configured to overwrite any of the settings in the requested component with settings found in the provided image. Similarly, if the DEVLINK_FLASH_OVERWRITE_IDENTIFIERS bit is set, the device should be configured to overwrite any device identifiers in the requested component with the identifiers from the image. Multiple overwrite modes may be combined to indicate that a combination of the set of fields that should be overwritten. Drivers which support the new overwrite mask must set the DEVLINK_SUPPORT_FLASH_UPDATE_OVERWRITE_MASK in the supported_flash_update_params field of their devlink_ops. Signed-off-by: Jacob Keller Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/devlink.h | 4 +++- include/uapi/linux/devlink.h | 23 +++++++++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 7794e1601772..7339bf9ba6b4 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -562,9 +562,11 @@ enum devlink_param_generic_id { struct devlink_flash_update_params { const char *file_name; const char *component; + u32 overwrite_mask; }; -#define DEVLINK_SUPPORT_FLASH_UPDATE_COMPONENT BIT(0) +#define DEVLINK_SUPPORT_FLASH_UPDATE_COMPONENT BIT(0) +#define DEVLINK_SUPPORT_FLASH_UPDATE_OVERWRITE_MASK BIT(1) struct devlink_region; struct devlink_info_req; diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index a2ecc8b00611..7b0face1bad5 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -230,6 +230,28 @@ enum { DEVLINK_ATTR_STATS_MAX = __DEVLINK_ATTR_STATS_MAX - 1 }; +/* Specify what sections of a flash component can be overwritten when + * performing an update. Overwriting of firmware binary sections is always + * implicitly assumed to be allowed. + * + * Each section must be documented in + * Documentation/networking/devlink/devlink-flash.rst + * + */ +enum { + DEVLINK_FLASH_OVERWRITE_SETTINGS_BIT, + DEVLINK_FLASH_OVERWRITE_IDENTIFIERS_BIT, + + __DEVLINK_FLASH_OVERWRITE_MAX_BIT, + DEVLINK_FLASH_OVERWRITE_MAX_BIT = __DEVLINK_FLASH_OVERWRITE_MAX_BIT - 1 +}; + +#define DEVLINK_FLASH_OVERWRITE_SETTINGS _BITUL(DEVLINK_FLASH_OVERWRITE_SETTINGS_BIT) +#define DEVLINK_FLASH_OVERWRITE_IDENTIFIERS _BITUL(DEVLINK_FLASH_OVERWRITE_IDENTIFIERS_BIT) + +#define DEVLINK_SUPPORTED_FLASH_OVERWRITE_SECTIONS \ + (_BITUL(__DEVLINK_FLASH_OVERWRITE_MAX_BIT) - 1) + /** * enum devlink_trap_action - Packet trap action. * @DEVLINK_TRAP_ACTION_DROP: Packet is dropped by the device and a copy is not @@ -464,6 +486,7 @@ enum devlink_attr { DEVLINK_ATTR_PORT_CONTROLLER_NUMBER, /* u32 */ DEVLINK_ATTR_FLASH_UPDATE_STATUS_TIMEOUT, /* u64 */ + DEVLINK_ATTR_FLASH_UPDATE_OVERWRITE_MASK, /* bitfield32 */ /* add new attributes above here, update the policy in devlink.c */ -- cgit v1.2.3 From 2d44b097bbb9d0af0f3b94304fee4b639ab14171 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 26 Sep 2020 22:32:01 +0300 Subject: net: mscc: ocelot: move NPI port configuration to DSA Remove the ocelot_configure_cpu() function, which was in fact bringing up 2 ports: the CPU port module, which both switchdev and DSA have, and the NPI port, which only DSA has. The (non-Ethernet) CPU port module is at a fixed index in the analyzer, whereas the NPI port is selected through the "ethernet" property in the device tree. Therefore, the function to set up an NPI port is DSA-specific, so we move it there, simplifying the ocelot switch library a little bit. Cc: Horatiu Vultur Cc: Alexandre Belloni Cc: UNGLinuxDriver Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/soc/mscc/ocelot.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 3105bbb6cdcf..349e839c4c18 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -672,9 +672,6 @@ void __ocelot_rmw_ix(struct ocelot *ocelot, u32 val, u32 mask, u32 reg, int ocelot_regfields_init(struct ocelot *ocelot, const struct reg_field *const regfields); struct regmap *ocelot_regmap_init(struct ocelot *ocelot, struct resource *res); -void ocelot_configure_cpu(struct ocelot *ocelot, int npi, - enum ocelot_tag_prefix injection, - enum ocelot_tag_prefix extraction); int ocelot_init(struct ocelot *ocelot); void ocelot_deinit(struct ocelot *ocelot); void ocelot_init_port(struct ocelot *ocelot, int port); -- cgit v1.2.3 From c3975400c8014e39f7f272808377c97c906c8cee Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 26 Sep 2020 22:32:02 +0300 Subject: net: dsa: allow drivers to request promiscuous mode on master Currently DSA assumes that taggers don't mess with the destination MAC address of the frames on RX. That is not always the case. Some DSA headers are placed before the Ethernet header (ocelot), and others simply mangle random bytes from the destination MAC address (sja1105 with its incl_srcpt option). Currently the DSA master goes to promiscuous mode automatically when the slave devices go too (such as when enslaved to a bridge), but in standalone mode this is a problem that needs to be dealt with. So give drivers the possibility to signal that their tagging protocol will get randomly dropped otherwise, and let DSA deal with fixing that. Signed-off-by: Vladimir Oltean Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/net/dsa.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index d16057c5987a..46019edc32cb 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -84,6 +84,12 @@ struct dsa_device_ops { unsigned int overhead; const char *name; enum dsa_tag_protocol proto; + /* Some tagging protocols either mangle or shift the destination MAC + * address, in which case the DSA master would drop packets on ingress + * if what it understands out of the destination MAC address is not in + * its RX filter. + */ + bool promisc_on_master; }; /* This structure defines the control interfaces that are overlayed by the -- cgit v1.2.3 From 5124197ce58b5706bb60c2ecb3b79f4dfabab6e1 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 26 Sep 2020 22:32:04 +0300 Subject: net: dsa: tag_ocelot: use a short prefix on both ingress and egress There are 2 goals that we follow: - Reduce the header size - Make the header size equal between RX and TX The issue that required long prefix on RX was the fact that the ocelot DSA tag, being put before Ethernet as it is, would overlap with the area that a DSA master uses for RX filtering (destination MAC address mainly). Now that we can ask DSA to put the master in promiscuous mode, in theory we could remove the prefix altogether and call it a day, but it looks like we can't. Using no prefix on ingress, some packets (such as ICMP) would be received, while others (such as PTP) would not be received. This is because the DSA master we use (enetc) triggers parse errors ("MAC rx frame errors") presumably because it sees Ethernet frames with a bad length. And indeed, when using no prefix, the EtherType (bytes 12-13 of the frame, bits 96-111) falls over the REW_VAL field from the extraction header, aka the PTP timestamp. When turning the short (32-bit) prefix on, the EtherType overlaps with bits 64-79 of the extraction header, which are a reserved area transmitted as zero by the switch. The packets are not dropped by the DSA master with a short prefix. Actually, the frames look like this in tcpdump (below is a PTP frame, with an extra dsa_8021q tag - dadb 0482 - added by a downstream sja1105). 89:0c:a9:f2:01:00 > 88:80:00:0a:00:1d, 802.3, length 0: LLC, \ dsap Unknown (0x10) Individual, ssap ProWay NM (0x0e) Response, \ ctrl 0x0004: Information, send seq 2, rcv seq 0, \ Flags [Response], length 78 0x0000: 8880 000a 001d 890c a9f2 0100 0000 100f ................ 0x0010: 0400 0000 0180 c200 000e 001f 7b63 0248 ............{c.H 0x0020: dadb 0482 88f7 1202 0036 0000 0000 0000 .........6...... 0x0030: 0000 0000 0000 0000 0000 001f 7bff fe63 ............{..c 0x0040: 0248 0001 1f81 0500 0000 0000 0000 0000 .H.............. 0x0050: 0000 0000 0000 0000 0000 0000 ............ So the short prefix is our new default: we've shortened our RX frames by 12 octets, increased TX by 4, and headers are now equal between RX and TX. Note that we still need promiscuous mode for the DSA master to not drop it. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/soc/mscc/ocelot.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 349e839c4c18..3093385f6147 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -101,6 +101,7 @@ #define OCELOT_TAG_LEN 16 #define OCELOT_SHORT_PREFIX_LEN 4 #define OCELOT_LONG_PREFIX_LEN 16 +#define OCELOT_TOTAL_TAG_LEN (OCELOT_SHORT_PREFIX_LEN + OCELOT_TAG_LEN) #define OCELOT_SPEED_2500 0 #define OCELOT_SPEED_1000 1 -- cgit v1.2.3 From 2e8cb1b3db384382c84cc5f765c821454640aac1 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 26 Sep 2020 22:32:05 +0300 Subject: net: dsa: make the .flow_dissect tagger callback return void There is no tagger that returns anything other than zero, so just change the return type appropriately. Signed-off-by: Vladimir Oltean Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/net/dsa.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 46019edc32cb..98d339311898 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -74,8 +74,8 @@ struct dsa_device_ops { struct sk_buff *(*xmit)(struct sk_buff *skb, struct net_device *dev); struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt); - int (*flow_dissect)(const struct sk_buff *skb, __be16 *proto, - int *offset); + void (*flow_dissect)(const struct sk_buff *skb, __be16 *proto, + int *offset); /* Used to determine which traffic should match the DSA filter in * eth_type_trans, and which, if any, should bypass it and be processed * as regular on the master net device. -- cgit v1.2.3 From 9790cf20a8c4bb8d774797c238fa3643f4336e46 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 26 Sep 2020 22:32:06 +0300 Subject: net: dsa: add a generic procedure for the flow dissector For all DSA formats that don't use tail tags, it looks like behind the obscure number crunching they're all doing the same thing: locating the real EtherType behind the DSA tag. Nonetheless, this is not immediately obvious, so create a generic helper for those DSA taggers that put the header before the EtherType. Another assumption for the generic function is that the DSA tags are of equal length on RX and on TX. Prior to the previous patch, this was not true for ocelot and for gswip. The problem was resolved for ocelot, but for gswip it still remains, so that can't use this helper yet. Signed-off-by: Vladimir Oltean Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/net/dsa.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 98d339311898..817fab5e2c21 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -711,6 +711,32 @@ static inline bool dsa_can_decode(const struct sk_buff *skb, return false; } +/* All DSA tags that push the EtherType to the right (basically all except tail + * tags, which don't break dissection) can be treated the same from the + * perspective of the flow dissector. + * + * We need to return: + * - offset: the (B - A) difference between: + * A. the position of the real EtherType and + * B. the current skb->data (aka ETH_HLEN bytes into the frame, aka 2 bytes + * after the normal EtherType was supposed to be) + * The offset in bytes is exactly equal to the tagger overhead (and half of + * that, in __be16 shorts). + * + * - proto: the value of the real EtherType. + */ +static inline void dsa_tag_generic_flow_dissect(const struct sk_buff *skb, + __be16 *proto, int *offset) +{ +#if IS_ENABLED(CONFIG_NET_DSA) + const struct dsa_device_ops *ops = skb->dev->dsa_ptr->tag_ops; + int tag_len = ops->overhead; + + *offset = tag_len; + *proto = ((__be16 *)skb->data)[(tag_len / 2) - 1]; +#endif +} + #if IS_ENABLED(CONFIG_NET_DSA) static inline int __dsa_netdevice_ops_check(struct net_device *dev) { -- cgit v1.2.3 From 7a6ffe764be35af0527d8cfd047945e8f8797ddf Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 26 Sep 2020 22:32:07 +0300 Subject: net: dsa: point out the tail taggers The Marvell 88E6060 uses tag_trailer.c and the KSZ8795, KSZ9477 and KSZ9893 switches also use tail tags. Tell that to the DSA core, since this makes a difference for the flow dissector. Most switches break the parsing of frame headers, but these ones don't, so no flow dissector adjustment needs to be done for them. Signed-off-by: Vladimir Oltean Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/net/dsa.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 817fab5e2c21..b502a63d196e 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -90,6 +90,7 @@ struct dsa_device_ops { * its RX filter. */ bool promisc_on_master; + bool tail_tag; }; /* This structure defines the control interfaces that are overlayed by the -- cgit v1.2.3 From e622129569963fa3bab976685443756e5da70da9 Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Sun, 27 Sep 2020 16:01:50 +0800 Subject: ptp: add stub function for ptp_get_msgtype() Added the missing stub function for ptp_get_msgtype(). Fixes: 036c508ba95e ("ptp: Add generic ptp message type function") Reported-by: Randy Dunlap Signed-off-by: Yangbo Lu Acked-by: Randy Dunlap # build-tested Signed-off-by: David S. Miller --- include/linux/ptp_classify.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/ptp_classify.h b/include/linux/ptp_classify.h index 8437307cca8c..c6487b7ab026 100644 --- a/include/linux/ptp_classify.h +++ b/include/linux/ptp_classify.h @@ -134,5 +134,13 @@ static inline struct ptp_header *ptp_parse_header(struct sk_buff *skb, { return NULL; } +static inline u8 ptp_get_msgtype(const struct ptp_header *hdr, + unsigned int type) +{ + /* The return is meaningless. The stub function would not be + * executed since no available header from ptp_parse_header. + */ + return 0; +} #endif #endif /* _PTP_CLASSIFY_H_ */ -- cgit v1.2.3 From c8cb5b854b40f2ce52ccd032fa19750f4181d5fc Mon Sep 17 00:00:00 2001 From: Tova Mussai Date: Fri, 18 Sep 2020 11:33:13 +0200 Subject: nl80211/cfg80211: support 6 GHz scanning Support 6 GHz scanning, by * a new scan flag to scan for colocated BSSes advertised by (and found) APs on 2.4 & 5 GHz * doing the necessary reduced neighbor report parsing for this, to find them * adding the ability to split the scan request in case the device by itself cannot support this. Also add some necessary bits in mac80211 to not break with these changes. Signed-off-by: Tova Mussai Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20200918113313.232917c93af9.Ida22f0212f9122f47094d81659e879a50434a6a2@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 32 +++++++++++++++++++++++++++++++- include/uapi/linux/nl80211.h | 3 +++ 2 files changed, 34 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 10c2cc8f0efc..11eb81676e95 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2095,6 +2095,27 @@ struct cfg80211_scan_info { bool aborted; }; +/** + * struct cfg80211_scan_6ghz_params - relevant for 6 GHz only + * + * @short_bssid: short ssid to scan for + * @bssid: bssid to scan for + * @channel_idx: idx of the channel in the channel array in the scan request + * which the above info relvant to + * @unsolicited_probe: the AP transmits unsolicited probe response every 20 TU + * @short_ssid_valid: short_ssid is valid and can be used + * @psc_no_listen: when set, and the channel is a PSC channel, no need to wait + * 20 TUs before starting to send probe requests. + */ +struct cfg80211_scan_6ghz_params { + u32 short_ssid; + u32 channel_idx; + u8 bssid[ETH_ALEN]; + bool unsolicited_probe; + bool short_ssid_valid; + bool psc_no_listen; +}; + /** * struct cfg80211_scan_request - scan request description * @@ -2122,6 +2143,10 @@ struct cfg80211_scan_info { * @mac_addr_mask: MAC address mask used with randomisation, bits that * are 0 in the mask should be randomised, bits that are 1 should * be taken from the @mac_addr + * @scan_6ghz: relevant for split scan request only, + * true if this is the second scan request + * @n_6ghz_params: number of 6 GHz params + * @scan_6ghz_params: 6 GHz params * @bssid: BSSID to scan for (most commonly, the wildcard BSSID) */ struct cfg80211_scan_request { @@ -2149,6 +2174,9 @@ struct cfg80211_scan_request { struct cfg80211_scan_info info; bool notified; bool no_cck; + bool scan_6ghz; + u32 n_6ghz_params; + struct cfg80211_scan_6ghz_params *scan_6ghz_params; /* keep last */ struct ieee80211_channel *channels[]; @@ -4217,6 +4245,8 @@ struct cfg80211_ops { /** * enum wiphy_flags - wiphy capability flags * + * @WIPHY_FLAG_SPLIT_SCAN_6GHZ: if set to true, the scan request will be split + * into two, first for legacy bands and second for UHB. * @WIPHY_FLAG_NETNS_OK: if not set, do not allow changing the netns of this * wiphy at all * @WIPHY_FLAG_PS_ON_BY_DEFAULT: if set to true, powersave will be enabled @@ -4260,7 +4290,7 @@ struct cfg80211_ops { enum wiphy_flags { WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK = BIT(0), /* use hole at 1 */ - /* use hole at 2 */ + WIPHY_FLAG_SPLIT_SCAN_6GHZ = BIT(2), WIPHY_FLAG_NETNS_OK = BIT(3), WIPHY_FLAG_PS_ON_BY_DEFAULT = BIT(4), WIPHY_FLAG_4ADDR_AP = BIT(5), diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index bdc90b8dfd24..c74ceaddb909 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -6059,6 +6059,8 @@ enum nl80211_timeout_reason { * @NL80211_SCAN_FLAG_FREQ_KHZ: report scan results with * %NL80211_ATTR_SCAN_FREQ_KHZ. This also means * %NL80211_ATTR_SCAN_FREQUENCIES will not be included. + * @NL80211_SCAN_FLAG_COLOCATED_6GHZ: scan for colocated APs reported by + * 2.4/5 GHz APs */ enum nl80211_scan_flags { NL80211_SCAN_FLAG_LOW_PRIORITY = 1<<0, @@ -6075,6 +6077,7 @@ enum nl80211_scan_flags { NL80211_SCAN_FLAG_RANDOM_SN = 1<<11, NL80211_SCAN_FLAG_MIN_PREQ_CONTENT = 1<<12, NL80211_SCAN_FLAG_FREQ_KHZ = 1<<13, + NL80211_SCAN_FLAG_COLOCATED_6GHZ = 1<<14, }; /** -- cgit v1.2.3 From d2b7588a47de8322891de38ec14d15105d66cb1e Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Mon, 21 Sep 2020 19:28:04 -0700 Subject: nl80211: support S1G capability overrides in assoc NL80211_ATTR_S1G_CAPABILITY can be passed along with NL80211_ATTR_S1G_CAPABILITY_MASK to NL80211_CMD_ASSOCIATE to indicate S1G capabilities which should override the hardware capabilities in eg. the association request. Signed-off-by: Thomas Pedersen Link: https://lore.kernel.org/r/20200922022818.15855-4-thomas@adapt-ip.com [johannes: always require both attributes together, commit message] Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 2 ++ include/net/cfg80211.h | 3 +++ include/uapi/linux/nl80211.h | 9 +++++++++ 3 files changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 53fba39d4ba6..f71cffa18176 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2330,6 +2330,8 @@ ieee80211_he_spr_size(const u8 *he_spr_ie) } /* S1G Capabilities Information field */ +#define IEEE80211_S1G_CAPABILITY_LEN 15 + #define S1G_CAP0_S1G_LONG BIT(0) #define S1G_CAP0_SGI_1MHZ BIT(1) #define S1G_CAP0_SGI_2MHZ BIT(2) diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 11eb81676e95..bead4b9afeca 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2556,6 +2556,8 @@ enum cfg80211_assoc_req_flags { * @fils_nonces: FILS nonces (part of AAD) for protecting (Re)Association * Request/Response frame or %NULL if FILS is not used. This field starts * with 16 octets of STA Nonce followed by 16 octets of AP Nonce. + * @s1g_capa: S1G capability override + * @s1g_capa_mask: S1G capability override mask */ struct cfg80211_assoc_request { struct cfg80211_bss *bss; @@ -2570,6 +2572,7 @@ struct cfg80211_assoc_request { const u8 *fils_kek; size_t fils_kek_len; const u8 *fils_nonces; + struct ieee80211_s1g_cap s1g_capa, s1g_capa_mask; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index c74ceaddb909..05db40b4c56f 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2521,6 +2521,12 @@ enum nl80211_commands { * unsolicited broadcast probe response. It is a nested attribute, see * &enum nl80211_unsol_bcast_probe_resp_attributes. * + * @NL80211_ATTR_S1G_CAPABILITY: S1G Capability information element (from + * association request when used with NL80211_CMD_NEW_STATION) + * @NL80211_ATTR_S1G_CAPABILITY_MASK: S1G Capability Information element + * override mask. Used with NL80211_ATTR_S1G_CAPABILITY in + * NL80211_CMD_ASSOCIATE or NL80211_CMD_CONNECT. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3007,6 +3013,9 @@ enum nl80211_attrs { NL80211_ATTR_UNSOL_BCAST_PROBE_RESP, + NL80211_ATTR_S1G_CAPABILITY, + NL80211_ATTR_S1G_CAPABILITY_MASK, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, -- cgit v1.2.3 From 9eaffe5078ca0808603cdd15c4eaf0106a996f3a Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Mon, 21 Sep 2020 19:28:06 -0700 Subject: cfg80211: convert S1G beacon to scan results The S1G beacon is an extension frame as opposed to management frame for the regular beacon. This means we may have to occasionally cast the frame buffer to a different header type. Luckily this isn't too bad as scan results mostly only care about the IEs. Signed-off-by: Thomas Pedersen Link: https://lore.kernel.org/r/20200922022818.15855-6-thomas@adapt-ip.com Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index f71cffa18176..1ce0b37441b9 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -151,6 +151,9 @@ #define IEEE80211_ANO_NETTYPE_WILD 15 +/* bits unique to S1G beacon */ +#define IEEE80211_S1G_BCN_NEXT_TBTT 0x100 + /* control extension - for IEEE80211_FTYPE_CTL | IEEE80211_STYPE_CTL_EXT */ #define IEEE80211_CTL_EXT_POLL 0x2000 #define IEEE80211_CTL_EXT_SPR 0x3000 @@ -553,6 +556,28 @@ static inline bool ieee80211_is_s1g_beacon(__le16 fc) cpu_to_le16(IEEE80211_FTYPE_EXT | IEEE80211_STYPE_S1G_BEACON); } +/** + * ieee80211_next_tbtt_present - check if IEEE80211_FTYPE_EXT && + * IEEE80211_STYPE_S1G_BEACON && IEEE80211_S1G_BCN_NEXT_TBTT + * @fc: frame control bytes in little-endian byteorder + */ +static inline bool ieee80211_next_tbtt_present(__le16 fc) +{ + return (fc & cpu_to_le16(IEEE80211_FCTL_FTYPE | IEEE80211_FCTL_STYPE)) == + cpu_to_le16(IEEE80211_FTYPE_EXT | IEEE80211_STYPE_S1G_BEACON) && + fc & cpu_to_le16(IEEE80211_S1G_BCN_NEXT_TBTT); +} + +/** + * ieee80211_is_s1g_short_beacon - check if next tbtt present bit is set. Only + * true for S1G beacons when they're short. + * @fc: frame control bytes in little-endian byteorder + */ +static inline bool ieee80211_is_s1g_short_beacon(__le16 fc) +{ + return ieee80211_is_s1g_beacon(fc) && ieee80211_next_tbtt_present(fc); +} + /** * ieee80211_is_atim - check if IEEE80211_FTYPE_MGMT && IEEE80211_STYPE_ATIM * @fc: frame control bytes in little-endian byteorder @@ -1034,6 +1059,13 @@ struct ieee80211_ext { u8 change_seq; u8 variable[0]; } __packed s1g_beacon; + struct { + u8 sa[ETH_ALEN]; + __le32 timestamp; + u8 change_seq; + u8 next_tbtt[3]; + u8 variable[0]; + } __packed s1g_short_beacon; } u; } __packed __aligned(2); -- cgit v1.2.3 From 80ca25711380c8eabe51eed875ca9432b4f8939e Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Mon, 21 Sep 2020 19:28:09 -0700 Subject: cfg80211: handle Association Response from S1G STA The sending STA type is implicit based on beacon or probe response content. If sending STA was an S1G STA, adjust the Information Element location accordingly. Signed-off-by: Thomas Pedersen Link: https://lore.kernel.org/r/20200922022818.15855-9-thomas@adapt-ip.com Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 1ce0b37441b9..d0fda8424118 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1100,6 +1100,11 @@ struct ieee80211_mgmt { /* followed by Supported rates */ u8 variable[0]; } __packed assoc_resp, reassoc_resp; + struct { + __le16 capab_info; + __le16 status_code; + u8 variable[0]; + } __packed s1g_assoc_resp, s1g_reassoc_resp; struct { __le16 capab_info; __le16 listen_interval; -- cgit v1.2.3 From 05d109576a36fd498e5db2d905eb50c7dd844b83 Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Mon, 21 Sep 2020 19:28:10 -0700 Subject: mac80211: encode listen interval for S1G S1G allows listen interval up to 2^14 * 10000 beacon intervals. In order to do this listen interval needs a scaling factor applied to the lower 14 bits. Calculate this and properly encode the listen interval for S1G STAs. See IEEE802.11ah-2016 Table 9-44a for reference. Signed-off-by: Thomas Pedersen Link: https://lore.kernel.org/r/20200922022818.15855-10-thomas@adapt-ip.com [move listen_int_usf into function using it] Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index d0fda8424118..7b6af47dd279 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2448,6 +2448,13 @@ ieee80211_he_spr_size(const u8 *he_spr_ie) #define S1G_OPER_CH_WIDTH_PRIMARY_1MHZ BIT(0) #define S1G_OPER_CH_WIDTH_OPER GENMASK(4, 1) + +#define LISTEN_INT_USF GENMASK(15, 14) +#define LISTEN_INT_UI GENMASK(13, 0) + +#define IEEE80211_MAX_USF FIELD_MAX(LISTEN_INT_USF) +#define IEEE80211_MAX_UI FIELD_MAX(LISTEN_INT_UI) + /* Authentication algorithms */ #define WLAN_AUTH_OPEN 0 #define WLAN_AUTH_SHARED_KEY 1 -- cgit v1.2.3 From 1821f8b36f112be9e3071779da82e14384fc6989 Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Mon, 21 Sep 2020 19:28:12 -0700 Subject: mac80211: handle S1G low rates S1G doesn't have legacy (sband->bitrates) rates, only MCS. For now, just send a frame at MCS 0 if a low rate is requested. Note we also redefine (since we're out of TX flags) TX_RC_VHT_MCS as TX_RC_S1G_MCS to indicate an S1G MCS. This is probably OK as VHT MCS is not valid on S1G band and vice versa. Signed-off-by: Thomas Pedersen Link: https://lore.kernel.org/r/20200922022818.15855-12-thomas@adapt-ip.com Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index e90089d104b0..de22524e9270 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -833,6 +833,8 @@ enum mac80211_tx_info_flags { #define IEEE80211_TX_CTL_STBC_SHIFT 23 +#define IEEE80211_TX_RC_S1G_MCS IEEE80211_TX_RC_VHT_MCS + /** * enum mac80211_tx_control_flags - flags to describe transmit control * -- cgit v1.2.3 From 1d00ce807efaa0ee3a96de7801be042a06d35873 Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Mon, 21 Sep 2020 19:28:15 -0700 Subject: mac80211: support S1G association The changes required for associating in S1G are: - apply S1G BSS channel info before assoc - mark all S1G STAs as QoS STAs - include and parse AID request element - handle new Association Response format - don't fail assoc if supported rates element is missing Signed-off-by: Thomas Pedersen Link: https://lore.kernel.org/r/20200922022818.15855-15-thomas@adapt-ip.com [pass skb to ieee80211_add_aid_request_ie(), remove unused variable 'bss'] Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 21 +++++++++++++++++++++ include/net/mac80211.h | 2 ++ 2 files changed, 23 insertions(+) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 7b6af47dd279..f2f56b287aed 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -987,6 +987,25 @@ enum ieee80211_vht_opmode_bits { IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF = 0x80, }; +/** + * enum ieee80211_s1g_chanwidth + * These are defined in IEEE802.11-2016ah Table 10-20 + * as BSS Channel Width + * + * @IEEE80211_S1G_CHANWIDTH_1MHZ: 1MHz operating channel + * @IEEE80211_S1G_CHANWIDTH_2MHZ: 2MHz operating channel + * @IEEE80211_S1G_CHANWIDTH_4MHZ: 4MHz operating channel + * @IEEE80211_S1G_CHANWIDTH_8MHZ: 8MHz operating channel + * @IEEE80211_S1G_CHANWIDTH_16MHZ: 16MHz operating channel + */ +enum ieee80211_s1g_chanwidth { + IEEE80211_S1G_CHANWIDTH_1MHZ = 0, + IEEE80211_S1G_CHANWIDTH_2MHZ = 1, + IEEE80211_S1G_CHANWIDTH_4MHZ = 3, + IEEE80211_S1G_CHANWIDTH_8MHZ = 7, + IEEE80211_S1G_CHANWIDTH_16MHZ = 15, +}; + #define WLAN_SA_QUERY_TR_ID_LEN 2 #define WLAN_MEMBERSHIP_LEN 8 #define WLAN_USER_POSITION_LEN 16 @@ -2854,6 +2873,8 @@ enum ieee80211_eid { WLAN_EID_REDUCED_NEIGHBOR_REPORT = 201, + WLAN_EID_AID_REQUEST = 210, + WLAN_EID_AID_RESPONSE = 211, WLAN_EID_S1G_BCN_COMPAT = 213, WLAN_EID_S1G_SHORT_BCN_INTERVAL = 214, WLAN_EID_S1G_CAPABILITIES = 217, diff --git a/include/net/mac80211.h b/include/net/mac80211.h index de22524e9270..72bc877d2c22 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -627,6 +627,7 @@ struct ieee80211_fils_discovery { * @fils_discovery: FILS discovery configuration * @unsol_bcast_probe_resp_interval: Unsolicited broadcast probe response * interval. + * @s1g: BSS is S1G BSS (affects Association Request format). */ struct ieee80211_bss_conf { const u8 *bssid; @@ -696,6 +697,7 @@ struct ieee80211_bss_conf { struct cfg80211_he_bss_color he_bss_color; struct ieee80211_fils_discovery fils_discovery; u32 unsol_bcast_probe_resp_interval; + bool s1g; }; /** -- cgit v1.2.3 From 58ef7c1b555e0e605da24b76cb2821dd3fcd6bc6 Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Mon, 21 Sep 2020 19:28:16 -0700 Subject: nl80211: include frequency offset in survey info Recently channels gained a potential frequency offset, so include this in the per-channel survey info. Signed-off-by: Thomas Pedersen Link: https://lore.kernel.org/r/20200922022818.15855-16-thomas@adapt-ip.com [add the offset only if non-zero] Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 05db40b4c56f..1e51445f81cd 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4097,6 +4097,7 @@ enum nl80211_user_reg_hint_type { * receiving frames destined to the local BSS * @NL80211_SURVEY_INFO_MAX: highest survey info attribute number * currently defined + * @NL80211_SURVEY_INFO_FREQUENCY_OFFSET: center frequency offset in KHz * @__NL80211_SURVEY_INFO_AFTER_LAST: internal use */ enum nl80211_survey_info { @@ -4112,6 +4113,7 @@ enum nl80211_survey_info { NL80211_SURVEY_INFO_TIME_SCAN, NL80211_SURVEY_INFO_PAD, NL80211_SURVEY_INFO_TIME_BSS_RX, + NL80211_SURVEY_INFO_FREQUENCY_OFFSET, /* keep last */ __NL80211_SURVEY_INFO_AFTER_LAST, -- cgit v1.2.3 From 265a0708339daeb71848169f52b91066cc2984fd Mon Sep 17 00:00:00 2001 From: Ben Greear Date: Tue, 22 Sep 2020 12:19:56 -0700 Subject: mac80211: Support not iterating over not-sdata-in-driver ifaces Allow drivers to request that interface-iterator does NOT iterate over interfaces that are not sdata-in-driver. This will allow us to fix crashes in ath10k (and possibly other drivers). To summarize Johannes' explanation: Consider add interface wlan0 add interface wlan1 iterate active interfaces -> wlan0 wlan1 add interface wlan2 iterate active interfaces -> wlan0 wlan1 wlan2 If you apply this scenario to a restart, which ought to be functionally equivalent to the normal startup, just compressed in time, you're basically saying that today you get add interface wlan0 add interface wlan1 iterate active interfaces -> wlan0 wlan1 wlan2 << problem here add interface wlan2 iterate active interfaces -> wlan0 wlan1 wlan2 which yeah, totally seems wrong. But fixing that to be add interface wlan0 add interface wlan1 iterate active interfaces -> add interface wlan2 iterate active interfaces -> (or maybe -> wlan0 wlan1 wlan2 if the reconfig already completed) This is also at least somewhat wrong, but better to not iterate over something that exists in the driver than iterate over something that does not. Originally the first issue was causing crashes in testing with lots of station vdevs on an ath10k radio, combined with firmware crashing. I ran with a similar patch for years with no obvious bad results, including significant testing with ath9k and ath10k. Signed-off-by: Ben Greear Link: https://lore.kernel.org/r/20200922191957.25257-1-greearb@candelatech.com Signed-off-by: Johannes Berg --- include/net/mac80211.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 72bc877d2c22..4747d446179a 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -5407,11 +5407,15 @@ void ieee80211_sched_scan_stopped(struct ieee80211_hw *hw); * @IEEE80211_IFACE_ITER_RESUME_ALL: During resume, iterate over all * interfaces, even if they haven't been re-added to the driver yet. * @IEEE80211_IFACE_ITER_ACTIVE: Iterate only active interfaces (netdev is up). + * @IEEE80211_IFACE_SKIP_SDATA_NOT_IN_DRIVER: Skip any interfaces where SDATA + * is not in the driver. This may fix crashes during firmware recovery + * for instance. */ enum ieee80211_interface_iteration_flags { IEEE80211_IFACE_ITER_NORMAL = 0, IEEE80211_IFACE_ITER_RESUME_ALL = BIT(0), IEEE80211_IFACE_ITER_ACTIVE = BIT(1), + IEEE80211_IFACE_SKIP_SDATA_NOT_IN_DRIVER = BIT(2), }; /** -- cgit v1.2.3 From f5bec330e3010450daeb5cb6a94a4a7c54afa306 Mon Sep 17 00:00:00 2001 From: Rajkumar Manoharan Date: Mon, 28 Sep 2020 00:28:11 -0700 Subject: nl80211: extend support to config spatial reuse parameter set Allow the user to configure below Spatial Reuse Parameter Set element. * Non-SRG OBSS PD Max Offset * SRG BSS Color Bitmap * SRG Partial BSSID Bitmap Signed-off-by: Rajkumar Manoharan Link: https://lore.kernel.org/r/1601278091-20313-2-git-send-email-rmanohar@codeaurora.org Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 7 +++++-- include/net/cfg80211.h | 10 ++++++++++ include/uapi/linux/nl80211.h | 11 +++++++++++ 3 files changed, 26 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index f2f56b287aed..770408b2fdaf 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2350,8 +2350,11 @@ ieee80211_he_6ghz_oper(const struct ieee80211_he_operation *he_oper) } /* HE Spatial Reuse defines */ -#define IEEE80211_HE_SPR_NON_SRG_OFFSET_PRESENT 0x4 -#define IEEE80211_HE_SPR_SRG_INFORMATION_PRESENT 0x8 +#define IEEE80211_HE_SPR_PSR_DISALLOWED BIT(0) +#define IEEE80211_HE_SPR_NON_SRG_OBSS_PD_SR_DISALLOWED BIT(1) +#define IEEE80211_HE_SPR_NON_SRG_OFFSET_PRESENT BIT(2) +#define IEEE80211_HE_SPR_SRG_INFORMATION_PRESENT BIT(3) +#define IEEE80211_HE_SPR_HESIGA_SR_VAL15_ALLOWED BIT(4) /* * ieee80211_he_spr_size - calculate 802.11ax HE Spatial Reuse IE size diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index bead4b9afeca..aee47f2b5709 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -269,13 +269,23 @@ struct ieee80211_rate { * struct ieee80211_he_obss_pd - AP settings for spatial reuse * * @enable: is the feature enabled. + * @sr_ctrl: The SR Control field of SRP element. + * @non_srg_max_offset: non-SRG maximum tx power offset * @min_offset: minimal tx power offset an associated station shall use * @max_offset: maximum tx power offset an associated station shall use + * @bss_color_bitmap: bitmap that indicates the BSS color values used by + * members of the SRG + * @partial_bssid_bitmap: bitmap that indicates the partial BSSID values + * used by members of the SRG */ struct ieee80211_he_obss_pd { bool enable; + u8 sr_ctrl; + u8 non_srg_max_offset; u8 min_offset; u8 max_offset; + u8 bss_color_bitmap[8]; + u8 partial_bssid_bitmap[8]; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 1e51445f81cd..47700a2b9af9 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -6991,6 +6991,13 @@ enum nl80211_peer_measurement_ftm_resp { * * @NL80211_HE_OBSS_PD_ATTR_MIN_OFFSET: the OBSS PD minimum tx power offset. * @NL80211_HE_OBSS_PD_ATTR_MAX_OFFSET: the OBSS PD maximum tx power offset. + * @NL80211_HE_OBSS_PD_ATTR_NON_SRG_MAX_OFFSET: the non-SRG OBSS PD maximum + * tx power offset. + * @NL80211_HE_OBSS_PD_ATTR_BSS_COLOR_BITMAP: bitmap that indicates the BSS color + * values used by members of the SRG. + * @NL80211_HE_OBSS_PD_ATTR_PARTIAL_BSSID_BITMAP: bitmap that indicates the partial + * BSSID values used by members of the SRG. + * @NL80211_HE_OBSS_PD_ATTR_SR_CTRL: The SR Control field of SRP element. * * @__NL80211_HE_OBSS_PD_ATTR_LAST: Internal * @NL80211_HE_OBSS_PD_ATTR_MAX: highest OBSS PD attribute. @@ -7000,6 +7007,10 @@ enum nl80211_obss_pd_attributes { NL80211_HE_OBSS_PD_ATTR_MIN_OFFSET, NL80211_HE_OBSS_PD_ATTR_MAX_OFFSET, + NL80211_HE_OBSS_PD_ATTR_NON_SRG_MAX_OFFSET, + NL80211_HE_OBSS_PD_ATTR_BSS_COLOR_BITMAP, + NL80211_HE_OBSS_PD_ATTR_PARTIAL_BSSID_BITMAP, + NL80211_HE_OBSS_PD_ATTR_SR_CTRL, /* keep last */ __NL80211_HE_OBSS_PD_ATTR_LAST, -- cgit v1.2.3 From 74cc6d182d038cbba6c6d91beb1b2bab926b618b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 25 Sep 2020 17:56:40 -0700 Subject: udp_tunnel: add the ability to share port tables Unfortunately recent Intel NIC designs share the UDP port table across netdevs. So far the UDP tunnel port state was maintained per netdev, we need to extend that to cater to Intel NICs. Expect NICs to allocate the info structure dynamically and link to the state from there. All the shared NICs will record port offload information in the one instance of the table so we need to make sure that the use count can accommodate larger numbers. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/udp_tunnel.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include') diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index 94bb7a882250..2ea453dac876 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -200,11 +200,27 @@ enum udp_tunnel_nic_info_flags { UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN = BIT(3), }; +struct udp_tunnel_nic; + +#define UDP_TUNNEL_NIC_MAX_SHARING_DEVICES (U16_MAX / 2) + +struct udp_tunnel_nic_shared { + struct udp_tunnel_nic *udp_tunnel_nic_info; + + struct list_head devices; +}; + +struct udp_tunnel_nic_shared_node { + struct net_device *dev; + struct list_head list; +}; + /** * struct udp_tunnel_nic_info - driver UDP tunnel offload information * @set_port: callback for adding a new port * @unset_port: callback for removing a port * @sync_table: callback for syncing the entire port table at once + * @shared: reference to device global state (optional) * @flags: device flags from enum udp_tunnel_nic_info_flags * @tables: UDP port tables this device has * @tables.n_entries: number of entries in this table @@ -213,6 +229,12 @@ enum udp_tunnel_nic_info_flags { * Drivers are expected to provide either @set_port and @unset_port callbacks * or the @sync_table callback. Callbacks are invoked with rtnl lock held. * + * Devices which (misguidedly) share the UDP tunnel port table across multiple + * netdevs should allocate an instance of struct udp_tunnel_nic_shared and + * point @shared at it. + * There must never be more than %UDP_TUNNEL_NIC_MAX_SHARING_DEVICES devices + * sharing a table. + * * Known limitations: * - UDP tunnel port notifications are fundamentally best-effort - * it is likely the driver will both see skbs which use a UDP tunnel port, @@ -234,6 +256,8 @@ struct udp_tunnel_nic_info { /* all at once */ int (*sync_table)(struct net_device *dev, unsigned int table); + struct udp_tunnel_nic_shared *shared; + unsigned int flags; struct udp_tunnel_nic_table_info { -- cgit v1.2.3 From 1b4d60ec162f82ea29a2e7a907b5c6cc9f926321 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 25 Sep 2020 13:54:29 -0700 Subject: bpf: Enable BPF_PROG_TEST_RUN for raw_tracepoint Add .test_run for raw_tracepoint. Also, introduce a new feature that runs the target program on a specific CPU. This is achieved by a new flag in bpf_attr.test, BPF_F_TEST_RUN_ON_CPU. When this flag is set, the program is triggered on cpu with id bpf_attr.test.cpu. This feature is needed for BPF programs that handle perf_event and other percpu resources, as the program can access these resource locally. Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200925205432.1777-2-songliubraving@fb.com --- include/linux/bpf.h | 3 +++ include/uapi/linux/bpf.h | 7 +++++++ 2 files changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 79902325bef8..db6dcdee7933 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1396,6 +1396,9 @@ int bpf_prog_test_run_tracing(struct bpf_prog *prog, int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); +int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr); bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2d6519a2ed77..82522f05c021 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -424,6 +424,11 @@ enum { */ #define BPF_F_QUERY_EFFECTIVE (1U << 0) +/* Flags for BPF_PROG_TEST_RUN */ + +/* If set, run the test on the cpu specified by bpf_attr.test.cpu */ +#define BPF_F_TEST_RUN_ON_CPU (1U << 0) + /* type for BPF_ENABLE_STATS */ enum bpf_stats_type { /* enabled run_time_ns and run_cnt */ @@ -566,6 +571,8 @@ union bpf_attr { */ __aligned_u64 ctx_in; __aligned_u64 ctx_out; + __u32 flags; + __u32 cpu; } test; struct { /* anonymous struct used by BPF_*_GET_*_ID */ -- cgit v1.2.3 From 201091ebb2a161a0e10aab36186690b332941f6a Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Sat, 26 Sep 2020 12:44:24 +0200 Subject: net/smc: introduce System Enterprise ID (SEID) SMCD version 2 defines a System Enterprise ID (short SEID). This patch contains the SEID creation and adds the callback to retrieve the created SEID. Signed-off-by: Ursula Braun Signed-off-by: Karsten Graul Signed-off-by: David S. Miller --- include/net/smc.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/smc.h b/include/net/smc.h index 646feb4bc75f..b28b384d0625 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -37,6 +37,8 @@ struct smcd_dmb { #define ISM_EVENT_GID 1 #define ISM_EVENT_SWR 2 +#define ISM_RESERVED_VLANID 0x1FFF + #define ISM_ERROR 0xFFFF struct smcd_event { @@ -63,6 +65,7 @@ struct smcd_ops { int (*move_data)(struct smcd_dev *dev, u64 dmb_tok, unsigned int idx, bool sf, unsigned int offset, void *data, unsigned int size); + void (*get_system_eid)(struct smcd_dev *dev, u8 **eid); }; struct smcd_dev { -- cgit v1.2.3 From 8caaccf521c160d231587091f1f5e8aef2dd0a5e Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Sat, 26 Sep 2020 12:44:25 +0200 Subject: net/smc: introduce CHID callback for ISM devices With SMCD version 2 the CHIDs of ISM devices are needed for the CLC handshake. This patch provides the new callback to retrieve the CHID of an ISM device. Signed-off-by: Ursula Braun Signed-off-by: Karsten Graul Signed-off-by: David S. Miller --- include/net/smc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/smc.h b/include/net/smc.h index b28b384d0625..e441aa97ad61 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -66,6 +66,7 @@ struct smcd_ops { bool sf, unsigned int offset, void *data, unsigned int size); void (*get_system_eid)(struct smcd_dev *dev, u8 **eid); + u16 (*get_chid)(struct smcd_dev *dev); }; struct smcd_dev { -- cgit v1.2.3 From efc68158c429f37d87fd02ee9a26913c78546fc9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Fri, 25 Sep 2020 23:25:01 +0200 Subject: bpf: change logging calls from verbose() to bpf_log() and use log pointer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation for moving code around, change a bunch of references to env->log (and the verbose() logging helper) to use bpf_log() and a direct pointer to struct bpf_verifier_log. While we're touching the function signature, mark the 'prog' argument to bpf_check_type_match() as const. Also enhance the bpf_verifier_log_needed() check to handle NULL pointers for the log struct so we can re-use the code with logging disabled. Acked-by: Andrii Nakryiko Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- include/linux/bpf_verifier.h | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index db6dcdee7933..5176726f4f03 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1420,7 +1420,7 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs); int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *reg); -int btf_check_type_match(struct bpf_verifier_env *env, struct bpf_prog *prog, +int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog, struct btf *btf, const struct btf_type *t); struct bpf_prog *bpf_prog_by_id(u32 id); diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 2bb48a2c4d08..7bc9276c4ef4 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -347,8 +347,9 @@ static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log) static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) { - return (log->level && log->ubuf && !bpf_verifier_log_full(log)) || - log->level == BPF_LOG_KERNEL; + return log && + ((log->level && log->ubuf && !bpf_verifier_log_full(log)) || + log->level == BPF_LOG_KERNEL); } #define BPF_MAX_SUBPROGS 256 -- cgit v1.2.3 From f7b12b6fea00988496b7606d4964cd77beef46a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Fri, 25 Sep 2020 23:25:02 +0200 Subject: bpf: verifier: refactor check_attach_btf_id() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The check_attach_btf_id() function really does three things: 1. It performs a bunch of checks on the program to ensure that the attachment is valid. 2. It stores a bunch of state about the attachment being requested in the verifier environment and struct bpf_prog objects. 3. It allocates a trampoline for the attachment. This patch splits out (1.) and (3.) into separate functions which will perform the checks, but return the computed values instead of directly modifying the environment. This is done in preparation for reusing the checks when the actual attachment is happening, which will allow tracing programs to have multiple (compatible) attachments. This also fixes a bug where a bunch of checks were skipped if a trampoline already existed for the tracing target. Fixes: 6ba43b761c41 ("bpf: Attachment verification for BPF_MODIFY_RETURN") Fixes: 1e6c62a88215 ("bpf: Introduce sleepable BPF programs") Acked-by: Andrii Nakryiko Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 19 ++++++++++++++----- include/linux/bpf_verifier.h | 13 +++++++++++++ 2 files changed, 27 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5176726f4f03..b89a30764069 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -606,6 +606,13 @@ struct bpf_trampoline { struct bpf_ksym ksym; }; +struct bpf_attach_target_info { + struct btf_func_model fmodel; + long tgt_addr; + const char *tgt_name; + const struct btf_type *tgt_type; +}; + #define BPF_DISPATCHER_MAX 48 /* Fits in 2048B */ struct bpf_dispatcher_prog { @@ -633,9 +640,10 @@ static __always_inline unsigned int bpf_dispatcher_nop_func( return bpf_func(ctx, insnsi); } #ifdef CONFIG_BPF_JIT -struct bpf_trampoline *bpf_trampoline_lookup(u64 key); int bpf_trampoline_link_prog(struct bpf_prog *prog); int bpf_trampoline_unlink_prog(struct bpf_prog *prog); +struct bpf_trampoline *bpf_trampoline_get(u64 key, + struct bpf_attach_target_info *tgt_info); void bpf_trampoline_put(struct bpf_trampoline *tr); #define BPF_DISPATCHER_INIT(_name) { \ .mutex = __MUTEX_INITIALIZER(_name.mutex), \ @@ -680,10 +688,6 @@ void bpf_image_ksym_del(struct bpf_ksym *ksym); void bpf_ksym_add(struct bpf_ksym *ksym); void bpf_ksym_del(struct bpf_ksym *ksym); #else -static inline struct bpf_trampoline *bpf_trampoline_lookup(u64 key) -{ - return NULL; -} static inline int bpf_trampoline_link_prog(struct bpf_prog *prog) { return -ENOTSUPP; @@ -692,6 +696,11 @@ static inline int bpf_trampoline_unlink_prog(struct bpf_prog *prog) { return -ENOTSUPP; } +static inline struct bpf_trampoline *bpf_trampoline_get(u64 key, + struct bpf_attach_target_info *tgt_info) +{ + return ERR_PTR(-EOPNOTSUPP); +} static inline void bpf_trampoline_put(struct bpf_trampoline *tr) {} #define DEFINE_BPF_DISPATCHER(name) #define DECLARE_BPF_DISPATCHER(name) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 7bc9276c4ef4..363b4f1c562a 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -450,4 +450,17 @@ bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt); int check_ctx_reg(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno); +/* this lives here instead of in bpf.h because it needs to dereference tgt_prog */ +static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog, + u32 btf_id) +{ + return tgt_prog ? (((u64)tgt_prog->aux->id) << 32 | btf_id) : btf_id; +} + +int bpf_check_attach_target(struct bpf_verifier_log *log, + const struct bpf_prog *prog, + const struct bpf_prog *tgt_prog, + u32 btf_id, + struct bpf_attach_target_info *tgt_info); + #endif /* _LINUX_BPF_VERIFIER_H */ -- cgit v1.2.3 From 76654e67f3a01c50dc13dd6dea75e58943413956 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Mon, 28 Sep 2020 12:31:03 +0100 Subject: bpf: Provide function to get vmlinux BTF information It will be used later for BPF structure display support Signed-off-by: Alan Maguire Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1601292670-1616-2-git-send-email-alan.maguire@oracle.com --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b89a30764069..e620a4b1290f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1364,6 +1364,8 @@ int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, union bpf_attr __user *uattr); void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); +struct btf *bpf_get_btf_vmlinux(void); + /* Map specifics */ struct xdp_buff; struct sk_buff; -- cgit v1.2.3 From 31d0bc81637d8d974a6dad9827b765b4b70c89d7 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Mon, 28 Sep 2020 12:31:04 +0100 Subject: bpf: Move to generic BTF show support, apply it to seq files/strings generalize the "seq_show" seq file support in btf.c to support a generic show callback of which we support two instances; the current seq file show, and a show with snprintf() behaviour which instead writes the type data to a supplied string. Both classes of show function call btf_type_show() with different targets; the seq file or the string to be written. In the string case we need to track additional data - length left in string to write and length to return that we would have written (a la snprintf). By default show will display type information, field members and their types and values etc, and the information is indented based upon structure depth. Zeroed fields are omitted. Show however supports flags which modify its behaviour: BTF_SHOW_COMPACT - suppress newline/indent. BTF_SHOW_NONAME - suppress show of type and member names. BTF_SHOW_PTR_RAW - do not obfuscate pointer values. BTF_SHOW_UNSAFE - do not copy data to safe buffer before display. BTF_SHOW_ZERO - show zeroed values (by default they are not shown). Signed-off-by: Alan Maguire Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1601292670-1616-3-git-send-email-alan.maguire@oracle.com --- include/linux/btf.h | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'include') diff --git a/include/linux/btf.h b/include/linux/btf.h index a9af5e7a7ece..d0f5d3c9ec3d 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -13,6 +13,7 @@ struct btf; struct btf_member; struct btf_type; union bpf_attr; +struct btf_show; extern const struct file_operations btf_fops; @@ -46,8 +47,43 @@ int btf_get_info_by_fd(const struct btf *btf, const struct btf_type *btf_type_id_size(const struct btf *btf, u32 *type_id, u32 *ret_size); + +/* + * Options to control show behaviour. + * - BTF_SHOW_COMPACT: no formatting around type information + * - BTF_SHOW_NONAME: no struct/union member names/types + * - BTF_SHOW_PTR_RAW: show raw (unobfuscated) pointer values; + * equivalent to %px. + * - BTF_SHOW_ZERO: show zero-valued struct/union members; they + * are not displayed by default + * - BTF_SHOW_UNSAFE: skip use of bpf_probe_read() to safely read + * data before displaying it. + */ +#define BTF_SHOW_COMPACT (1ULL << 0) +#define BTF_SHOW_NONAME (1ULL << 1) +#define BTF_SHOW_PTR_RAW (1ULL << 2) +#define BTF_SHOW_ZERO (1ULL << 3) +#define BTF_SHOW_UNSAFE (1ULL << 4) + void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, struct seq_file *m); + +/* + * Copy len bytes of string representation of obj of BTF type_id into buf. + * + * @btf: struct btf object + * @type_id: type id of type obj points to + * @obj: pointer to typed data + * @buf: buffer to write to + * @len: maximum length to write to buf + * @flags: show options (see above) + * + * Return: length that would have been/was copied as per snprintf, or + * negative error. + */ +int btf_type_snprintf_show(const struct btf *btf, u32 type_id, void *obj, + char *buf, int len, u64 flags); + int btf_get_fd_by_id(u32 id); u32 btf_id(const struct btf *btf); bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, -- cgit v1.2.3 From c4d0bfb45068d853a478b9067a95969b1886a30f Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Mon, 28 Sep 2020 12:31:05 +0100 Subject: bpf: Add bpf_snprintf_btf helper A helper is added to support tracing kernel type information in BPF using the BPF Type Format (BTF). Its signature is long bpf_snprintf_btf(char *str, u32 str_size, struct btf_ptr *ptr, u32 btf_ptr_size, u64 flags); struct btf_ptr * specifies - a pointer to the data to be traced - the BTF id of the type of data pointed to - a flags field is provided for future use; these flags are not to be confused with the BTF_F_* flags below that control how the btf_ptr is displayed; the flags member of the struct btf_ptr may be used to disambiguate types in kernel versus module BTF, etc; the main distinction is the flags relate to the type and information needed in identifying it; not how it is displayed. For example a BPF program with a struct sk_buff *skb could do the following: static struct btf_ptr b = { }; b.ptr = skb; b.type_id = __builtin_btf_type_id(struct sk_buff, 1); bpf_snprintf_btf(str, sizeof(str), &b, sizeof(b), 0, 0); Default output looks like this: (struct sk_buff){ .transport_header = (__u16)65535, .mac_header = (__u16)65535, .end = (sk_buff_data_t)192, .head = (unsigned char *)0x000000007524fd8b, .data = (unsigned char *)0x000000007524fd8b, .truesize = (unsigned int)768, .users = (refcount_t){ .refs = (atomic_t){ .counter = (int)1, }, }, } Flags modifying display are as follows: - BTF_F_COMPACT: no formatting around type information - BTF_F_NONAME: no struct/union member names/types - BTF_F_PTR_RAW: show raw (unobfuscated) pointer values; equivalent to %px. - BTF_F_ZERO: show zero-valued struct/union members; they are not displayed by default Signed-off-by: Alan Maguire Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1601292670-1616-4-git-send-email-alan.maguire@oracle.com --- include/linux/bpf.h | 1 + include/linux/btf.h | 9 ++++--- include/uapi/linux/bpf.h | 67 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 73 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e620a4b1290f..768b533ba48e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1822,6 +1822,7 @@ extern const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto; extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto; extern const struct bpf_func_proto bpf_copy_from_user_proto; +extern const struct bpf_func_proto bpf_snprintf_btf_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/linux/btf.h b/include/linux/btf.h index d0f5d3c9ec3d..3e5cdc2ba963 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -6,6 +6,7 @@ #include #include +#include #define BTF_TYPE_EMIT(type) ((void)(type *)0) @@ -59,10 +60,10 @@ const struct btf_type *btf_type_id_size(const struct btf *btf, * - BTF_SHOW_UNSAFE: skip use of bpf_probe_read() to safely read * data before displaying it. */ -#define BTF_SHOW_COMPACT (1ULL << 0) -#define BTF_SHOW_NONAME (1ULL << 1) -#define BTF_SHOW_PTR_RAW (1ULL << 2) -#define BTF_SHOW_ZERO (1ULL << 3) +#define BTF_SHOW_COMPACT BTF_F_COMPACT +#define BTF_SHOW_NONAME BTF_F_NONAME +#define BTF_SHOW_PTR_RAW BTF_F_PTR_RAW +#define BTF_SHOW_ZERO BTF_F_ZERO #define BTF_SHOW_UNSAFE (1ULL << 4) void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 82522f05c021..cca9eb1b13e5 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3594,6 +3594,42 @@ union bpf_attr { * the data in *dst*. This is a wrapper of **copy_from_user**\ (). * Return * 0 on success, or a negative error in case of failure. + * + * long bpf_snprintf_btf(char *str, u32 str_size, struct btf_ptr *ptr, u32 btf_ptr_size, u64 flags) + * Description + * Use BTF to store a string representation of *ptr*->ptr in *str*, + * using *ptr*->type_id. This value should specify the type + * that *ptr*->ptr points to. LLVM __builtin_btf_type_id(type, 1) + * can be used to look up vmlinux BTF type ids. Traversing the + * data structure using BTF, the type information and values are + * stored in the first *str_size* - 1 bytes of *str*. Safe copy of + * the pointer data is carried out to avoid kernel crashes during + * operation. Smaller types can use string space on the stack; + * larger programs can use map data to store the string + * representation. + * + * The string can be subsequently shared with userspace via + * bpf_perf_event_output() or ring buffer interfaces. + * bpf_trace_printk() is to be avoided as it places too small + * a limit on string size to be useful. + * + * *flags* is a combination of + * + * **BTF_F_COMPACT** + * no formatting around type information + * **BTF_F_NONAME** + * no struct/union member names/types + * **BTF_F_PTR_RAW** + * show raw (unobfuscated) pointer values; + * equivalent to printk specifier %px. + * **BTF_F_ZERO** + * show zero-valued struct/union members; they + * are not displayed by default + * + * Return + * The number of bytes that were written (or would have been + * written if output had to be truncated due to string size), + * or a negative error in cases of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3745,6 +3781,7 @@ union bpf_attr { FN(inode_storage_delete), \ FN(d_path), \ FN(copy_from_user), \ + FN(snprintf_btf), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -4853,4 +4890,34 @@ struct bpf_sk_lookup { __u32 local_port; /* Host byte order */ }; +/* + * struct btf_ptr is used for typed pointer representation; the + * type id is used to render the pointer data as the appropriate type + * via the bpf_snprintf_btf() helper described above. A flags field - + * potentially to specify additional details about the BTF pointer + * (rather than its mode of display) - is included for future use. + * Display flags - BTF_F_* - are passed to bpf_snprintf_btf separately. + */ +struct btf_ptr { + void *ptr; + __u32 type_id; + __u32 flags; /* BTF ptr flags; unused at present. */ +}; + +/* + * Flags to control bpf_snprintf_btf() behaviour. + * - BTF_F_COMPACT: no formatting around type information + * - BTF_F_NONAME: no struct/union member names/types + * - BTF_F_PTR_RAW: show raw (unobfuscated) pointer values; + * equivalent to %px. + * - BTF_F_ZERO: show zero-valued struct/union members; they + * are not displayed by default + */ +enum { + BTF_F_COMPACT = (1ULL << 0), + BTF_F_NONAME = (1ULL << 1), + BTF_F_PTR_RAW = (1ULL << 2), + BTF_F_ZERO = (1ULL << 3), +}; + #endif /* _UAPI__LINUX_BPF_H__ */ -- cgit v1.2.3 From eb411377aed9e27835e77ee0710ee8f4649958f3 Mon Sep 17 00:00:00 2001 From: Alan Maguire Date: Mon, 28 Sep 2020 12:31:09 +0100 Subject: bpf: Add bpf_seq_printf_btf helper A helper is added to allow seq file writing of kernel data structures using vmlinux BTF. Its signature is long bpf_seq_printf_btf(struct seq_file *m, struct btf_ptr *ptr, u32 btf_ptr_size, u64 flags); Flags and struct btf_ptr definitions/use are identical to the bpf_snprintf_btf helper, and the helper returns 0 on success or a negative error value. Suggested-by: Alexei Starovoitov Signed-off-by: Alan Maguire Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/1601292670-1616-8-git-send-email-alan.maguire@oracle.com --- include/linux/btf.h | 2 ++ include/uapi/linux/bpf.h | 9 +++++++++ 2 files changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/btf.h b/include/linux/btf.h index 3e5cdc2ba963..024e16ff7dcc 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -68,6 +68,8 @@ const struct btf_type *btf_type_id_size(const struct btf *btf, void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, struct seq_file *m); +int btf_type_seq_show_flags(const struct btf *btf, u32 type_id, void *obj, + struct seq_file *m, u64 flags); /* * Copy len bytes of string representation of obj of BTF type_id into buf. diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index cca9eb1b13e5..96ddb00b91dc 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3630,6 +3630,14 @@ union bpf_attr { * The number of bytes that were written (or would have been * written if output had to be truncated due to string size), * or a negative error in cases of failure. + * + * long bpf_seq_printf_btf(struct seq_file *m, struct btf_ptr *ptr, u32 ptr_size, u64 flags) + * Description + * Use BTF to write to seq_write a string representation of + * *ptr*->ptr, using *ptr*->type_id as per bpf_snprintf_btf(). + * *flags* are identical to those used for bpf_snprintf_btf. + * Return + * 0 on success or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3782,6 +3790,7 @@ union bpf_attr { FN(d_path), \ FN(copy_from_user), \ FN(snprintf_btf), \ + FN(seq_printf_btf), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From b4c5f83ae3f3e2b3239751c304e424eace62448b Mon Sep 17 00:00:00 2001 From: Rusaimi Amira Ruslan Date: Mon, 28 Sep 2020 18:12:12 +0800 Subject: stmmac: intel: Adding ref clock 1us tic for LPI cntr Adding reference clock (1us tic) for all LPI timer on Intel platforms. The reference clock is derived from ptp clk. This also enables all LPI counter. Signed-off-by: Rusaimi Amira Ruslan Signed-off-by: Voon Weifeng Signed-off-by: David S. Miller --- include/linux/stmmac.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 00e83c877496..628e28903b8b 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -200,5 +200,6 @@ struct plat_stmmacenet_data { int has_xgmac; bool vlan_fail_q_en; u8 vlan_fail_q; + unsigned int eee_usecs_rate; }; #endif -- cgit v1.2.3 From 3aac1ead5eb6b76f3d2b8d111e6de1c2de23fb34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Tue, 29 Sep 2020 14:45:50 +0200 Subject: bpf: Move prog->aux->linked_prog and trampoline into bpf_link on attach MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In preparation for allowing multiple attachments of freplace programs, move the references to the target program and trampoline into the bpf_tracing_link structure when that is created. To do this atomically, introduce a new mutex in prog->aux to protect writing to the two pointers to target prog and trampoline, and rename the members to make it clear that they are related. With this change, it is no longer possible to attach the same tracing program multiple times (detaching in-between), since the reference from the tracing program to the target disappears on the first attach. However, since the next patch will let the caller supply an attach target, that will also make it possible to attach to the same place multiple times. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/160138355059.48470.2503076992210324984.stgit@toke.dk --- include/linux/bpf.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 768b533ba48e..839dd8670a7a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -640,8 +640,8 @@ static __always_inline unsigned int bpf_dispatcher_nop_func( return bpf_func(ctx, insnsi); } #ifdef CONFIG_BPF_JIT -int bpf_trampoline_link_prog(struct bpf_prog *prog); -int bpf_trampoline_unlink_prog(struct bpf_prog *prog); +int bpf_trampoline_link_prog(struct bpf_prog *prog, struct bpf_trampoline *tr); +int bpf_trampoline_unlink_prog(struct bpf_prog *prog, struct bpf_trampoline *tr); struct bpf_trampoline *bpf_trampoline_get(u64 key, struct bpf_attach_target_info *tgt_info); void bpf_trampoline_put(struct bpf_trampoline *tr); @@ -688,11 +688,13 @@ void bpf_image_ksym_del(struct bpf_ksym *ksym); void bpf_ksym_add(struct bpf_ksym *ksym); void bpf_ksym_del(struct bpf_ksym *ksym); #else -static inline int bpf_trampoline_link_prog(struct bpf_prog *prog) +static inline int bpf_trampoline_link_prog(struct bpf_prog *prog, + struct bpf_trampoline *tr) { return -ENOTSUPP; } -static inline int bpf_trampoline_unlink_prog(struct bpf_prog *prog) +static inline int bpf_trampoline_unlink_prog(struct bpf_prog *prog, + struct bpf_trampoline *tr) { return -ENOTSUPP; } @@ -763,7 +765,9 @@ struct bpf_prog_aux { u32 max_rdonly_access; u32 max_rdwr_access; const struct bpf_ctx_arg_aux *ctx_arg_info; - struct bpf_prog *linked_prog; + struct mutex dst_mutex; /* protects dst_* pointers below, *after* prog becomes visible */ + struct bpf_prog *dst_prog; + struct bpf_trampoline *dst_trampoline; bool verifier_zext; /* Zero extensions has been inserted by verifier. */ bool offload_requested; bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */ @@ -771,7 +775,6 @@ struct bpf_prog_aux { bool sleepable; bool tail_call_reachable; enum bpf_tramp_prog_type trampoline_prog_type; - struct bpf_trampoline *trampoline; struct hlist_node tramp_hlist; /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */ const struct btf_type *attach_func_proto; -- cgit v1.2.3 From 4a1e7c0c63e02daad751842b7880f9bbcdfb6e89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Tue, 29 Sep 2020 14:45:51 +0200 Subject: bpf: Support attaching freplace programs to multiple attach points MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This enables support for attaching freplace programs to multiple attach points. It does this by amending the UAPI for bpf_link_Create with a target btf ID that can be used to supply the new attachment point along with the target program fd. The target must be compatible with the target that was supplied at program load time. The implementation reuses the checks that were factored out of check_attach_btf_id() to ensure compatibility between the BTF types of the old and new attachment. If these match, a new bpf_tracing_link will be created for the new attach target, allowing multiple attachments to co-exist simultaneously. The code could theoretically support multiple-attach of other types of tracing programs as well, but since I don't have a use case for any of those, there is no API support for doing so. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/160138355169.48470.17165680973640685368.stgit@toke.dk --- include/linux/bpf.h | 2 ++ include/uapi/linux/bpf.h | 9 +++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 839dd8670a7a..50e5c4b52bd1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -768,6 +768,8 @@ struct bpf_prog_aux { struct mutex dst_mutex; /* protects dst_* pointers below, *after* prog becomes visible */ struct bpf_prog *dst_prog; struct bpf_trampoline *dst_trampoline; + enum bpf_prog_type saved_dst_prog_type; + enum bpf_attach_type saved_dst_attach_type; bool verifier_zext; /* Zero extensions has been inserted by verifier. */ bool offload_requested; bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 96ddb00b91dc..2b1d3f16cbd1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -639,8 +639,13 @@ union bpf_attr { }; __u32 attach_type; /* attach type */ __u32 flags; /* extra flags */ - __aligned_u64 iter_info; /* extra bpf_iter_link_info */ - __u32 iter_info_len; /* iter_info length */ + union { + __u32 target_btf_id; /* btf_id of target to attach to */ + struct { + __aligned_u64 iter_info; /* extra bpf_iter_link_info */ + __u32 iter_info_len; /* iter_info length */ + }; + }; } link_create; struct { /* struct used by BPF_LINK_UPDATE command */ -- cgit v1.2.3 From 3f47cb4c1cf3bceb2438ea962bfffc6665ee4a9f Mon Sep 17 00:00:00 2001 From: Tom Parkin Date: Tue, 29 Sep 2020 13:35:41 +0100 Subject: l2tp: report rx cookie discards in netlink get When an L2TPv3 session receives a data frame with an incorrect cookie l2tp_core logs a warning message and bumps a stats counter to reflect the fact that the packet has been dropped. However, the stats counter in question is missing from the l2tp_netlink get message for tunnel and session instances. Include the statistic in the netlink get response. Signed-off-by: Tom Parkin Signed-off-by: David S. Miller --- include/uapi/linux/l2tp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/l2tp.h b/include/uapi/linux/l2tp.h index 88a0d32b8c07..30c80d5ba4bf 100644 --- a/include/uapi/linux/l2tp.h +++ b/include/uapi/linux/l2tp.h @@ -144,6 +144,7 @@ enum { L2TP_ATTR_RX_OOS_PACKETS, /* u64 */ L2TP_ATTR_RX_ERRORS, /* u64 */ L2TP_ATTR_STATS_PAD, + L2TP_ATTR_RX_COOKIE_DISCARDS, /* u64 */ __L2TP_ATTR_STATS_MAX, }; -- cgit v1.2.3 From 2ec13cbcfadbbeac499f3b63de0f7db490d45a7e Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Tue, 29 Sep 2020 11:08:59 -0700 Subject: devlink: include for _BITUL Commit 5d5b4128c4ca ("devlink: introduce flash update overwrite mask") added a usage of _BITUL to the UAPI header, but failed to include the header file where it was defined. It happens that this does not break any existing kernel include chains because it gets included through other sources. However, when including the UAPI headers in a userspace application (such as devlink in iproute2), _BITUL is not defined. Fixes: 5d5b4128c4ca ("devlink: introduce flash update overwrite mask") Signed-off-by: Jacob Keller Signed-off-by: David S. Miller --- include/uapi/linux/devlink.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 7b0face1bad5..ba467dc07852 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -13,6 +13,8 @@ #ifndef _UAPI_LINUX_DEVLINK_H_ #define _UAPI_LINUX_DEVLINK_H_ +#include + #define DEVLINK_GENL_NAME "devlink" #define DEVLINK_GENL_VERSION 0x1 #define DEVLINK_GENL_MCGRP_CONFIG_NAME "config" -- cgit v1.2.3 From f2bf88c4afc8c5ab92b40af24819933e57d0968c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 29 Sep 2020 22:25:11 +0200 Subject: net: caif: Remove unused caif SPI driver While chasing in_interrupt() (ab)use in drivers it turned out that the caif_spi driver has never been in use since the driver was merged 10 years ago. There never was any matching code which provides a platform device. The driver has not seen any update (asided of treewide changes and cleanups) since 8 years and the maintainers vanished from the planet. So analysing the potential contexts and the (in)correctness of in_interrupt() usage is just a pointless exercise. Remove the cruft. Signed-off-by: Thomas Gleixner Signed-off-by: David S. Miller --- include/net/caif/caif_spi.h | 155 -------------------------------------------- 1 file changed, 155 deletions(-) delete mode 100644 include/net/caif/caif_spi.h (limited to 'include') diff --git a/include/net/caif/caif_spi.h b/include/net/caif/caif_spi.h deleted file mode 100644 index a0bf4cbce71b..000000000000 --- a/include/net/caif/caif_spi.h +++ /dev/null @@ -1,155 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) ST-Ericsson AB 2010 - * Author: Daniel Martensson / Daniel.Martensson@stericsson.com - */ - -#ifndef CAIF_SPI_H_ -#define CAIF_SPI_H_ - -#include - -#define SPI_CMD_WR 0x00 -#define SPI_CMD_RD 0x01 -#define SPI_CMD_EOT 0x02 -#define SPI_CMD_IND 0x04 - -#define SPI_DMA_BUF_LEN 8192 - -#define WL_SZ 2 /* 16 bits. */ -#define SPI_CMD_SZ 4 /* 32 bits. */ -#define SPI_IND_SZ 4 /* 32 bits. */ - -#define SPI_XFER 0 -#define SPI_SS_ON 1 -#define SPI_SS_OFF 2 -#define SPI_TERMINATE 3 - -/* Minimum time between different levels is 50 microseconds. */ -#define MIN_TRANSITION_TIME_USEC 50 - -/* Defines for calculating duration of SPI transfers for a particular - * number of bytes. - */ -#define SPI_MASTER_CLK_MHZ 13 -#define SPI_XFER_TIME_USEC(bytes, clk) (((bytes) * 8) / clk) - -/* Normally this should be aligned on the modem in order to benefit from full - * duplex transfers. However a size of 8188 provokes errors when running with - * the modem. These errors occur when packet sizes approaches 4 kB of data. - */ -#define CAIF_MAX_SPI_FRAME 4092 - -/* Maximum number of uplink CAIF frames that can reside in the same SPI frame. - * This number should correspond with the modem setting. The application side - * CAIF accepts any number of embedded downlink CAIF frames. - */ -#define CAIF_MAX_SPI_PKTS 9 - -/* Decides if SPI buffers should be prefilled with 0xFF pattern for easier - * debugging. Both TX and RX buffers will be filled before the transfer. - */ -#define CFSPI_DBG_PREFILL 0 - -/* Structure describing a SPI transfer. */ -struct cfspi_xfer { - u16 tx_dma_len; - u16 rx_dma_len; - void *va_tx[2]; - dma_addr_t pa_tx[2]; - void *va_rx; - dma_addr_t pa_rx; -}; - -/* Structure implemented by the SPI interface. */ -struct cfspi_ifc { - void (*ss_cb) (bool assert, struct cfspi_ifc *ifc); - void (*xfer_done_cb) (struct cfspi_ifc *ifc); - void *priv; -}; - -/* Structure implemented by SPI clients. */ -struct cfspi_dev { - int (*init_xfer) (struct cfspi_xfer *xfer, struct cfspi_dev *dev); - void (*sig_xfer) (bool xfer, struct cfspi_dev *dev); - struct cfspi_ifc *ifc; - char *name; - u32 clk_mhz; - void *priv; -}; - -/* Enumeration describing the CAIF SPI state. */ -enum cfspi_state { - CFSPI_STATE_WAITING = 0, - CFSPI_STATE_AWAKE, - CFSPI_STATE_FETCH_PKT, - CFSPI_STATE_GET_NEXT, - CFSPI_STATE_INIT_XFER, - CFSPI_STATE_WAIT_ACTIVE, - CFSPI_STATE_SIG_ACTIVE, - CFSPI_STATE_WAIT_XFER_DONE, - CFSPI_STATE_XFER_DONE, - CFSPI_STATE_WAIT_INACTIVE, - CFSPI_STATE_SIG_INACTIVE, - CFSPI_STATE_DELIVER_PKT, - CFSPI_STATE_MAX, -}; - -/* Structure implemented by SPI physical interfaces. */ -struct cfspi { - struct caif_dev_common cfdev; - struct net_device *ndev; - struct platform_device *pdev; - struct sk_buff_head qhead; - struct sk_buff_head chead; - u16 cmd; - u16 tx_cpck_len; - u16 tx_npck_len; - u16 rx_cpck_len; - u16 rx_npck_len; - struct cfspi_ifc ifc; - struct cfspi_xfer xfer; - struct cfspi_dev *dev; - unsigned long state; - struct work_struct work; - struct workqueue_struct *wq; - struct list_head list; - int flow_off_sent; - u32 qd_low_mark; - u32 qd_high_mark; - struct completion comp; - wait_queue_head_t wait; - spinlock_t lock; - bool flow_stop; - bool slave; - bool slave_talked; -#ifdef CONFIG_DEBUG_FS - enum cfspi_state dbg_state; - u16 pcmd; - u16 tx_ppck_len; - u16 rx_ppck_len; - struct dentry *dbgfs_dir; - struct dentry *dbgfs_state; - struct dentry *dbgfs_frame; -#endif /* CONFIG_DEBUG_FS */ -}; - -extern int spi_frm_align; -extern int spi_up_head_align; -extern int spi_up_tail_align; -extern int spi_down_head_align; -extern int spi_down_tail_align; -extern struct platform_driver cfspi_spi_driver; - -void cfspi_dbg_state(struct cfspi *cfspi, int state); -int cfspi_xmitfrm(struct cfspi *cfspi, u8 *buf, size_t len); -int cfspi_xmitlen(struct cfspi *cfspi); -int cfspi_rxfrm(struct cfspi *cfspi, u8 *buf, size_t len); -int cfspi_spi_remove(struct platform_device *pdev); -int cfspi_spi_probe(struct platform_device *pdev); -int cfspi_xmitfrm(struct cfspi *cfspi, u8 *buf, size_t len); -int cfspi_xmitlen(struct cfspi *cfspi); -int cfspi_rxfrm(struct cfspi *cfspi, u8 *buf, size_t len); -void cfspi_xfer(struct work_struct *work); - -#endif /* CAIF_SPI_H_ */ -- cgit v1.2.3 From c11171a413385c1a72291cdb4a7435cb189762ab Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 29 Sep 2020 22:25:12 +0200 Subject: net: Add netif_rx_any_context() Quite some drivers make conditional decisions based on in_interrupt() to invoke either netif_rx() or netif_rx_ni(). Conditionals based on in_interrupt() or other variants of preempt count checks in drivers should not exist for various reasons and Linus clearly requested to either split the code pathes or pass an argument to the common functions which provides the context. This is obviously the correct solution, but for some of the affected drivers this needs a major rewrite due to their convoluted structure. As in_interrupt() usage in drivers needs to be phased out, provide netif_rx_any_context() as a stop gap for these drivers. This confines the in_interrupt() conditional to core code which in turn allows to remove the access to this check for driver code and provides one central place to do further modifications once the driver maze is cleaned up. Suggested-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a431c3229cbf..28cfa53daf72 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3785,6 +3785,7 @@ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog); int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb); int netif_rx(struct sk_buff *skb); int netif_rx_ni(struct sk_buff *skb); +int netif_rx_any_context(struct sk_buff *skb); int netif_receive_skb(struct sk_buff *skb); int netif_receive_skb_core(struct sk_buff *skb); void netif_receive_skb_list(struct list_head *head); -- cgit v1.2.3 From 3c0e37a9e4021ccbf855dfcbc5aff1ca10487cd4 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 30 Sep 2020 01:27:21 +0300 Subject: net: mscc: ocelot: introduce a new ocelot_target_{read,write} API There are some targets (register blocks) in the Ocelot switch that are instantiated more than once. For example, the VCAP IS1, IS2 and ES0 blocks all share the same register layout for interacting with the cache for the TCAM and the action RAM. For the VCAPs, the procedure for servicing them is actually common. We just need an API specifying which VCAP we are talking to, and we do that via these raw ocelot_target_read and ocelot_target_write accessors. In plain ocelot_read, the target is encoded into the register enum itself: u16 target = reg >> TARGET_OFFSET; For the VCAPs, the registers are currently defined like this: enum ocelot_reg { [...] S2_CORE_UPDATE_CTRL = S2 << TARGET_OFFSET, S2_CORE_MV_CFG, S2_CACHE_ENTRY_DAT, S2_CACHE_MASK_DAT, S2_CACHE_ACTION_DAT, S2_CACHE_CNT_DAT, S2_CACHE_TG_DAT, [...] }; which is precisely what we want to avoid, because we'd have to duplicate the same register map for S1 and for S0, and then figure out how to pass VCAP instance-specific registers to the ocelot_read calls (basically another lookup table that undoes the effect of shifting with TARGET_OFFSET). So for some targets, propose a more raw API, similar to what is currently done with ocelot_port_readl and ocelot_port_writel. Those targets can only be accessed with ocelot_target_{read,write} and not with ocelot_{read,write} after the conversion, which is fine. The VCAP registers are not actually modified to use this new API as of this patch. They will be modified in the next one. Signed-off-by: Vladimir Oltean Acked-by: Alexandre Belloni Signed-off-by: David S. Miller --- include/soc/mscc/ocelot.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include') diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 3093385f6147..d459f4f25dc8 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -661,6 +661,24 @@ struct ocelot_policer { #define ocelot_fields_write(ocelot, id, reg, val) regmap_fields_write((ocelot)->regfields[(reg)], (id), (val)) #define ocelot_fields_read(ocelot, id, reg, val) regmap_fields_read((ocelot)->regfields[(reg)], (id), (val)) +#define ocelot_target_read_ix(ocelot, target, reg, gi, ri) \ + __ocelot_target_read_ix(ocelot, target, reg, reg##_GSZ * (gi) + reg##_RSZ * (ri)) +#define ocelot_target_read_gix(ocelot, target, reg, gi) \ + __ocelot_target_read_ix(ocelot, target, reg, reg##_GSZ * (gi)) +#define ocelot_target_read_rix(ocelot, target, reg, ri) \ + __ocelot_target_read_ix(ocelot, target, reg, reg##_RSZ * (ri)) +#define ocelot_target_read(ocelot, target, reg) \ + __ocelot_target_read_ix(ocelot, target, reg, 0) + +#define ocelot_target_write_ix(ocelot, target, val, reg, gi, ri) \ + __ocelot_target_write_ix(ocelot, target, val, reg, reg##_GSZ * (gi) + reg##_RSZ * (ri)) +#define ocelot_target_write_gix(ocelot, target, val, reg, gi) \ + __ocelot_target_write_ix(ocelot, target, val, reg, reg##_GSZ * (gi)) +#define ocelot_target_write_rix(ocelot, target, val, reg, ri) \ + __ocelot_target_write_ix(ocelot, target, val, reg, reg##_RSZ * (ri)) +#define ocelot_target_write(ocelot, target, val, reg) \ + __ocelot_target_write_ix(ocelot, target, val, reg, 0) + /* I/O */ u32 ocelot_port_readl(struct ocelot_port *port, u32 reg); void ocelot_port_writel(struct ocelot_port *port, u32 val, u32 reg); @@ -668,6 +686,10 @@ u32 __ocelot_read_ix(struct ocelot *ocelot, u32 reg, u32 offset); void __ocelot_write_ix(struct ocelot *ocelot, u32 val, u32 reg, u32 offset); void __ocelot_rmw_ix(struct ocelot *ocelot, u32 val, u32 mask, u32 reg, u32 offset); +u32 __ocelot_target_read_ix(struct ocelot *ocelot, enum ocelot_target target, + u32 reg, u32 offset); +void __ocelot_target_write_ix(struct ocelot *ocelot, enum ocelot_target target, + u32 val, u32 reg, u32 offset); /* Hardware initialization */ int ocelot_regfields_init(struct ocelot *ocelot, -- cgit v1.2.3 From c1c3993edb7c8cfbe6b3991b4b4c9f673268770e Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 30 Sep 2020 01:27:23 +0300 Subject: net: mscc: ocelot: generalize existing code for VCAP In the Ocelot switches there are 3 TCAMs: VCAP ES0, IS1 and IS2, which have the same configuration interface, but different sets of keys and actions. The driver currently only supports VCAP IS2. In preparation of VCAP IS1 and ES0 support, the existing code must be generalized to work with any VCAP. In that direction, we should move the structures that depend upon VCAP instantiation, like vcap_is2_keys and vcap_is2_actions, out of struct ocelot and into struct vcap_props .keys and .actions, a structure that is replicated 3 times, once per VCAP. We'll pass that structure as an argument to each function that does the key and action packing - only the control logic needs to distinguish between ocelot->vcap[VCAP_IS2] or IS1 or ES0. Another change is to make use of the newly introduced ocelot_target_read and ocelot_target_write API, since the 3 VCAPs have the same registers but put at different addresses. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/soc/mscc/ocelot.h | 22 ++++++++------- include/soc/mscc/ocelot_vcap.h | 62 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index d459f4f25dc8..728b040e4e3e 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -393,13 +393,6 @@ enum ocelot_reg { SYS_CM_DATA_RD, SYS_CM_OP, SYS_CM_DATA, - S2_CORE_UPDATE_CTRL = S2 << TARGET_OFFSET, - S2_CORE_MV_CFG, - S2_CACHE_ENTRY_DAT, - S2_CACHE_MASK_DAT, - S2_CACHE_ACTION_DAT, - S2_CACHE_CNT_DAT, - S2_CACHE_TG_DAT, PTP_PIN_CFG = PTP << TARGET_OFFSET, PTP_PIN_TOD_SEC_MSB, PTP_PIN_TOD_SEC_LSB, @@ -518,6 +511,18 @@ enum ocelot_regfield { REGFIELD_MAX }; +enum { + /* VCAP_CORE_CFG */ + VCAP_CORE_UPDATE_CTRL, + VCAP_CORE_MV_CFG, + /* VCAP_CORE_CACHE */ + VCAP_CACHE_ENTRY_DAT, + VCAP_CACHE_MASK_DAT, + VCAP_CACHE_ACTION_DAT, + VCAP_CACHE_CNT_DAT, + VCAP_CACHE_TG_DAT, +}; + enum ocelot_ptp_pins { PTP_PIN_0, PTP_PIN_1, @@ -614,9 +619,6 @@ struct ocelot { struct list_head multicast; struct ocelot_vcap_block block; - - const struct vcap_field *vcap_is2_keys; - const struct vcap_field *vcap_is2_actions; const struct vcap_props *vcap; /* Workqueue to check statistics for overflow with its lock */ diff --git a/include/soc/mscc/ocelot_vcap.h b/include/soc/mscc/ocelot_vcap.h index 5748373ab4d3..05466a1d7bd4 100644 --- a/include/soc/mscc/ocelot_vcap.h +++ b/include/soc/mscc/ocelot_vcap.h @@ -6,6 +6,8 @@ #ifndef _OCELOT_VCAP_H_ #define _OCELOT_VCAP_H_ +#include + /* ================================================================= * VCAP Common * ================================================================= @@ -33,6 +35,11 @@ struct vcap_props { } action_table[2]; u16 counter_words; /* Number of counter words */ u16 counter_width; /* Counter width (in bits) */ + + enum ocelot_target target; + + const struct vcap_field *keys; + const struct vcap_field *actions; }; /* VCAP Type-Group values */ @@ -41,6 +48,61 @@ struct vcap_props { #define VCAP_TG_HALF 2 /* Half entry */ #define VCAP_TG_QUARTER 3 /* Quarter entry */ +#define VCAP_CORE_UPDATE_CTRL_UPDATE_CMD(x) (((x) << 22) & GENMASK(24, 22)) +#define VCAP_CORE_UPDATE_CTRL_UPDATE_CMD_M GENMASK(24, 22) +#define VCAP_CORE_UPDATE_CTRL_UPDATE_CMD_X(x) (((x) & GENMASK(24, 22)) >> 22) +#define VCAP_CORE_UPDATE_CTRL_UPDATE_ENTRY_DIS BIT(21) +#define VCAP_CORE_UPDATE_CTRL_UPDATE_ACTION_DIS BIT(20) +#define VCAP_CORE_UPDATE_CTRL_UPDATE_CNT_DIS BIT(19) +#define VCAP_CORE_UPDATE_CTRL_UPDATE_ADDR(x) (((x) << 3) & GENMASK(18, 3)) +#define VCAP_CORE_UPDATE_CTRL_UPDATE_ADDR_M GENMASK(18, 3) +#define VCAP_CORE_UPDATE_CTRL_UPDATE_ADDR_X(x) (((x) & GENMASK(18, 3)) >> 3) +#define VCAP_CORE_UPDATE_CTRL_UPDATE_SHOT BIT(2) +#define VCAP_CORE_UPDATE_CTRL_CLEAR_CACHE BIT(1) +#define VCAP_CORE_UPDATE_CTRL_MV_TRAFFIC_IGN BIT(0) + +#define VCAP_CORE_MV_CFG_MV_NUM_POS(x) (((x) << 16) & GENMASK(31, 16)) +#define VCAP_CORE_MV_CFG_MV_NUM_POS_M GENMASK(31, 16) +#define VCAP_CORE_MV_CFG_MV_NUM_POS_X(x) (((x) & GENMASK(31, 16)) >> 16) +#define VCAP_CORE_MV_CFG_MV_SIZE(x) ((x) & GENMASK(15, 0)) +#define VCAP_CORE_MV_CFG_MV_SIZE_M GENMASK(15, 0) + +#define VCAP_CACHE_ENTRY_DAT_RSZ 0x4 + +#define VCAP_CACHE_MASK_DAT_RSZ 0x4 + +#define VCAP_CACHE_ACTION_DAT_RSZ 0x4 + +#define VCAP_CACHE_CNT_DAT_RSZ 0x4 + +#define VCAP_STICKY_VCAP_ROW_DELETED_STICKY BIT(0) + +#define TCAM_BIST_CTRL_TCAM_BIST BIT(1) +#define TCAM_BIST_CTRL_TCAM_INIT BIT(0) + +#define TCAM_BIST_CFG_TCAM_BIST_SOE_ENA BIT(8) +#define TCAM_BIST_CFG_TCAM_HCG_DIS BIT(7) +#define TCAM_BIST_CFG_TCAM_CG_DIS BIT(6) +#define TCAM_BIST_CFG_TCAM_BIAS(x) ((x) & GENMASK(5, 0)) +#define TCAM_BIST_CFG_TCAM_BIAS_M GENMASK(5, 0) + +#define TCAM_BIST_STAT_BIST_RT_ERR BIT(15) +#define TCAM_BIST_STAT_BIST_PENC_ERR BIT(14) +#define TCAM_BIST_STAT_BIST_COMP_ERR BIT(13) +#define TCAM_BIST_STAT_BIST_ADDR_ERR BIT(12) +#define TCAM_BIST_STAT_BIST_BL1E_ERR BIT(11) +#define TCAM_BIST_STAT_BIST_BL1_ERR BIT(10) +#define TCAM_BIST_STAT_BIST_BL0E_ERR BIT(9) +#define TCAM_BIST_STAT_BIST_BL0_ERR BIT(8) +#define TCAM_BIST_STAT_BIST_PH1_ERR BIT(7) +#define TCAM_BIST_STAT_BIST_PH0_ERR BIT(6) +#define TCAM_BIST_STAT_BIST_PV1_ERR BIT(5) +#define TCAM_BIST_STAT_BIST_PV0_ERR BIT(4) +#define TCAM_BIST_STAT_BIST_RUN BIT(3) +#define TCAM_BIST_STAT_BIST_ERR BIT(2) +#define TCAM_BIST_STAT_BIST_BUSY BIT(1) +#define TCAM_BIST_STAT_TCAM_RDY BIT(0) + /* ================================================================= * VCAP IS2 * ================================================================= -- cgit v1.2.3 From a61e365d7c183c556717bbf36dcf00c941ec044e Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 30 Sep 2020 01:27:24 +0300 Subject: net: mscc: ocelot: add definitions for VCAP IS1 keys, actions and target As a preparation step for the offloading to IS1, let's create the infrastructure for talking with this hardware block. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/soc/mscc/ocelot.h | 1 + include/soc/mscc/ocelot_vcap.h | 93 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 93 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 728b040e4e3e..d0073c94e22a 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -123,6 +123,7 @@ enum ocelot_target { QSYS, REW, SYS, + S1, S2, HSIO, PTP, diff --git a/include/soc/mscc/ocelot_vcap.h b/include/soc/mscc/ocelot_vcap.h index 05466a1d7bd4..7ac184047292 100644 --- a/include/soc/mscc/ocelot_vcap.h +++ b/include/soc/mscc/ocelot_vcap.h @@ -14,7 +14,7 @@ */ enum { - /* VCAP_IS1, */ + VCAP_IS1, VCAP_IS2, /* VCAP_ES0, */ }; @@ -264,4 +264,95 @@ enum vcap_is2_action_field { VCAP_IS2_ACT_HIT_CNT, }; +/* ================================================================= + * VCAP IS1 + * ================================================================= + */ + +/* IS1 half key types */ +#define IS1_TYPE_S1_NORMAL 0 +#define IS1_TYPE_S1_5TUPLE_IP4 1 + +/* IS1 full key types */ +#define IS1_TYPE_S1_NORMAL_IP6 0 +#define IS1_TYPE_S1_7TUPLE 1 +#define IS2_TYPE_S1_5TUPLE_IP6 2 + +enum { + IS1_ACTION_TYPE_NORMAL, + IS1_ACTION_TYPE_MAX, +}; + +enum vcap_is1_half_key_field { + VCAP_IS1_HK_TYPE, + VCAP_IS1_HK_LOOKUP, + VCAP_IS1_HK_IGR_PORT_MASK, + VCAP_IS1_HK_RSV, + VCAP_IS1_HK_OAM_Y1731, + VCAP_IS1_HK_L2_MC, + VCAP_IS1_HK_L2_BC, + VCAP_IS1_HK_IP_MC, + VCAP_IS1_HK_VLAN_TAGGED, + VCAP_IS1_HK_VLAN_DBL_TAGGED, + VCAP_IS1_HK_TPID, + VCAP_IS1_HK_VID, + VCAP_IS1_HK_DEI, + VCAP_IS1_HK_PCP, + /* Specific Fields for IS1 Half Key S1_NORMAL */ + VCAP_IS1_HK_L2_SMAC, + VCAP_IS1_HK_ETYPE_LEN, + VCAP_IS1_HK_ETYPE, + VCAP_IS1_HK_IP_SNAP, + VCAP_IS1_HK_IP4, + VCAP_IS1_HK_L3_FRAGMENT, + VCAP_IS1_HK_L3_FRAG_OFS_GT0, + VCAP_IS1_HK_L3_OPTIONS, + VCAP_IS1_HK_L3_DSCP, + VCAP_IS1_HK_L3_IP4_SIP, + VCAP_IS1_HK_TCP_UDP, + VCAP_IS1_HK_TCP, + VCAP_IS1_HK_L4_SPORT, + VCAP_IS1_HK_L4_RNG, + /* Specific Fields for IS1 Half Key S1_5TUPLE_IP4 */ + VCAP_IS1_HK_IP4_INNER_TPID, + VCAP_IS1_HK_IP4_INNER_VID, + VCAP_IS1_HK_IP4_INNER_DEI, + VCAP_IS1_HK_IP4_INNER_PCP, + VCAP_IS1_HK_IP4_IP4, + VCAP_IS1_HK_IP4_L3_FRAGMENT, + VCAP_IS1_HK_IP4_L3_FRAG_OFS_GT0, + VCAP_IS1_HK_IP4_L3_OPTIONS, + VCAP_IS1_HK_IP4_L3_DSCP, + VCAP_IS1_HK_IP4_L3_IP4_DIP, + VCAP_IS1_HK_IP4_L3_IP4_SIP, + VCAP_IS1_HK_IP4_L3_PROTO, + VCAP_IS1_HK_IP4_TCP_UDP, + VCAP_IS1_HK_IP4_TCP, + VCAP_IS1_HK_IP4_L4_RNG, + VCAP_IS1_HK_IP4_IP_PAYLOAD_S1_5TUPLE, +}; + +enum vcap_is1_action_field { + VCAP_IS1_ACT_DSCP_ENA, + VCAP_IS1_ACT_DSCP_VAL, + VCAP_IS1_ACT_QOS_ENA, + VCAP_IS1_ACT_QOS_VAL, + VCAP_IS1_ACT_DP_ENA, + VCAP_IS1_ACT_DP_VAL, + VCAP_IS1_ACT_PAG_OVERRIDE_MASK, + VCAP_IS1_ACT_PAG_VAL, + VCAP_IS1_ACT_RSV, + VCAP_IS1_ACT_VID_REPLACE_ENA, + VCAP_IS1_ACT_VID_ADD_VAL, + VCAP_IS1_ACT_FID_SEL, + VCAP_IS1_ACT_FID_VAL, + VCAP_IS1_ACT_PCP_DEI_ENA, + VCAP_IS1_ACT_PCP_VAL, + VCAP_IS1_ACT_DEI_VAL, + VCAP_IS1_ACT_VLAN_POP_CNT_ENA, + VCAP_IS1_ACT_VLAN_POP_CNT, + VCAP_IS1_ACT_CUSTOM_ACE_TYPE_ENA, + VCAP_IS1_ACT_HIT_STICKY, +}; + #endif /* _OCELOT_VCAP_H_ */ -- cgit v1.2.3 From e3aea296d86f0f2166f4ddc5e1325217f847e722 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 30 Sep 2020 01:27:25 +0300 Subject: net: mscc: ocelot: add definitions for VCAP ES0 keys, actions and target As a preparation step for the offloading to ES0, let's create the infrastructure for talking with this hardware block. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/soc/mscc/ocelot.h | 1 + include/soc/mscc/ocelot_vcap.h | 44 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 44 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index d0073c94e22a..b0a9efce8813 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -123,6 +123,7 @@ enum ocelot_target { QSYS, REW, SYS, + S0, S1, S2, HSIO, diff --git a/include/soc/mscc/ocelot_vcap.h b/include/soc/mscc/ocelot_vcap.h index 7ac184047292..707e609ec919 100644 --- a/include/soc/mscc/ocelot_vcap.h +++ b/include/soc/mscc/ocelot_vcap.h @@ -14,9 +14,9 @@ */ enum { + VCAP_ES0, VCAP_IS1, VCAP_IS2, - /* VCAP_ES0, */ }; struct vcap_props { @@ -355,4 +355,46 @@ enum vcap_is1_action_field { VCAP_IS1_ACT_HIT_STICKY, }; +/* ================================================================= + * VCAP ES0 + * ================================================================= + */ + +enum { + ES0_ACTION_TYPE_NORMAL, + ES0_ACTION_TYPE_MAX, +}; + +enum vcap_es0_key_field { + VCAP_ES0_EGR_PORT, + VCAP_ES0_IGR_PORT, + VCAP_ES0_RSV, + VCAP_ES0_L2_MC, + VCAP_ES0_L2_BC, + VCAP_ES0_VID, + VCAP_ES0_DP, + VCAP_ES0_PCP, +}; + +enum vcap_es0_action_field { + VCAP_ES0_ACT_PUSH_OUTER_TAG, + VCAP_ES0_ACT_PUSH_INNER_TAG, + VCAP_ES0_ACT_TAG_A_TPID_SEL, + VCAP_ES0_ACT_TAG_A_VID_SEL, + VCAP_ES0_ACT_TAG_A_PCP_SEL, + VCAP_ES0_ACT_TAG_A_DEI_SEL, + VCAP_ES0_ACT_TAG_B_TPID_SEL, + VCAP_ES0_ACT_TAG_B_VID_SEL, + VCAP_ES0_ACT_TAG_B_PCP_SEL, + VCAP_ES0_ACT_TAG_B_DEI_SEL, + VCAP_ES0_ACT_VID_A_VAL, + VCAP_ES0_ACT_PCP_A_VAL, + VCAP_ES0_ACT_DEI_A_VAL, + VCAP_ES0_ACT_VID_B_VAL, + VCAP_ES0_ACT_PCP_B_VAL, + VCAP_ES0_ACT_DEI_B_VAL, + VCAP_ES0_ACT_RSV, + VCAP_ES0_ACT_HIT_STICKY, +}; + #endif /* _OCELOT_VCAP_H_ */ -- cgit v1.2.3 From 2096805497e2bd21df3c26bd48c43ff0ce954316 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 30 Sep 2020 01:27:26 +0300 Subject: net: mscc: ocelot: automatically detect VCAP constants The numbers in struct vcap_props are not intuitive to derive, because they are not a straightforward copy-and-paste from the reference manual but instead rely on a fairly detailed level of understanding of the layout of an entry in the TCAM and in the action RAM. For this reason, bugs are very easy to introduce here. Ease the work of hardware porters and read from hardware the constants that were exported for this particular purpose. Note that this implies that struct vcap_props can no longer be const. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/soc/mscc/ocelot.h | 13 ++++++++++++- include/soc/mscc/ocelot_vcap.h | 3 +++ 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index b0a9efce8813..0c40122dcb88 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -523,6 +523,17 @@ enum { VCAP_CACHE_ACTION_DAT, VCAP_CACHE_CNT_DAT, VCAP_CACHE_TG_DAT, + /* VCAP_CONST */ + VCAP_CONST_VCAP_VER, + VCAP_CONST_ENTRY_WIDTH, + VCAP_CONST_ENTRY_CNT, + VCAP_CONST_ENTRY_SWCNT, + VCAP_CONST_ENTRY_TG_WIDTH, + VCAP_CONST_ACTION_DEF_CNT, + VCAP_CONST_ACTION_WIDTH, + VCAP_CONST_CNT_WIDTH, + VCAP_CONST_CORE_CNT, + VCAP_CONST_IF_CNT, }; enum ocelot_ptp_pins { @@ -621,7 +632,7 @@ struct ocelot { struct list_head multicast; struct ocelot_vcap_block block; - const struct vcap_props *vcap; + struct vcap_props *vcap; /* Workqueue to check statistics for overflow with its lock */ struct mutex stats_lock; diff --git a/include/soc/mscc/ocelot_vcap.h b/include/soc/mscc/ocelot_vcap.h index 707e609ec919..96300adf3648 100644 --- a/include/soc/mscc/ocelot_vcap.h +++ b/include/soc/mscc/ocelot_vcap.h @@ -17,8 +17,11 @@ enum { VCAP_ES0, VCAP_IS1, VCAP_IS2, + __VCAP_COUNT, }; +#define OCELOT_NUM_VCAP_BLOCKS __VCAP_COUNT + struct vcap_props { u16 tg_width; /* Type-group width (in bits) */ u16 sw_count; /* Sub word count */ -- cgit v1.2.3 From 002f2176532093753cb6ced61e5ea7b8904c6cae Mon Sep 17 00:00:00 2001 From: "Jose M. Guisado Gomez" Date: Mon, 28 Sep 2020 14:27:10 +0200 Subject: netfilter: nf_tables: add userdata attributes to nft_chain Enables storing userdata for nft_chain. Field udata points to user data and udlen stores its length. Adds new attribute flag NFTA_CHAIN_USERDATA. Signed-off-by: Jose M. Guisado Gomez Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 ++ include/uapi/linux/netfilter/nf_tables.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index c4c526507ddb..0bd2a081ae39 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -945,6 +945,8 @@ struct nft_chain { bound:1, genmask:2; char *name; + u16 udlen; + u8 *udata; /* Only used during control plane commit phase: */ struct nft_rule **rules_next; diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 3c2469b43742..352ee51707a1 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -208,6 +208,7 @@ enum nft_chain_flags { * @NFTA_CHAIN_COUNTERS: counter specification of the chain (NLA_NESTED: nft_counter_attributes) * @NFTA_CHAIN_FLAGS: chain flags * @NFTA_CHAIN_ID: uniquely identifies a chain in a transaction (NLA_U32) + * @NFTA_CHAIN_USERDATA: user data (NLA_BINARY) */ enum nft_chain_attributes { NFTA_CHAIN_UNSPEC, @@ -222,6 +223,7 @@ enum nft_chain_attributes { NFTA_CHAIN_PAD, NFTA_CHAIN_FLAGS, NFTA_CHAIN_ID, + NFTA_CHAIN_USERDATA, __NFTA_CHAIN_MAX }; #define NFTA_CHAIN_MAX (__NFTA_CHAIN_MAX - 1) -- cgit v1.2.3 From b426ce83baa7dff947fb354118d3133f2953aac8 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 30 Sep 2020 17:18:15 +0200 Subject: bpf: Add classid helper only based on skb->sk Similarly to 5a52ae4e32a6 ("bpf: Allow to retrieve cgroup v1 classid from v2 hooks"), add a helper to retrieve cgroup v1 classid solely based on the skb->sk, so it can be used as key as part of BPF map lookups out of tc from host ns, in particular given the skb->sk is retained these days when crossing net ns thanks to 9c4c325252c5 ("skbuff: preserve sock reference when scrubbing the skb."). This is similar to bpf_skb_cgroup_id() which implements the same for v2. Kubernetes ecosystem is still operating on v1 however, hence net_cls needs to be used there until this can be dropped in with the v2 helper of bpf_skb_cgroup_id(). Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/ed633cf27a1c620e901c5aa99ebdefb028dce600.1601477936.git.daniel@iogearbox.net --- include/uapi/linux/bpf.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2b1d3f16cbd1..6116a7f54c8f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3643,6 +3643,15 @@ union bpf_attr { * *flags* are identical to those used for bpf_snprintf_btf. * Return * 0 on success or a negative error in case of failure. + * + * u64 bpf_skb_cgroup_classid(struct sk_buff *skb) + * Description + * See **bpf_get_cgroup_classid**\ () for the main description. + * This helper differs from **bpf_get_cgroup_classid**\ () in that + * the cgroup v1 net_cls class is retrieved only from the *skb*'s + * associated socket instead of the current process. + * Return + * The id is returned or 0 in case the id could not be retrieved. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3796,6 +3805,7 @@ union bpf_attr { FN(copy_from_user), \ FN(snprintf_btf), \ FN(seq_printf_btf), \ + FN(skb_cgroup_classid), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 92acdc58ab11af66fcaef485433fde61b5e32fac Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 30 Sep 2020 17:18:16 +0200 Subject: bpf, net: Rework cookie generator as per-cpu one With its use in BPF, the cookie generator can be called very frequently in particular when used out of cgroup v2 hooks (e.g. connect / sendmsg) and attached to the root cgroup, for example, when used in v1/v2 mixed environments. In particular, when there's a high churn on sockets in the system there can be many parallel requests to the bpf_get_socket_cookie() and bpf_get_netns_cookie() helpers which then cause contention on the atomic counter. As similarly done in f991bd2e1421 ("fs: introduce a per-cpu last_ino allocator"), add a small helper library that both can use for the 64 bit counters. Given this can be called from different contexts, we also need to deal with potential nested calls even though in practice they are considered extremely rare. One idea as suggested by Eric Dumazet was to use a reverse counter for this situation since we don't expect 64 bit overflows anyways; that way, we can avoid bigger gaps in the 64 bit counter space compared to just batch-wise increase. Even on machines with small number of cores (e.g. 4) the cookie generation shrinks from min/max/med/avg (ns) of 22/50/40/38.9 down to 10/35/14/17.3 when run in parallel from multiple CPUs. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Reviewed-by: Eric Dumazet Acked-by: Martin KaFai Lau Cc: Eric Dumazet Link: https://lore.kernel.org/bpf/8a80b8d27d3c49f9a14e1d5213c19d8be87d1dc8.1601477936.git.daniel@iogearbox.net --- include/linux/cookie.h | 51 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/sock_diag.h | 14 ++++++++++++- include/net/net_namespace.h | 2 +- 3 files changed, 65 insertions(+), 2 deletions(-) create mode 100644 include/linux/cookie.h (limited to 'include') diff --git a/include/linux/cookie.h b/include/linux/cookie.h new file mode 100644 index 000000000000..0c159f585109 --- /dev/null +++ b/include/linux/cookie.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_COOKIE_H +#define __LINUX_COOKIE_H + +#include +#include +#include + +struct pcpu_gen_cookie { + local_t nesting; + u64 last; +} __aligned(16); + +struct gen_cookie { + struct pcpu_gen_cookie __percpu *local; + atomic64_t forward_last ____cacheline_aligned_in_smp; + atomic64_t reverse_last; +}; + +#define COOKIE_LOCAL_BATCH 4096 + +#define DEFINE_COOKIE(name) \ + static DEFINE_PER_CPU(struct pcpu_gen_cookie, __##name); \ + static struct gen_cookie name = { \ + .local = &__##name, \ + .forward_last = ATOMIC64_INIT(0), \ + .reverse_last = ATOMIC64_INIT(0), \ + } + +static __always_inline u64 gen_cookie_next(struct gen_cookie *gc) +{ + struct pcpu_gen_cookie *local = this_cpu_ptr(gc->local); + u64 val; + + if (likely(local_inc_return(&local->nesting) == 1)) { + val = local->last; + if (__is_defined(CONFIG_SMP) && + unlikely((val & (COOKIE_LOCAL_BATCH - 1)) == 0)) { + s64 next = atomic64_add_return(COOKIE_LOCAL_BATCH, + &gc->forward_last); + val = next - COOKIE_LOCAL_BATCH; + } + local->last = ++val; + } else { + val = atomic64_dec_return(&gc->reverse_last); + } + local_dec(&local->nesting); + return val; +} + +#endif /* __LINUX_COOKIE_H */ diff --git a/include/linux/sock_diag.h b/include/linux/sock_diag.h index 15fe980a27ea..0b9ecd8cf979 100644 --- a/include/linux/sock_diag.h +++ b/include/linux/sock_diag.h @@ -25,7 +25,19 @@ void sock_diag_unregister(const struct sock_diag_handler *h); void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)); void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)); -u64 sock_gen_cookie(struct sock *sk); +u64 __sock_gen_cookie(struct sock *sk); + +static inline u64 sock_gen_cookie(struct sock *sk) +{ + u64 cookie; + + preempt_disable(); + cookie = __sock_gen_cookie(sk); + preempt_enable(); + + return cookie; +} + int sock_diag_check_cookie(struct sock *sk, const __u32 *cookie); void sock_diag_save_cookie(struct sock *sk, __u32 *cookie); diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 2ee5901bec7a..22bc07f4b043 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -230,7 +230,7 @@ extern struct list_head net_namespace_list; struct net *get_net_ns_by_pid(pid_t pid); struct net *get_net_ns_by_fd(int fd); -u64 net_gen_cookie(struct net *net); +u64 __net_gen_cookie(struct net *net); #ifdef CONFIG_SYSCTL void ipx_register_sysctl(void); -- cgit v1.2.3 From b4ab31414970a7a03a5d55d75083f2c101a30592 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 30 Sep 2020 17:18:17 +0200 Subject: bpf: Add redirect_neigh helper as redirect drop-in Add a redirect_neigh() helper as redirect() drop-in replacement for the xmit side. Main idea for the helper is to be very similar in semantics to the latter just that the skb gets injected into the neighboring subsystem in order to let the stack do the work it knows best anyway to populate the L2 addresses of the packet and then hand over to dev_queue_xmit() as redirect() does. This solves two bigger items: i) skbs don't need to go up to the stack on the host facing veth ingress side for traffic egressing the container to achieve the same for populating L2 which also has the huge advantage that ii) the skb->sk won't get orphaned in ip_rcv_core() when entering the IP routing layer on the host stack. Given that skb->sk neither gets orphaned when crossing the netns as per 9c4c325252c5 ("skbuff: preserve sock reference when scrubbing the skb.") the helper can then push the skbs directly to the phys device where FQ scheduler can do its work and TCP stack gets proper backpressure given we hold on to skb->sk as long as skb is still residing in queues. With the helper used in BPF data path to then push the skb to the phys device, I observed a stable/consistent TCP_STREAM improvement on veth devices for traffic going container -> host -> host -> container from ~10Gbps to ~15Gbps for a single stream in my test environment. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Reviewed-by: David Ahern Acked-by: Martin KaFai Lau Cc: David Ahern Link: https://lore.kernel.org/bpf/f207de81629e1724899b73b8112e0013be782d35.1601477936.git.daniel@iogearbox.net --- include/linux/skbuff.h | 5 +++++ include/uapi/linux/bpf.h | 14 ++++++++++++++ 2 files changed, 19 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 04a18e01b362..3d0cf3722bb4 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2548,6 +2548,11 @@ static inline int skb_mac_header_was_set(const struct sk_buff *skb) return skb->mac_header != (typeof(skb->mac_header))~0U; } +static inline void skb_unset_mac_header(struct sk_buff *skb) +{ + skb->mac_header = (typeof(skb->mac_header))~0U; +} + static inline void skb_reset_mac_header(struct sk_buff *skb) { skb->mac_header = skb->data - skb->head; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6116a7f54c8f..1f17c6752deb 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3652,6 +3652,19 @@ union bpf_attr { * associated socket instead of the current process. * Return * The id is returned or 0 in case the id could not be retrieved. + * + * long bpf_redirect_neigh(u32 ifindex, u64 flags) + * Description + * Redirect the packet to another net device of index *ifindex* + * and fill in L2 addresses from neighboring subsystem. This helper + * is somewhat similar to **bpf_redirect**\ (), except that it + * fills in e.g. MAC addresses based on the L3 information from + * the packet. This helper is supported for IPv4 and IPv6 protocols. + * The *flags* argument is reserved and must be 0. The helper is + * currently only supported for tc BPF program types. + * Return + * The helper returns **TC_ACT_REDIRECT** on success or + * **TC_ACT_SHOT** on error. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3806,6 +3819,7 @@ union bpf_attr { FN(snprintf_btf), \ FN(seq_printf_btf), \ FN(skb_cgroup_classid), \ + FN(redirect_neigh), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 20c168be684a97b084525906eb7ed017b7f9c0b8 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Wed, 30 Sep 2020 12:50:59 +0200 Subject: net: macb: move pdata to private header struct macb_platform_data is only used by macb_pci to register the platform device, move its definition to cadence/macb.h and remove platform_data/macb.h Signed-off-by: Alexandre Belloni Signed-off-by: David S. Miller --- include/linux/platform_data/macb.h | 20 -------------------- 1 file changed, 20 deletions(-) delete mode 100644 include/linux/platform_data/macb.h (limited to 'include') diff --git a/include/linux/platform_data/macb.h b/include/linux/platform_data/macb.h deleted file mode 100644 index aa5b5562d6f7..000000000000 --- a/include/linux/platform_data/macb.h +++ /dev/null @@ -1,20 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2004-2006 Atmel Corporation - */ -#ifndef __MACB_PDATA_H__ -#define __MACB_PDATA_H__ - -#include - -/** - * struct macb_platform_data - platform data for MACB Ethernet - * @pclk: platform clock - * @hclk: AHB clock - */ -struct macb_platform_data { - struct clk *pclk; - struct clk *hclk; -}; - -#endif /* __MACB_PDATA_H__ */ -- cgit v1.2.3 From b6b6d6533a14b5ddcf9f9c5239fc3721fc6beda0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 30 Sep 2020 05:54:56 -0700 Subject: inet: remove icsk_ack.blocked TCP has been using it to work around the possibility of tcp_delack_timer() finding the socket owned by user. After commit 6f458dfb4092 ("tcp: improve latencies of timer triggered events") we added TCP_DELACK_TIMER_DEFERRED atomic bit for more immediate recovery, so we can get rid of icsk_ack.blocked This frees space that following patch will reuse. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/net/inet_connection_sock.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index dc763ca9413c..79875f976190 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -110,7 +110,7 @@ struct inet_connection_sock { __u8 pending; /* ACK is pending */ __u8 quick; /* Scheduled number of quick acks */ __u8 pingpong; /* The session is interactive */ - __u8 blocked; /* Delayed ACK was blocked by socket lock */ + /* one byte hole. */ __u32 ato; /* Predicted tick of soft clock */ unsigned long timeout; /* Currently scheduled timeout */ __u32 lrcvtime; /* timestamp of last received data packet */ @@ -198,7 +198,7 @@ static inline void inet_csk_clear_xmit_timer(struct sock *sk, const int what) sk_stop_timer(sk, &icsk->icsk_retransmit_timer); #endif } else if (what == ICSK_TIME_DACK) { - icsk->icsk_ack.blocked = icsk->icsk_ack.pending = 0; + icsk->icsk_ack.pending = 0; #ifdef INET_CSK_CLEAR_TIMERS sk_stop_timer(sk, &icsk->icsk_delack_timer); #endif -- cgit v1.2.3 From a37c2134bed6f28c6d6aefa2699331e6e4e9c4f1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 30 Sep 2020 05:54:57 -0700 Subject: tcp: add exponential backoff in __tcp_send_ack() Whenever host is under very high memory pressure, __tcp_send_ack() skb allocation fails, and we setup a 200 ms (TCP_DELACK_MAX) timer before retrying. On hosts with high number of TCP sockets, we can spend considerable amount of cpu cycles in these attempts, add high pressure on various spinlocks in mm-layer, ultimately blocking threads attempting to free space from making any progress. This patch adds standard exponential backoff to avoid adding fuel to the fire. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/net/inet_connection_sock.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 79875f976190..7338b3865a2a 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -110,7 +110,7 @@ struct inet_connection_sock { __u8 pending; /* ACK is pending */ __u8 quick; /* Scheduled number of quick acks */ __u8 pingpong; /* The session is interactive */ - /* one byte hole. */ + __u8 retry; /* Number of attempts */ __u32 ato; /* Predicted tick of soft clock */ unsigned long timeout; /* Currently scheduled timeout */ __u32 lrcvtime; /* timestamp of last received data packet */ @@ -199,6 +199,7 @@ static inline void inet_csk_clear_xmit_timer(struct sock *sk, const int what) #endif } else if (what == ICSK_TIME_DACK) { icsk->icsk_ack.pending = 0; + icsk->icsk_ack.retry = 0; #ifdef INET_CSK_CLEAR_TIMERS sk_stop_timer(sk, &icsk->icsk_delack_timer); #endif -- cgit v1.2.3 From 5b88823bfe0875b327cc041017b5dcbec9dcbcc8 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 29 Sep 2020 11:15:50 +0300 Subject: devlink: Add a tracepoint for trap reports Add a tracepoint for trap reports so that drop monitor could register its probe on it. Use trace_devlink_trap_report_enabled() to avoid wasting cycles setting the trap metadata if the tracepoint is not enabled. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 14 ++++++++++++++ include/trace/events/devlink.h | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 7339bf9ba6b4..1014294ba6a0 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -624,6 +624,20 @@ struct devlink_health_reporter_ops { struct netlink_ext_ack *extack); }; +/** + * struct devlink_trap_metadata - Packet trap metadata. + * @trap_name: Trap name. + * @trap_group_name: Trap group name. + * @input_dev: Input netdevice. + * @fa_cookie: Flow action user cookie. + */ +struct devlink_trap_metadata { + const char *trap_name; + const char *trap_group_name; + struct net_device *input_dev; + const struct flow_action_cookie *fa_cookie; +}; + /** * struct devlink_trap_policer - Immutable packet trap policer attributes. * @id: Policer identifier. diff --git a/include/trace/events/devlink.h b/include/trace/events/devlink.h index 6f60a78d9a7e..44d8e2981065 100644 --- a/include/trace/events/devlink.h +++ b/include/trace/events/devlink.h @@ -171,6 +171,43 @@ TRACE_EVENT(devlink_health_reporter_state_update, __entry->new_state) ); +/* + * Tracepoint for devlink packet trap: + */ +TRACE_EVENT(devlink_trap_report, + TP_PROTO(const struct devlink *devlink, struct sk_buff *skb, + const struct devlink_trap_metadata *metadata), + + TP_ARGS(devlink, skb, metadata), + + TP_STRUCT__entry( + __string(bus_name, devlink->dev->bus->name) + __string(dev_name, dev_name(devlink->dev)) + __string(driver_name, devlink->dev->driver->name) + __string(trap_name, metadata->trap_name) + __string(trap_group_name, metadata->trap_group_name) + __dynamic_array(char, input_dev_name, IFNAMSIZ) + ), + + TP_fast_assign( + struct net_device *input_dev = metadata->input_dev; + + __assign_str(bus_name, devlink->dev->bus->name); + __assign_str(dev_name, dev_name(devlink->dev)); + __assign_str(driver_name, devlink->dev->driver->name); + __assign_str(trap_name, metadata->trap_name); + __assign_str(trap_group_name, metadata->trap_group_name); + __assign_str(input_dev_name, + (input_dev ? input_dev->name : "NULL")); + ), + + TP_printk("bus_name=%s dev_name=%s driver_name=%s trap_name=%s " + "trap_group_name=%s input_dev_name=%s", __get_str(bus_name), + __get_str(dev_name), __get_str(driver_name), + __get_str(trap_name), __get_str(trap_group_name), + __get_str(input_dev_name)) +); + #endif /* _TRACE_DEVLINK_H */ /* This part must be outside protection */ -- cgit v1.2.3 From 8ee2267ad33e0ba021e9dd9b437f773906cd99d6 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 29 Sep 2020 11:15:52 +0300 Subject: drop_monitor: Convert to using devlink tracepoint Convert drop monitor to use the recently introduced 'devlink_trap_report' tracepoint instead of having devlink call into drop monitor. This is both consistent with software originated drops ('kfree_skb' tracepoint) and also allows drop monitor to be built as a module and still report hardware originated drops. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/drop_monitor.h | 36 ------------------------------------ 1 file changed, 36 deletions(-) delete mode 100644 include/net/drop_monitor.h (limited to 'include') diff --git a/include/net/drop_monitor.h b/include/net/drop_monitor.h deleted file mode 100644 index 3f5b6ddb3179..000000000000 --- a/include/net/drop_monitor.h +++ /dev/null @@ -1,36 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ - -#ifndef _NET_DROP_MONITOR_H_ -#define _NET_DROP_MONITOR_H_ - -#include -#include -#include -#include - -/** - * struct net_dm_hw_metadata - Hardware-supplied packet metadata. - * @trap_group_name: Hardware trap group name. - * @trap_name: Hardware trap name. - * @input_dev: Input netdevice. - * @fa_cookie: Flow action user cookie. - */ -struct net_dm_hw_metadata { - const char *trap_group_name; - const char *trap_name; - struct net_device *input_dev; - const struct flow_action_cookie *fa_cookie; -}; - -#if IS_REACHABLE(CONFIG_NET_DROP_MONITOR) -void net_dm_hw_report(struct sk_buff *skb, - const struct net_dm_hw_metadata *hw_metadata); -#else -static inline void -net_dm_hw_report(struct sk_buff *skb, - const struct net_dm_hw_metadata *hw_metadata) -{ -} -#endif - -#endif /* _NET_DROP_MONITOR_H_ */ -- cgit v1.2.3 From 93e155967ccc053b71d408edf8c0142199df5c8c Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 29 Sep 2020 11:15:55 +0300 Subject: drop_monitor: Filter control packets in drop monitor Previously, devlink called into drop monitor in order to report hardware originated drops / exceptions. devlink intentionally filtered control packets and did not pass them to drop monitor as they were not dropped by the underlying hardware. Now drop monitor registers its probe on a generic 'devlink_trap_report' tracepoint and should therefore perform this filtering itself instead of having devlink do that. Add the trap type as metadata and have drop monitor ignore control packets. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 1014294ba6a0..1c286e9a3590 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -630,12 +630,14 @@ struct devlink_health_reporter_ops { * @trap_group_name: Trap group name. * @input_dev: Input netdevice. * @fa_cookie: Flow action user cookie. + * @trap_type: Trap type. */ struct devlink_trap_metadata { const char *trap_name; const char *trap_group_name; struct net_device *input_dev; const struct flow_action_cookie *fa_cookie; + enum devlink_trap_type trap_type; }; /** -- cgit v1.2.3 From 7cd7becdddb00620fb8deb74e6fe4e5a1522ae5a Mon Sep 17 00:00:00 2001 From: sunils Date: Fri, 11 Sep 2020 00:13:26 +0300 Subject: net/mlx5: E-switch, Use PF num in metadata reg c0 Currently only 256 vports can be supported as only 8 bits are reserved for them and 8 bits are reserved for vhca_ids in metadata reg c0. To support more than 256 vports, replace vhca_id with a unique shorter 4-bit PF number which covers upto 16 PF's. Use remaining 12 bits for vports ranging 1-4095. This will continue to generate unique metadata even if multiple PCI devices have same switch_id. Signed-off-by: sunils Reviewed-by: Parav Pandit Reviewed-by: Vu Pham Signed-off-by: Saeed Mahameed --- include/linux/mlx5/eswitch.h | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h index c16827eeba9c..b0ae8020f13e 100644 --- a/include/linux/mlx5/eswitch.h +++ b/include/linux/mlx5/eswitch.h @@ -74,15 +74,16 @@ bool mlx5_eswitch_reg_c1_loopback_enabled(const struct mlx5_eswitch *esw); bool mlx5_eswitch_vport_match_metadata_enabled(const struct mlx5_eswitch *esw); /* Reg C0 usage: - * Reg C0 = < ESW_VHCA_ID_BITS(8) | ESW_VPORT BITS(8) | ESW_CHAIN_TAG(16) > + * Reg C0 = < ESW_PFNUM_BITS(4) | ESW_VPORT BITS(12) | ESW_CHAIN_TAG(16) > * - * Highest 8 bits of the reg c0 is the vhca_id, next 8 bits is vport_num, - * the rest (lowest 16 bits) is left for tc chain tag restoration. - * VHCA_ID + VPORT comprise the SOURCE_PORT matching. + * Highest 4 bits of the reg c0 is the PF_NUM (range 0-15), 12 bits of + * unique non-zero vport id (range 1-4095). The rest (lowest 16 bits) is left + * for tc chain tag restoration. + * PFNUM + VPORT comprise the SOURCE_PORT matching. */ -#define ESW_VHCA_ID_BITS 8 -#define ESW_VPORT_BITS 8 -#define ESW_SOURCE_PORT_METADATA_BITS (ESW_VHCA_ID_BITS + ESW_VPORT_BITS) +#define ESW_VPORT_BITS 12 +#define ESW_PFNUM_BITS 4 +#define ESW_SOURCE_PORT_METADATA_BITS (ESW_PFNUM_BITS + ESW_VPORT_BITS) #define ESW_SOURCE_PORT_METADATA_OFFSET (32 - ESW_SOURCE_PORT_METADATA_BITS) #define ESW_CHAIN_TAG_METADATA_BITS (32 - ESW_SOURCE_PORT_METADATA_BITS) #define ESW_CHAIN_TAG_METADATA_MASK GENMASK(ESW_CHAIN_TAG_METADATA_BITS - 1,\ -- cgit v1.2.3 From 792caccc4526bb489e054f9ab61d7c024b15dea2 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 30 Sep 2020 15:49:26 -0700 Subject: bpf: Introduce BPF_F_PRESERVE_ELEMS for perf event array Currently, perf event in perf event array is removed from the array when the map fd used to add the event is closed. This behavior makes it difficult to the share perf events with perf event array. Introduce perf event map that keeps the perf event open with a new flag BPF_F_PRESERVE_ELEMS. With this flag set, perf events in the array are not removed when the original map fd is closed. Instead, the perf event will stay in the map until 1) it is explicitly removed from the array; or 2) the array is freed. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200930224927.1936644-2-songliubraving@fb.com --- include/uapi/linux/bpf.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1f17c6752deb..4f556cfcbfbe 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -414,6 +414,9 @@ enum { /* Enable memory-mapping BPF map */ BPF_F_MMAPABLE = (1U << 10), + +/* Share perf_event among processes */ + BPF_F_PRESERVE_ELEMS = (1U << 11), }; /* Flags for BPF_PROG_QUERY. */ -- cgit v1.2.3 From 82f45c6c4a70622cc0585e3f4372e192a6491d26 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 1 Oct 2020 18:34:48 -0700 Subject: bpf: tcp: Do not limit cb_flags when creating child sk from listen sk The commit 0813a841566f ("bpf: tcp: Allow bpf prog to write and parse TCP header option") unnecessarily introduced bpf_skops_init_child() which limited the child sk from inheriting all bpf_sock_ops_cb_flags of the listen sk. That breaks existing user expectation. This patch removes the bpf_skops_init_child() and just allows sock_copy() to do its job to copy everything from listen sk to the child sk. Fixes: 0813a841566f ("bpf: tcp: Allow bpf prog to write and parse TCP header option") Reported-by: Stanislav Fomichev Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201002013448.2542025-1-kafai@fb.com --- include/net/tcp.h | 33 --------------------------------- 1 file changed, 33 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 3601dea931a6..d4ef5bf94168 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2228,34 +2228,6 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, #endif /* CONFIG_NET_SOCK_MSG */ #ifdef CONFIG_CGROUP_BPF -/* Copy the listen sk's HDR_OPT_CB flags to its child. - * - * During 3-Way-HandShake, the synack is usually sent from - * the listen sk with the HDR_OPT_CB flags set so that - * bpf-prog will be called to write the BPF hdr option. - * - * In fastopen, the child sk is used to send synack instead - * of the listen sk. Thus, inheriting the HDR_OPT_CB flags - * from the listen sk gives the bpf-prog a chance to write - * BPF hdr option in the synack pkt during fastopen. - * - * Both fastopen and non-fastopen child will inherit the - * HDR_OPT_CB flags to keep the bpf-prog having a consistent - * behavior when deciding to clear this cb flags (or not) - * during the PASSIVE_ESTABLISHED_CB. - * - * In the future, other cb flags could be inherited here also. - */ -static inline void bpf_skops_init_child(const struct sock *sk, - struct sock *child) -{ - tcp_sk(child)->bpf_sock_ops_cb_flags = - tcp_sk(sk)->bpf_sock_ops_cb_flags & - (BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG | - BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG | - BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG); -} - static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops, struct sk_buff *skb, unsigned int end_offset) @@ -2264,11 +2236,6 @@ static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops, skops->skb_data_end = skb->data + end_offset; } #else -static inline void bpf_skops_init_child(const struct sock *sk, - struct sock *child) -{ -} - static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops, struct sk_buff *skb, unsigned int end_offset) -- cgit v1.2.3 From 949ca6b82e43b342dba153a9fd643fb1b5e9f034 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 2 Oct 2020 09:46:04 +0200 Subject: netlink: fix policy dump leak [ Upstream commit a95bc734e60449e7b073ff7ff70c35083b290ae9 ] If userspace doesn't complete the policy dump, we leak the allocated state. Fix this. Fixes: d07dcf9aadd6 ("netlink: add infrastructure to expose policies to userspace") Signed-off-by: Johannes Berg Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/netlink.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netlink.h b/include/net/netlink.h index b2cf34f53e55..9e7eca961a98 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -1938,7 +1938,8 @@ void nla_get_range_signed(const struct nla_policy *pt, int netlink_policy_dump_start(const struct nla_policy *policy, unsigned int maxtype, unsigned long *state); -bool netlink_policy_dump_loop(unsigned long *state); +bool netlink_policy_dump_loop(unsigned long state); int netlink_policy_dump_write(struct sk_buff *skb, unsigned long state); +void netlink_policy_dump_free(unsigned long state); #endif -- cgit v1.2.3 From 1dc0408cdf3caf3a8b8ad97c831ae52d2ab5b953 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Thu, 1 Oct 2020 19:42:12 -0700 Subject: net: dsa: Call dsa_untag_bridge_pvid() from dsa_switch_rcv() When a DSA switch driver needs to call dsa_untag_bridge_pvid(), it can set dsa_switch::untag_brige_pvid to indicate this is necessary. This is a pre-requisite to making sure that we are always calling dsa_untag_bridge_pvid() after eth_type_trans() has been called. Signed-off-by: Florian Fainelli Reviewed-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/net/dsa.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index b502a63d196e..8b0696e08cac 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -308,6 +308,14 @@ struct dsa_switch { */ bool configure_vlan_while_not_filtering; + /* If the switch driver always programs the CPU port as egress tagged + * despite the VLAN configuration indicating otherwise, then setting + * @untag_bridge_pvid will force the DSA receive path to pop the bridge's + * default_pvid VLAN tagged frames to offer a consistent behavior + * between a vlan_filtering=0 and vlan_filtering=1 bridge device. + */ + bool untag_bridge_pvid; + /* In case vlan_filtering_is_global is set, the VLAN awareness state * should be retrieved from here and not from the per-port settings. */ -- cgit v1.2.3 From 4976b718c3551faba2c0616ef55ebeb74db1c5ca Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Tue, 29 Sep 2020 16:50:44 -0700 Subject: bpf: Introduce pseudo_btf_id Pseudo_btf_id is a type of ld_imm insn that associates a btf_id to a ksym so that further dereferences on the ksym can use the BTF info to validate accesses. Internally, when seeing a pseudo_btf_id ld insn, the verifier reads the btf_id stored in the insn[0]'s imm field and marks the dst_reg as PTR_TO_BTF_ID. The btf_id points to a VAR_KIND, which is encoded in btf_vminux by pahole. If the VAR is not of a struct type, the dst reg will be marked as PTR_TO_MEM instead of PTR_TO_BTF_ID and the mem_size is resolved to the size of the VAR's type. >From the VAR btf_id, the verifier can also read the address of the ksym's corresponding kernel var from kallsyms and use that to fill dst_reg. Therefore, the proper functionality of pseudo_btf_id depends on (1) kallsyms and (2) the encoding of kernel global VARs in pahole, which should be available since pahole v1.18. Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200929235049.2533242-2-haoluo@google.com --- include/linux/bpf_verifier.h | 7 +++++++ include/linux/btf.h | 15 +++++++++++++++ include/uapi/linux/bpf.h | 36 +++++++++++++++++++++++++++--------- 3 files changed, 49 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 363b4f1c562a..e83ef6f6bf43 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -308,6 +308,13 @@ struct bpf_insn_aux_data { u32 map_index; /* index into used_maps[] */ u32 map_off; /* offset from value base address */ }; + struct { + enum bpf_reg_type reg_type; /* type of pseudo_btf_id */ + union { + u32 btf_id; /* btf_id for struct typed var */ + u32 mem_size; /* mem_size for non-struct typed var */ + }; + } btf_var; }; u64 map_key_state; /* constant (32 bit) key tracking for maps */ int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ diff --git a/include/linux/btf.h b/include/linux/btf.h index 024e16ff7dcc..af1244180588 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -145,6 +145,21 @@ static inline bool btf_type_is_func_proto(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC_PROTO; } +static inline bool btf_type_is_var(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_VAR; +} + +/* union is only a special case of struct: + * all its offsetof(member) == 0 + */ +static inline bool btf_type_is_struct(const struct btf_type *t) +{ + u8 kind = BTF_INFO_KIND(t->info); + + return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION; +} + static inline u16 btf_type_vlen(const struct btf_type *t) { return BTF_INFO_VLEN(t->info); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4f556cfcbfbe..2aa156af24d6 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -356,18 +356,36 @@ enum bpf_link_type { #define BPF_F_SLEEPABLE (1U << 4) /* When BPF ldimm64's insn[0].src_reg != 0 then this can have - * two extensions: - * - * insn[0].src_reg: BPF_PSEUDO_MAP_FD BPF_PSEUDO_MAP_VALUE - * insn[0].imm: map fd map fd - * insn[1].imm: 0 offset into value - * insn[0].off: 0 0 - * insn[1].off: 0 0 - * ldimm64 rewrite: address of map address of map[0]+offset - * verifier type: CONST_PTR_TO_MAP PTR_TO_MAP_VALUE + * the following extensions: + * + * insn[0].src_reg: BPF_PSEUDO_MAP_FD + * insn[0].imm: map fd + * insn[1].imm: 0 + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of map + * verifier type: CONST_PTR_TO_MAP */ #define BPF_PSEUDO_MAP_FD 1 +/* insn[0].src_reg: BPF_PSEUDO_MAP_VALUE + * insn[0].imm: map fd + * insn[1].imm: offset into value + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of map[0]+offset + * verifier type: PTR_TO_MAP_VALUE + */ #define BPF_PSEUDO_MAP_VALUE 2 +/* insn[0].src_reg: BPF_PSEUDO_BTF_ID + * insn[0].imm: kernel btd id of VAR + * insn[1].imm: 0 + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of the kernel variable + * verifier type: PTR_TO_BTF_ID or PTR_TO_MEM, depending on whether the var + * is struct/union. + */ +#define BPF_PSEUDO_BTF_ID 3 /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative * offset to another bpf function -- cgit v1.2.3 From eaa6bcb71ef6ed3dc18fc525ee7e293b06b4882b Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Tue, 29 Sep 2020 16:50:47 -0700 Subject: bpf: Introduce bpf_per_cpu_ptr() Add bpf_per_cpu_ptr() to help bpf programs access percpu vars. bpf_per_cpu_ptr() has the same semantic as per_cpu_ptr() in the kernel except that it may return NULL. This happens when the cpu parameter is out of range. So the caller must check the returned value. Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200929235049.2533242-5-haoluo@google.com --- include/linux/bpf.h | 4 ++++ include/linux/btf.h | 11 +++++++++++ include/uapi/linux/bpf.h | 18 ++++++++++++++++++ 3 files changed, 33 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 50e5c4b52bd1..9dde15b2479d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -293,6 +293,7 @@ enum bpf_arg_type { ARG_PTR_TO_ALLOC_MEM_OR_NULL, /* pointer to dynamically allocated memory or NULL */ ARG_CONST_ALLOC_SIZE_OR_ZERO, /* number of allocated bytes requested */ ARG_PTR_TO_BTF_ID_SOCK_COMMON, /* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */ + ARG_PTR_TO_PERCPU_BTF_ID, /* pointer to in-kernel percpu type */ __BPF_ARG_TYPE_MAX, }; @@ -307,6 +308,7 @@ enum bpf_return_type { RET_PTR_TO_SOCK_COMMON_OR_NULL, /* returns a pointer to a sock_common or NULL */ RET_PTR_TO_ALLOC_MEM_OR_NULL, /* returns a pointer to dynamically allocated memory or NULL */ RET_PTR_TO_BTF_ID_OR_NULL, /* returns a pointer to a btf_id or NULL */ + RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, /* returns a pointer to a valid memory or a btf_id or NULL */ }; /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs @@ -405,6 +407,7 @@ enum bpf_reg_type { PTR_TO_RDONLY_BUF_OR_NULL, /* reg points to a readonly buffer or NULL */ PTR_TO_RDWR_BUF, /* reg points to a read/write buffer */ PTR_TO_RDWR_BUF_OR_NULL, /* reg points to a read/write buffer or NULL */ + PTR_TO_PERCPU_BTF_ID, /* reg points to a percpu kernel variable */ }; /* The information passed from prog-specific *_is_valid_access @@ -1828,6 +1831,7 @@ extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto; extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto; extern const struct bpf_func_proto bpf_copy_from_user_proto; extern const struct bpf_func_proto bpf_snprintf_btf_proto; +extern const struct bpf_func_proto bpf_per_cpu_ptr_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/linux/btf.h b/include/linux/btf.h index af1244180588..2bf641829664 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -110,6 +110,11 @@ btf_resolve_size(const struct btf *btf, const struct btf_type *type, i < btf_type_vlen(struct_type); \ i++, member++) +#define for_each_vsi(i, datasec_type, member) \ + for (i = 0, member = btf_type_var_secinfo(datasec_type); \ + i < btf_type_vlen(datasec_type); \ + i++, member++) + static inline bool btf_type_is_ptr(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_PTR; @@ -194,6 +199,12 @@ static inline const struct btf_member *btf_type_member(const struct btf_type *t) return (const struct btf_member *)(t + 1); } +static inline const struct btf_var_secinfo *btf_type_var_secinfo( + const struct btf_type *t) +{ + return (const struct btf_var_secinfo *)(t + 1); +} + #ifdef CONFIG_BPF_SYSCALL const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); const char *btf_name_by_offset(const struct btf *btf, u32 offset); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2aa156af24d6..f3c1b637ab39 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3686,6 +3686,23 @@ union bpf_attr { * Return * The helper returns **TC_ACT_REDIRECT** on success or * **TC_ACT_SHOT** on error. + * + * void *bpf_per_cpu_ptr(const void *percpu_ptr, u32 cpu) + * Description + * Take a pointer to a percpu ksym, *percpu_ptr*, and return a + * pointer to the percpu kernel variable on *cpu*. A ksym is an + * extern variable decorated with '__ksym'. For ksym, there is a + * global var (either static or global) defined of the same name + * in the kernel. The ksym is percpu if the global var is percpu. + * The returned pointer points to the global percpu var on *cpu*. + * + * bpf_per_cpu_ptr() has the same semantic as per_cpu_ptr() in the + * kernel, except that bpf_per_cpu_ptr() may return NULL. This + * happens if *cpu* is larger than nr_cpu_ids. The caller of + * bpf_per_cpu_ptr() must check the returned value. + * Return + * A pointer pointing to the kernel percpu variable on *cpu*, or + * NULL, if *cpu* is invalid. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3841,6 +3858,7 @@ union bpf_attr { FN(seq_printf_btf), \ FN(skb_cgroup_classid), \ FN(redirect_neigh), \ + FN(bpf_per_cpu_ptr), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 63d9b80dcf2c67bc5ade61cbbaa09d7af21f43f1 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Tue, 29 Sep 2020 16:50:48 -0700 Subject: bpf: Introducte bpf_this_cpu_ptr() Add bpf_this_cpu_ptr() to help access percpu var on this cpu. This helper always returns a valid pointer, therefore no need to check returned value for NULL. Also note that all programs run with preemption disabled, which means that the returned pointer is stable during all the execution of the program. Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200929235049.2533242-6-haoluo@google.com --- include/linux/bpf.h | 2 ++ include/uapi/linux/bpf.h | 13 +++++++++++++ 2 files changed, 15 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9dde15b2479d..dc63eeed4fd9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -309,6 +309,7 @@ enum bpf_return_type { RET_PTR_TO_ALLOC_MEM_OR_NULL, /* returns a pointer to dynamically allocated memory or NULL */ RET_PTR_TO_BTF_ID_OR_NULL, /* returns a pointer to a btf_id or NULL */ RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, /* returns a pointer to a valid memory or a btf_id or NULL */ + RET_PTR_TO_MEM_OR_BTF_ID, /* returns a pointer to a valid memory or a btf_id */ }; /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs @@ -1832,6 +1833,7 @@ extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto; extern const struct bpf_func_proto bpf_copy_from_user_proto; extern const struct bpf_func_proto bpf_snprintf_btf_proto; extern const struct bpf_func_proto bpf_per_cpu_ptr_proto; +extern const struct bpf_func_proto bpf_this_cpu_ptr_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f3c1b637ab39..c446394135be 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3703,6 +3703,18 @@ union bpf_attr { * Return * A pointer pointing to the kernel percpu variable on *cpu*, or * NULL, if *cpu* is invalid. + * + * void *bpf_this_cpu_ptr(const void *percpu_ptr) + * Description + * Take a pointer to a percpu ksym, *percpu_ptr*, and return a + * pointer to the percpu kernel variable on this cpu. See the + * description of 'ksym' in **bpf_per_cpu_ptr**\ (). + * + * bpf_this_cpu_ptr() has the same semantic as this_cpu_ptr() in + * the kernel. Different from **bpf_per_cpu_ptr**\ (), it would + * never return NULL. + * Return + * A pointer pointing to the kernel percpu variable on this cpu. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3859,6 +3871,7 @@ union bpf_attr { FN(skb_cgroup_classid), \ FN(redirect_neigh), \ FN(bpf_per_cpu_ptr), \ + FN(bpf_this_cpu_ptr), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 319e4dd11a207bac2eaa9b96060145cd0d4c12d2 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 2 Oct 2020 15:02:21 +0300 Subject: net: mscc: ocelot: introduce conversion helpers between port and netdev Since the mscc_ocelot_switch_lib is common between a pure switchdev and a DSA driver, the procedure of retrieving a net_device for a certain port index differs, as those are registered by their individual front-ends. Up to now that has been dealt with by always passing the port index to the switch library, but now, we're going to need to work with net_device pointers from the tc-flower offload, for things like indev, or mirred. It is not desirable to refactor that, so let's make sure that the flower offload core has the ability to translate between a net_device and a port index properly. Signed-off-by: Vladimir Oltean Acked-by: Alexandre Belloni Signed-off-by: David S. Miller --- include/soc/mscc/ocelot.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 0c40122dcb88..424256fa531b 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -559,6 +559,8 @@ enum ocelot_tag_prefix { struct ocelot; struct ocelot_ops { + struct net_device *(*port_to_netdev)(struct ocelot *ocelot, int port); + int (*netdev_to_port)(struct net_device *dev); int (*reset)(struct ocelot *ocelot); u16 (*wm_enc)(u16 value); }; -- cgit v1.2.3 From 1397a2eb52e20e20363cc0a1cb707d5eb473dbb7 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 2 Oct 2020 15:02:22 +0300 Subject: net: mscc: ocelot: create TCAM skeleton from tc filter chains For Ocelot switches, there are 2 ingress pipelines for flow offload rules: VCAP IS1 (Ingress Classification) and IS2 (Security Enforcement). IS1 and IS2 support different sets of actions. The pipeline order for a packet on ingress is: Basic classification -> VCAP IS1 -> VCAP IS2 Furthermore, IS1 is looked up 3 times, and IS2 is looked up twice (each TCAM entry can be configured to match only on the first lookup, or only on the second, or on both etc). Because the TCAMs are completely independent in hardware, and because of the fixed pipeline, we actually have very limited options when it comes to offloading complex rules to them while still maintaining the same semantics with the software data path. This patch maps flow offload rules to ingress TCAMs according to a predefined chain index number. There is going to be a script in selftests that clarifies the usage model. There is also an egress TCAM (VCAP ES0, the Egress Rewriter), which is modeled on top of the default chain 0 of the egress qdisc, because it doesn't have multiple lookups. Suggested-by: Allan W. Nielsen Co-developed-by: Xiaoliang Yang Signed-off-by: Xiaoliang Yang Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/soc/mscc/ocelot.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 424256fa531b..46608494616f 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -633,7 +633,8 @@ struct ocelot { struct list_head multicast; - struct ocelot_vcap_block block; + struct list_head dummy_rules; + struct ocelot_vcap_block block[3]; struct vcap_props *vcap; /* Workqueue to check statistics for overflow with its lock */ -- cgit v1.2.3 From 10c24eb23da0dc67934fcc9b5b0f201750ff8cd8 Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Thu, 1 Oct 2020 18:11:45 +0300 Subject: devlink: add parser error drop packet traps Add parser error drop packet traps, so that capable device driver could register them with devlink. The new packet trap group holds any drops of packets which were marked by the device as erroneous during header parsing. Add documentation for every added packet trap and packet trap group. Signed-off-by: Ioana Ciornei Signed-off-by: David S. Miller --- include/net/devlink.h | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 1c286e9a3590..1f5004a5c9f9 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -784,6 +784,22 @@ enum devlink_trap_generic_id { DEVLINK_TRAP_GENERIC_ID_FLOW_ACTION_SAMPLE, DEVLINK_TRAP_GENERIC_ID_FLOW_ACTION_TRAP, DEVLINK_TRAP_GENERIC_ID_EARLY_DROP, + DEVLINK_TRAP_GENERIC_ID_VXLAN_PARSING, + DEVLINK_TRAP_GENERIC_ID_LLC_SNAP_PARSING, + DEVLINK_TRAP_GENERIC_ID_VLAN_PARSING, + DEVLINK_TRAP_GENERIC_ID_PPPOE_PPP_PARSING, + DEVLINK_TRAP_GENERIC_ID_MPLS_PARSING, + DEVLINK_TRAP_GENERIC_ID_ARP_PARSING, + DEVLINK_TRAP_GENERIC_ID_IP_1_PARSING, + DEVLINK_TRAP_GENERIC_ID_IP_N_PARSING, + DEVLINK_TRAP_GENERIC_ID_GRE_PARSING, + DEVLINK_TRAP_GENERIC_ID_UDP_PARSING, + DEVLINK_TRAP_GENERIC_ID_TCP_PARSING, + DEVLINK_TRAP_GENERIC_ID_IPSEC_PARSING, + DEVLINK_TRAP_GENERIC_ID_SCTP_PARSING, + DEVLINK_TRAP_GENERIC_ID_DCCP_PARSING, + DEVLINK_TRAP_GENERIC_ID_GTP_PARSING, + DEVLINK_TRAP_GENERIC_ID_ESP_PARSING, /* Add new generic trap IDs above */ __DEVLINK_TRAP_GENERIC_ID_MAX, @@ -819,6 +835,7 @@ enum devlink_trap_group_generic_id { DEVLINK_TRAP_GROUP_GENERIC_ID_PTP_GENERAL, DEVLINK_TRAP_GROUP_GENERIC_ID_ACL_SAMPLE, DEVLINK_TRAP_GROUP_GENERIC_ID_ACL_TRAP, + DEVLINK_TRAP_GROUP_GENERIC_ID_PARSER_ERROR_DROPS, /* Add new generic trap group IDs above */ __DEVLINK_TRAP_GROUP_GENERIC_ID_MAX, @@ -974,6 +991,39 @@ enum devlink_trap_group_generic_id { "flow_action_trap" #define DEVLINK_TRAP_GENERIC_NAME_EARLY_DROP \ "early_drop" +#define DEVLINK_TRAP_GENERIC_NAME_VXLAN_PARSING \ + "vxlan_parsing" +#define DEVLINK_TRAP_GENERIC_NAME_LLC_SNAP_PARSING \ + "llc_snap_parsing" +#define DEVLINK_TRAP_GENERIC_NAME_VLAN_PARSING \ + "vlan_parsing" +#define DEVLINK_TRAP_GENERIC_NAME_PPPOE_PPP_PARSING \ + "pppoe_ppp_parsing" +#define DEVLINK_TRAP_GENERIC_NAME_MPLS_PARSING \ + "mpls_parsing" +#define DEVLINK_TRAP_GENERIC_NAME_ARP_PARSING \ + "arp_parsing" +#define DEVLINK_TRAP_GENERIC_NAME_IP_1_PARSING \ + "ip_1_parsing" +#define DEVLINK_TRAP_GENERIC_NAME_IP_N_PARSING \ + "ip_n_parsing" +#define DEVLINK_TRAP_GENERIC_NAME_GRE_PARSING \ + "gre_parsing" +#define DEVLINK_TRAP_GENERIC_NAME_UDP_PARSING \ + "udp_parsing" +#define DEVLINK_TRAP_GENERIC_NAME_TCP_PARSING \ + "tcp_parsing" +#define DEVLINK_TRAP_GENERIC_NAME_IPSEC_PARSING \ + "ipsec_parsing" +#define DEVLINK_TRAP_GENERIC_NAME_SCTP_PARSING \ + "sctp_parsing" +#define DEVLINK_TRAP_GENERIC_NAME_DCCP_PARSING \ + "dccp_parsing" +#define DEVLINK_TRAP_GENERIC_NAME_GTP_PARSING \ + "gtp_parsing" +#define DEVLINK_TRAP_GENERIC_NAME_ESP_PARSING \ + "esp_parsing" + #define DEVLINK_TRAP_GROUP_GENERIC_NAME_L2_DROPS \ "l2_drops" @@ -1025,6 +1075,8 @@ enum devlink_trap_group_generic_id { "acl_sample" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_ACL_TRAP \ "acl_trap" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_PARSER_ERROR_DROPS \ + "parser_error_drops" #define DEVLINK_TRAP_GENERIC(_type, _init_action, _id, _group_id, \ _metadata_cap) \ -- cgit v1.2.3 From c50bf2be7306cd37e4a8228acfe0fee36b9097dc Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Thu, 1 Oct 2020 18:11:46 +0300 Subject: devlink: add .trap_group_action_set() callback Add a new devlink callback, .trap_group_action_set(), which can be used by device drivers which do not support controlling the action (drop, trap) on each trap but rather on the entire group trap. If this new callback is populated, it will take precedence over the .trap_action_set() callback when the user requests a change of all the traps in a group. Signed-off-by: Ioana Ciornei Signed-off-by: David S. Miller --- include/net/devlink.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 1f5004a5c9f9..89ede1ce3a3a 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1242,6 +1242,16 @@ struct devlink_ops { const struct devlink_trap_group *group, const struct devlink_trap_policer *policer, struct netlink_ext_ack *extack); + /** + * @trap_group_action_set: Trap group action set function. + * + * If this callback is populated, it will take precedence over looping + * over all traps in a group and calling .trap_action_set(). + */ + int (*trap_group_action_set)(struct devlink *devlink, + const struct devlink_trap_group *group, + enum devlink_trap_action action, + struct netlink_ext_ack *extack); /** * @trap_policer_init: Trap policer initialization function. * -- cgit v1.2.3 From e5086736969880478abb2ac85ef8757ac6ce45bf Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 2 Oct 2020 14:49:52 -0700 Subject: genetlink: reorg struct genl_family There are holes and oversized members in struct genl_family. Before: /* size: 104, cachelines: 2, members: 16 */ After: /* size: 88, cachelines: 2, members: 16 */ The command field in struct genlmsghdr is a u8, so no point in the operation count being 32 bit. Also operation 0 is usually undefined, so we only need 255 entries. netnsok and parallel_ops are only ever initialized to true. We can grow the fields as needed, compiler should warn us if someone tries to assign larger constants. Signed-off-by: Jakub Kicinski Reviewed-by: Johannes Berg Signed-off-by: David S. Miller --- include/net/genetlink.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index b9eb92f3fe86..5cd9ab0c6bd9 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -48,8 +48,11 @@ struct genl_family { char name[GENL_NAMSIZ]; unsigned int version; unsigned int maxattr; - bool netnsok; - bool parallel_ops; + unsigned int mcgrp_offset; /* private */ + u8 netnsok:1; + u8 parallel_ops:1; + u8 n_ops; + u8 n_mcgrps; const struct nla_policy *policy; int (*pre_doit)(const struct genl_ops *ops, struct sk_buff *skb, @@ -59,9 +62,6 @@ struct genl_family { struct genl_info *info); const struct genl_ops * ops; const struct genl_multicast_group *mcgrps; - unsigned int n_ops; - unsigned int n_mcgrps; - unsigned int mcgrp_offset; /* private */ struct module *module; }; -- cgit v1.2.3 From 0b588afdd16f9e0b63128dc4bcd002e7f2725fe0 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 2 Oct 2020 14:49:53 -0700 Subject: genetlink: add small version of ops We want to add maxattr and policy back to genl_ops, to enable dumping per command policy to user space. This, however, would cause bloat for all the families with global policies. Introduce smaller version of ops (half the size of genl_ops). Translate these smaller ops into a full blown struct before use in the core. v1: - use struct assignment - put a full copy of the op in struct genl_dumpit_info - s/light/small/ Signed-off-by: Jakub Kicinski Reviewed-by: Johannes Berg Signed-off-by: David S. Miller --- include/net/genetlink.h | 53 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 39 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 5cd9ab0c6bd9..8ea1fc1ed1c7 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -41,6 +41,8 @@ struct genl_info; * (private) * @ops: the operations supported by this family * @n_ops: number of operations supported by this family + * @small_ops: the small-struct operations supported by this family + * @n_small_ops: number of small-struct operations supported by this family */ struct genl_family { int id; /* private */ @@ -52,6 +54,7 @@ struct genl_family { u8 netnsok:1; u8 parallel_ops:1; u8 n_ops; + u8 n_small_ops; u8 n_mcgrps; const struct nla_policy *policy; int (*pre_doit)(const struct genl_ops *ops, @@ -61,6 +64,7 @@ struct genl_family { struct sk_buff *skb, struct genl_info *info); const struct genl_ops * ops; + const struct genl_small_ops *small_ops; const struct genl_multicast_group *mcgrps; struct module *module; }; @@ -108,23 +112,26 @@ enum genl_validate_flags { }; /** - * struct genl_info - info that is available during dumpit op call - * @family: generic netlink family - for internal genl code usage - * @ops: generic netlink ops - for internal genl code usage - * @attrs: netlink attributes + * struct genl_small_ops - generic netlink operations (small version) + * @cmd: command identifier + * @internal_flags: flags used by the family + * @flags: flags + * @validate: validation flags from enum genl_validate_flags + * @doit: standard command callback + * @dumpit: callback for dumpers + * + * This is a cut-down version of struct genl_ops for users who don't need + * most of the ancillary infra and want to save space. */ -struct genl_dumpit_info { - const struct genl_family *family; - const struct genl_ops *ops; - struct nlattr **attrs; +struct genl_small_ops { + int (*doit)(struct sk_buff *skb, struct genl_info *info); + int (*dumpit)(struct sk_buff *skb, struct netlink_callback *cb); + u8 cmd; + u8 internal_flags; + u8 flags; + u8 validate; }; -static inline const struct genl_dumpit_info * -genl_dumpit_info(struct netlink_callback *cb) -{ - return cb->data; -} - /** * struct genl_ops - generic netlink operations * @cmd: command identifier @@ -148,6 +155,24 @@ struct genl_ops { u8 validate; }; +/** + * struct genl_info - info that is available during dumpit op call + * @family: generic netlink family - for internal genl code usage + * @ops: generic netlink ops - for internal genl code usage + * @attrs: netlink attributes + */ +struct genl_dumpit_info { + const struct genl_family *family; + struct genl_ops op; + struct nlattr **attrs; +}; + +static inline const struct genl_dumpit_info * +genl_dumpit_info(struct netlink_callback *cb) +{ + return cb->data; +} + int genl_register_family(struct genl_family *family); int genl_unregister_family(const struct genl_family *family); void genl_notify(const struct genl_family *family, struct sk_buff *skb, -- cgit v1.2.3 From adc848450ff84e961cf7966b8a475889a92a9fd3 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 2 Oct 2020 14:49:55 -0700 Subject: genetlink: add a structure for dump state Whenever netlink dump uses more than 2 cb->args[] entries code gets hard to read. We're about to add more state to ctrl_dumppolicy() so create a structure. Since the structure is typed and clearly named we can remove the local fam_id variable and use ctx->fam_id directly. v3: - rebase onto explicit free fix v1: - s/nl_policy_dump/netlink_policy_dump_state/ - forward declare struct netlink_policy_dump_state, and move from passing unsigned long to actual pointer type - add build bug on - u16 fam_id - s/args/ctx/ Signed-off-by: Jakub Kicinski Reviewed-by: Johannes Berg Signed-off-by: David S. Miller --- include/net/netlink.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/netlink.h b/include/net/netlink.h index 9e7eca961a98..00258590f2cb 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -1935,11 +1935,14 @@ void nla_get_range_unsigned(const struct nla_policy *pt, void nla_get_range_signed(const struct nla_policy *pt, struct netlink_range_validation_signed *range); +struct netlink_policy_dump_state; + int netlink_policy_dump_start(const struct nla_policy *policy, unsigned int maxtype, - unsigned long *state); -bool netlink_policy_dump_loop(unsigned long state); -int netlink_policy_dump_write(struct sk_buff *skb, unsigned long state); -void netlink_policy_dump_free(unsigned long state); + struct netlink_policy_dump_state **state); +bool netlink_policy_dump_loop(struct netlink_policy_dump_state *state); +int netlink_policy_dump_write(struct sk_buff *skb, + struct netlink_policy_dump_state *state); +void netlink_policy_dump_free(struct netlink_policy_dump_state *state); #endif -- cgit v1.2.3 From 48526a0f4ca2b484cab4318dc0b2c2be1d8685b7 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 2 Oct 2020 14:49:57 -0700 Subject: genetlink: bring back per op policy Add policy to the struct genl_ops structure, this time with maxattr, so it can be used properly. Propagate .policy and .maxattr from the family in genl_get_cmd() if needed, this way the rest of the code does not have to worry if the policy is per op or global. Signed-off-by: Jakub Kicinski Reviewed-by: Johannes Berg Signed-off-by: David S. Miller --- include/net/genetlink.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 8ea1fc1ed1c7..cb35625d001e 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -137,6 +137,8 @@ struct genl_small_ops { * @cmd: command identifier * @internal_flags: flags used by the family * @flags: flags + * @maxattr: maximum number of attributes supported + * @policy: netlink policy (takes precedence over family policy) * @doit: standard command callback * @start: start callback for dumps * @dumpit: callback for dumpers @@ -149,6 +151,8 @@ struct genl_ops { int (*dumpit)(struct sk_buff *skb, struct netlink_callback *cb); int (*done)(struct netlink_callback *cb); + const struct nla_policy *policy; + unsigned int maxattr; u8 cmd; u8 internal_flags; u8 flags; -- cgit v1.2.3 From 04a351a62bd4be1dbcc88fae69b990362d88ffe5 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sat, 3 Oct 2020 10:44:43 +0200 Subject: netlink: rework policy dump to support multiple policies Rework the policy dump code a bit to support adding multiple policies to a single dump, in order to e.g. support per-op policies in generic netlink. v2: - move kernel-doc to implementation [Jakub] - squash the first patch to not flip-flop on the prototype [Jakub] - merge netlink_policy_dump_get_policy_idx() with the old get_policy_idx() we already had - rebase without Jakub's patch to have per-op dump Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- include/net/netlink.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/netlink.h b/include/net/netlink.h index 00258590f2cb..5a5ff97cc596 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -1937,9 +1937,12 @@ void nla_get_range_signed(const struct nla_policy *pt, struct netlink_policy_dump_state; -int netlink_policy_dump_start(const struct nla_policy *policy, - unsigned int maxtype, - struct netlink_policy_dump_state **state); +int netlink_policy_dump_add_policy(struct netlink_policy_dump_state **pstate, + const struct nla_policy *policy, + unsigned int maxtype); +int netlink_policy_dump_get_policy_idx(struct netlink_policy_dump_state *state, + const struct nla_policy *policy, + unsigned int maxtype); bool netlink_policy_dump_loop(struct netlink_policy_dump_state *state); int netlink_policy_dump_write(struct sk_buff *skb, struct netlink_policy_dump_state *state); -- cgit v1.2.3 From 50a896cf2d6f34e884a00139d6e6012c9833ace3 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sat, 3 Oct 2020 10:44:45 +0200 Subject: genetlink: properly support per-op policy dumping Add support for per-op policy dumping. The data is pretty much as before, except that now the assumption that the policy with index 0 is "the" policy no longer holds - you now need to look at the new CTRL_ATTR_OP_POLICY attribute which is a nested attr (indexed by op) containing attributes for do and dump policies. When a single op is requested, the CTRL_ATTR_OP_POLICY will be added in the same way, since do and dump policies may differ. v2: - conditionally advertise per-command policies only if there actually is a policy being used for the do/dump and it's present at all Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- include/uapi/linux/genetlink.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/genetlink.h b/include/uapi/linux/genetlink.h index 9c0636ec2286..bc9c98e84828 100644 --- a/include/uapi/linux/genetlink.h +++ b/include/uapi/linux/genetlink.h @@ -64,6 +64,7 @@ enum { CTRL_ATTR_OPS, CTRL_ATTR_MCAST_GROUPS, CTRL_ATTR_POLICY, + CTRL_ATTR_OP_POLICY, __CTRL_ATTR_MAX, }; @@ -85,6 +86,15 @@ enum { __CTRL_ATTR_MCAST_GRP_MAX, }; +enum { + CTRL_ATTR_POLICY_UNSPEC, + CTRL_ATTR_POLICY_DO, + CTRL_ATTR_POLICY_DUMP, + + __CTRL_ATTR_POLICY_DUMP_MAX, + CTRL_ATTR_POLICY_DUMP_MAX = __CTRL_ATTR_POLICY_DUMP_MAX - 1 +}; + #define CTRL_ATTR_MCAST_GRP_MAX (__CTRL_ATTR_MCAST_GRP_MAX - 1) -- cgit v1.2.3 From e992a6eda9a1eeeab73a8d2792464e4a2b1ebc3b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 3 Oct 2020 10:44:46 +0200 Subject: genetlink: allow dumping command-specific policy Right now CTRL_CMD_GETPOLICY can only dump the family-wide policy. Support dumping policy of a specific op. v3: - rebase after per-op policy export and handle that v2: - make cmd U32, just in case. v1: - don't echo op in the output in a naive way, this should make it cleaner to extend the output format for dumping policies for all the commands at once in the future. Signed-off-by: Jakub Kicinski Link: https://lore.kernel.org/r/20201001225933.1373426-11-kuba@kernel.org Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- include/uapi/linux/genetlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/genetlink.h b/include/uapi/linux/genetlink.h index bc9c98e84828..d83f214b4134 100644 --- a/include/uapi/linux/genetlink.h +++ b/include/uapi/linux/genetlink.h @@ -65,6 +65,7 @@ enum { CTRL_ATTR_MCAST_GROUPS, CTRL_ATTR_POLICY, CTRL_ATTR_OP_POLICY, + CTRL_ATTR_OP, __CTRL_ATTR_MAX, }; -- cgit v1.2.3 From 8e1b3884eed7d96d4f1210b668611ee6a1803ea5 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Thu, 1 Oct 2020 17:12:50 +0000 Subject: net: remove NETDEV_HW_ADDR_T_SLAVE NETDEV_HW_ADDR_T_SLAVE is not used anymore, remove it. Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 28cfa53daf72..0c79d9e56a5e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -212,9 +212,8 @@ struct netdev_hw_addr { unsigned char type; #define NETDEV_HW_ADDR_T_LAN 1 #define NETDEV_HW_ADDR_T_SAN 2 -#define NETDEV_HW_ADDR_T_SLAVE 3 -#define NETDEV_HW_ADDR_T_UNICAST 4 -#define NETDEV_HW_ADDR_T_MULTICAST 5 +#define NETDEV_HW_ADDR_T_UNICAST 3 +#define NETDEV_HW_ADDR_T_MULTICAST 4 bool global_use; int sync_cnt; int refcount; -- cgit v1.2.3 From 19fbcb36a39eefbe8912a13ccc02e937b1c418d6 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Sat, 3 Oct 2020 00:44:28 +0200 Subject: net/sched: act_vlan: Add {POP,PUSH}_ETH actions Implement TCA_VLAN_ACT_POP_ETH and TCA_VLAN_ACT_PUSH_ETH, to respectively pop and push a base Ethernet header at the beginning of a frame. POP_ETH is just a matter of pulling ETH_HLEN bytes. VLAN tags, if any, must be stripped before calling POP_ETH. PUSH_ETH is restricted to skbs with no mac_header, and only the MAC addresses can be configured. The Ethertype is automatically set from skb->protocol. These restrictions ensure that all skb's fields remain consistent, so that this action can't confuse other part of the networking stack (like GSO). Since openvswitch already had these actions, consolidate the code in skbuff.c (like for vlan and mpls push/pop). Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- include/linux/skbuff.h | 3 +++ include/net/tc_act/tc_vlan.h | 2 ++ include/uapi/linux/tc_act/tc_vlan.h | 4 ++++ 3 files changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 3d0cf3722bb4..42131e325e27 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3573,6 +3573,9 @@ int skb_ensure_writable(struct sk_buff *skb, int write_len); int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci); int skb_vlan_pop(struct sk_buff *skb); int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci); +int skb_eth_pop(struct sk_buff *skb); +int skb_eth_push(struct sk_buff *skb, const unsigned char *dst, + const unsigned char *src); int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto, int mac_len, bool ethernet); int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len, diff --git a/include/net/tc_act/tc_vlan.h b/include/net/tc_act/tc_vlan.h index 4e2502408c31..f051046ba034 100644 --- a/include/net/tc_act/tc_vlan.h +++ b/include/net/tc_act/tc_vlan.h @@ -11,6 +11,8 @@ struct tcf_vlan_params { int tcfv_action; + unsigned char tcfv_push_dst[ETH_ALEN]; + unsigned char tcfv_push_src[ETH_ALEN]; u16 tcfv_push_vid; __be16 tcfv_push_proto; u8 tcfv_push_prio; diff --git a/include/uapi/linux/tc_act/tc_vlan.h b/include/uapi/linux/tc_act/tc_vlan.h index 168995b54a70..5b306fe815cc 100644 --- a/include/uapi/linux/tc_act/tc_vlan.h +++ b/include/uapi/linux/tc_act/tc_vlan.h @@ -16,6 +16,8 @@ #define TCA_VLAN_ACT_POP 1 #define TCA_VLAN_ACT_PUSH 2 #define TCA_VLAN_ACT_MODIFY 3 +#define TCA_VLAN_ACT_POP_ETH 4 +#define TCA_VLAN_ACT_PUSH_ETH 5 struct tc_vlan { tc_gen; @@ -30,6 +32,8 @@ enum { TCA_VLAN_PUSH_VLAN_PROTOCOL, TCA_VLAN_PAD, TCA_VLAN_PUSH_VLAN_PRIORITY, + TCA_VLAN_PUSH_ETH_DST, + TCA_VLAN_PUSH_ETH_SRC, __TCA_VLAN_MAX, }; #define TCA_VLAN_MAX (__TCA_VLAN_MAX - 1) -- cgit v1.2.3 From a45294af9e96a3e060b6272fa7cd2c4b196de335 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Sat, 3 Oct 2020 00:44:31 +0200 Subject: net/sched: act_mpls: Add action to push MPLS LSE before Ethernet header Define the MAC_PUSH action which pushes an MPLS LSE before the mac header (instead of between the mac and the network headers as the plain PUSH action does). The only special case is when the skb has an offloaded VLAN. In that case, it has to be inlined before pushing the MPLS header. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- include/uapi/linux/tc_act/tc_mpls.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/tc_act/tc_mpls.h b/include/uapi/linux/tc_act/tc_mpls.h index 9360e95273c7..9e4e8f52a779 100644 --- a/include/uapi/linux/tc_act/tc_mpls.h +++ b/include/uapi/linux/tc_act/tc_mpls.h @@ -10,6 +10,7 @@ #define TCA_MPLS_ACT_PUSH 2 #define TCA_MPLS_ACT_MODIFY 3 #define TCA_MPLS_ACT_DEC_TTL 4 +#define TCA_MPLS_ACT_MAC_PUSH 5 struct tc_mpls { tc_gen; /* generic TC action fields. */ -- cgit v1.2.3 From 5f48846daf3321f8a1f8aa99cd6173e3980b7a29 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 2 Oct 2020 15:50:56 +0200 Subject: netfilter: nf_tables: Enable fast nft_cmp for inverted matches Add a boolean indicating NFT_CMP_NEQ. To include it into the match decision, it is sufficient to XOR it with the data comparison's result. While being at it, store the mask that is calculated during expression init and free the eval routine from having to recalculate it each time. Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_core.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index 78516de14d31..df2d91c814cb 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -25,8 +25,10 @@ void nf_tables_core_module_exit(void); struct nft_cmp_fast_expr { u32 data; + u32 mask; enum nft_registers sreg:8; u8 len; + bool inv; }; struct nft_immediate_expr { -- cgit v1.2.3 From 10fdd6d80e4c21ad48f3860d723f5b3b5965477b Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Thu, 1 Oct 2020 18:57:44 +0200 Subject: netfilter: nf_tables: Implement fast bitwise expression A typical use of bitwise expression is to mask out parts of an IP address when matching on the network part only. Optimize for this common use with a fast variant for NFT_BITWISE_BOOL-type expressions operating on 32bit-sized values. Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_core.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index df2d91c814cb..8657e6815b07 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -23,6 +23,13 @@ extern struct nft_object_type nft_secmark_obj_type; int nf_tables_core_module_init(void); void nf_tables_core_module_exit(void); +struct nft_bitwise_fast_expr { + u32 mask; + u32 xor; + enum nft_registers sreg:8; + enum nft_registers dreg:8; +}; + struct nft_cmp_fast_expr { u32 data; u32 mask; @@ -68,6 +75,8 @@ struct nft_payload_set { extern const struct nft_expr_ops nft_payload_fast_ops; +extern const struct nft_expr_ops nft_bitwise_fast_ops; + extern struct static_key_false nft_counters_enabled; extern struct static_key_false nft_trace_enabled; -- cgit v1.2.3 From cf1166349c68816f4259d32559f54972b0d5c1a4 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sun, 4 Oct 2020 18:12:51 +0200 Subject: net: devlink: Add unused port flavour Not all ports of a switch need to be used, particularly in embedded systems. Add a port flavour for ports which physically exist in the switch, but are not connected to the front panel etc, and so are unused. By having unused ports present in devlink, it gives a more accurate representation of the hardware. It also allows regions to be associated to such ports, so allowing, for example, to determine unused ports are correctly powered off, or to compare probable reset defaults of unused ports to used ports experiences issues. Actually registering unused ports and setting the flavour to unused is optional. The DSA core will register all such switch ports, but such ports are expected to be limited in number. Bigger ASICs may decide not to list unused ports. v2: Expand the description about why it is useful Reviewed-by: Vladimir Oltean Tested-by: Vladimir Oltean Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/uapi/linux/devlink.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index ba467dc07852..5f1d6c327670 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -197,6 +197,9 @@ enum devlink_port_flavour { * port that faces the PCI VF. */ DEVLINK_PORT_FLAVOUR_VIRTUAL, /* Any virtual port facing the user. */ + DEVLINK_PORT_FLAVOUR_UNUSED, /* Port which exists in the switch, but + * is not used in any way. + */ }; enum devlink_param_cmode { -- cgit v1.2.3 From 3122433eb533aac7d08302ee4b3bd3adfcd280d3 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sun, 4 Oct 2020 18:12:53 +0200 Subject: net: dsa: Register devlink ports before calling DSA driver setup() DSA drivers want to create regions on devlink ports as well as the devlink device instance, in order to export registers and other tables per port. To keep all this code together in the drivers, have the devlink ports registered early, so the setup() method can setup both device and port devlink regions. v3: Remove dp->setup Move common code out of switch statement. Fix wrong goto Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Reviewed-by: Vladimir Oltean Tested-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/net/dsa.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 8b0696e08cac..049140b2f593 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -215,6 +215,7 @@ struct dsa_port { u8 stp_state; struct net_device *bridge_dev; struct devlink_port devlink_port; + bool devlink_port_setup; struct phylink *pl; struct phylink_config pl_config; -- cgit v1.2.3 From 544e7c33ec2f8077685c254f5e3b03a85c0e62eb Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sun, 4 Oct 2020 18:12:54 +0200 Subject: net: devlink: Add support for port regions Allow regions to be registered to a devlink port. The same netlink API is used, but the port index is provided to indicate when a region is a port region as opposed to a device region. Reviewed-by: Vladimir Oltean Tested-by: Vladimir Oltean Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- include/net/devlink.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 89ede1ce3a3a..237ba5e29a3b 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -110,6 +110,7 @@ struct devlink_port_attrs { struct devlink_port { struct list_head list; struct list_head param_list; + struct list_head region_list; struct devlink *devlink; unsigned int index; bool registered; @@ -591,6 +592,26 @@ struct devlink_region_ops { void *priv; }; +/** + * struct devlink_port_region_ops - Region operations for a port + * @name: region name + * @destructor: callback used to free snapshot memory when deleting + * @snapshot: callback to request an immediate snapshot. On success, + * the data variable must be updated to point to the snapshot data. + * The function will be called while the devlink instance lock is + * held. + * @priv: Pointer to driver private data for the region operation + */ +struct devlink_port_region_ops { + const char *name; + void (*destructor)(const void *data); + int (*snapshot)(struct devlink_port *port, + const struct devlink_port_region_ops *ops, + struct netlink_ext_ack *extack, + u8 **data); + void *priv; +}; + struct devlink_fmsg; struct devlink_health_reporter; @@ -1445,7 +1466,13 @@ struct devlink_region * devlink_region_create(struct devlink *devlink, const struct devlink_region_ops *ops, u32 region_max_snapshots, u64 region_size); +struct devlink_region * +devlink_port_region_create(struct devlink_port *port, + const struct devlink_port_region_ops *ops, + u32 region_max_snapshots, u64 region_size); void devlink_region_destroy(struct devlink_region *region); +void devlink_port_region_destroy(struct devlink_region *region); + int devlink_region_snapshot_id_get(struct devlink *devlink, u32 *id); void devlink_region_snapshot_id_put(struct devlink *devlink, u32 id); int devlink_region_snapshot_create(struct devlink_region *region, -- cgit v1.2.3 From 08156ba430b412bd9c23fe6155a58c7cb166045c Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sun, 4 Oct 2020 18:12:55 +0200 Subject: net: dsa: Add devlink port regions support to DSA Allow DSA drivers to make use of devlink port regions, via simple wrappers. Reviewed-by: Vladimir Oltean Tested-by: Vladimir Oltean Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 049140b2f593..ca426cf9927b 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -681,6 +681,11 @@ struct devlink_region * dsa_devlink_region_create(struct dsa_switch *ds, const struct devlink_region_ops *ops, u32 region_max_snapshots, u64 region_size); +struct devlink_region * +dsa_devlink_port_region_create(struct dsa_switch *ds, + int port, + const struct devlink_port_region_ops *ops, + u32 region_max_snapshots, u64 region_size); void dsa_devlink_region_destroy(struct devlink_region *region); struct dsa_port *dsa_port_from_netdev(struct net_device *netdev); -- cgit v1.2.3 From 7d1e2a10681d3b6eeaace68885ef5de88ce03efe Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sun, 4 Oct 2020 18:12:56 +0200 Subject: net: dsa: Add helper for converting devlink port to ds and port Hide away from DSA drivers how devlink works. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Reviewed-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/net/dsa.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index ca426cf9927b..c0185660881c 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -701,6 +701,20 @@ static inline struct dsa_switch *dsa_devlink_to_ds(struct devlink *dl) return dl_priv->ds; } +static inline +struct dsa_switch *dsa_devlink_port_to_ds(struct devlink_port *port) +{ + struct devlink *dl = port->devlink; + struct dsa_devlink_priv *dl_priv = devlink_priv(dl); + + return dl_priv->ds; +} + +static inline int dsa_devlink_port_to_port(struct devlink_port *port) +{ + return port->index; +} + struct dsa_switch_driver { struct list_head list; const struct dsa_switch_ops *ops; -- cgit v1.2.3 From 2e554a7a5d8a8092ecb20c547734bb33fddd5046 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 3 Oct 2020 01:06:46 +0300 Subject: net: dsa: propagate switchdev vlan_filtering prepare phase to drivers A driver may refuse to enable VLAN filtering for any reason beyond what the DSA framework cares about, such as: - having tc-flower rules that rely on the switch being VLAN-aware - the particular switch does not support VLAN, even if the driver does (the DSA framework just checks for the presence of the .port_vlan_add and .port_vlan_del pointers) - simply not supporting this configuration to be toggled at runtime Currently, when a driver rejects a configuration it cannot support, it does this from the commit phase, which triggers various warnings in switchdev. So propagate the prepare phase to drivers, to give them the ability to refuse invalid configurations cleanly and avoid the warnings. Since we need to modify all function prototypes and check for the prepare phase from within the drivers, take that opportunity and move the existing driver restrictions within the prepare phase where that is possible and easy. Cc: Florian Fainelli Cc: Martin Blumenstingl Cc: Hauke Mehrtens Cc: Woojung Huh Cc: Microchip Linux Driver Support Cc: Sean Wang Cc: Landen Chao Cc: Andrew Lunn Cc: Vivien Didelot Cc: Jonathan McDowell Cc: Linus Walleij Cc: Alexandre Belloni Cc: Claudiu Manoil Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/net/dsa.h | 3 ++- include/soc/mscc/ocelot.h | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index c0185660881c..35429a140dfa 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -552,7 +552,8 @@ struct dsa_switch_ops { * VLAN support */ int (*port_vlan_filtering)(struct dsa_switch *ds, int port, - bool vlan_filtering); + bool vlan_filtering, + struct switchdev_trans *trans); int (*port_vlan_prepare)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan); void (*port_vlan_add)(struct dsa_switch *ds, int port, diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 46608494616f..1e9db9577441 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -730,8 +730,8 @@ int ocelot_get_ts_info(struct ocelot *ocelot, int port, void ocelot_set_ageing_time(struct ocelot *ocelot, unsigned int msecs); void ocelot_adjust_link(struct ocelot *ocelot, int port, struct phy_device *phydev); -void ocelot_port_vlan_filtering(struct ocelot *ocelot, int port, - bool vlan_aware); +int ocelot_port_vlan_filtering(struct ocelot *ocelot, int port, bool enabled, + struct switchdev_trans *trans); void ocelot_bridge_stp_state_set(struct ocelot *ocelot, int port, u8 state); int ocelot_port_bridge_join(struct ocelot *ocelot, int port, struct net_device *bridge); -- cgit v1.2.3 From c6db31ffe202c3120147e9f3455a4dbc90546d39 Mon Sep 17 00:00:00 2001 From: Igor Russkikh Date: Mon, 5 Oct 2020 18:39:37 +0300 Subject: ethtool: allow netdev driver to define phy tunables Define get/set phy tunable callbacks in ethtool ops. This will allow MAC drivers with integrated PHY still to implement these tunables. Reviewed-by: Andrew Lunn Signed-off-by: Igor Russkikh Signed-off-by: David S. Miller --- include/linux/ethtool.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 060b20f0b20f..6408b446051f 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -505,6 +505,10 @@ struct ethtool_ops { struct ethtool_fecparam *); void (*get_ethtool_phy_stats)(struct net_device *, struct ethtool_stats *, u64 *); + int (*get_phy_tunable)(struct net_device *, + const struct ethtool_tunable *, void *); + int (*set_phy_tunable)(struct net_device *, + const struct ethtool_tunable *, const void *); }; int ethtool_check_ops(const struct ethtool_ops *ops); -- cgit v1.2.3 From 451b05f413d3fd89ee84ae99a5029f619ed3cb78 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 5 Oct 2020 22:34:18 +0200 Subject: net: netdevice.h: sw_netstats_rx_add helper some drivers/network protocols update rx bytes/packets under u64_stats_update_begin/end sequence. Add a specific helper like dev_lstats_add() Signed-off-by: Fabian Frederick Signed-off-by: David S. Miller --- include/linux/netdevice.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d126e36580c9..a0df43b13839 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2543,6 +2543,16 @@ struct pcpu_lstats { void dev_lstats_read(struct net_device *dev, u64 *packets, u64 *bytes); +static inline void dev_sw_netstats_rx_add(struct net_device *dev, unsigned int len) +{ + struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats); + + u64_stats_update_begin(&tstats->syncp); + tstats->rx_bytes += len; + tstats->rx_packets++; + u64_stats_update_end(&tstats->syncp); +} + static inline void dev_lstats_add(struct net_device *dev, unsigned int len) { struct pcpu_lstats *lstats = this_cpu_ptr(dev->lstats); -- cgit v1.2.3 From ddcf3b70c5ae8444e920d28e30e7ad4e866c8015 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 5 Oct 2020 15:07:37 -0700 Subject: netlink: create helpers for checking type is an int There's a number of policies which check if type is a uint or sint. Factor the checking against the list of value sizes to a helper for easier reuse. v2: - new patch Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/netlink.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/netlink.h b/include/net/netlink.h index 5a5ff97cc596..c5aa46f379bc 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -362,20 +362,21 @@ struct nla_policy { #define NLA_POLICY_BITFIELD32(valid) \ { .type = NLA_BITFIELD32, .bitfield32_valid = valid } +#define __NLA_IS_UINT_TYPE(tp) \ + (tp == NLA_U8 || tp == NLA_U16 || tp == NLA_U32 || tp == NLA_U64) +#define __NLA_IS_SINT_TYPE(tp) \ + (tp == NLA_S8 || tp == NLA_S16 || tp == NLA_S32 || tp == NLA_S64) + #define __NLA_ENSURE(condition) BUILD_BUG_ON_ZERO(!(condition)) #define NLA_ENSURE_UINT_OR_BINARY_TYPE(tp) \ - (__NLA_ENSURE(tp == NLA_U8 || tp == NLA_U16 || \ - tp == NLA_U32 || tp == NLA_U64 || \ + (__NLA_ENSURE(__NLA_IS_UINT_TYPE(tp) || \ tp == NLA_MSECS || \ tp == NLA_BINARY) + tp) #define NLA_ENSURE_SINT_TYPE(tp) \ - (__NLA_ENSURE(tp == NLA_S8 || tp == NLA_S16 || \ - tp == NLA_S32 || tp == NLA_S64) + tp) + (__NLA_ENSURE(__NLA_IS_SINT_TYPE(tp)) + tp) #define NLA_ENSURE_INT_OR_BINARY_TYPE(tp) \ - (__NLA_ENSURE(tp == NLA_S8 || tp == NLA_U8 || \ - tp == NLA_S16 || tp == NLA_U16 || \ - tp == NLA_S32 || tp == NLA_U32 || \ - tp == NLA_S64 || tp == NLA_U64 || \ + (__NLA_ENSURE(__NLA_IS_UINT_TYPE(tp) || \ + __NLA_IS_SINT_TYPE(tp) || \ tp == NLA_MSECS || \ tp == NLA_BINARY) + tp) #define NLA_ENSURE_NO_VALIDATION_PTR(tp) \ -- cgit v1.2.3 From bdbb4e29df8b790db50cb73ce25d23543329f05f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 5 Oct 2020 15:07:38 -0700 Subject: netlink: add mask validation We don't have good validation policy for existing unsigned int attrs which serve as flags (for new ones we could use NLA_BITFIELD32). With increased use of policy dumping having the validation be expressed as part of the policy is important. Add validation policy in form of a mask of supported/valid bits. Support u64 in the uAPI to be future-proof, but really for now the embedded mask member can only hold 32 bits, so anything with bit 32+ set will always fail validation. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/netlink.h | 10 ++++++++++ include/uapi/linux/netlink.h | 2 ++ 2 files changed, 12 insertions(+) (limited to 'include') diff --git a/include/net/netlink.h b/include/net/netlink.h index c5aa46f379bc..2b9e41075f19 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -200,6 +200,7 @@ enum nla_policy_validation { NLA_VALIDATE_RANGE_WARN_TOO_LONG, NLA_VALIDATE_MIN, NLA_VALIDATE_MAX, + NLA_VALIDATE_MASK, NLA_VALIDATE_RANGE_PTR, NLA_VALIDATE_FUNCTION, }; @@ -317,6 +318,7 @@ struct nla_policy { u16 len; union { const u32 bitfield32_valid; + const u32 mask; const char *reject_message; const struct nla_policy *nested_policy; struct netlink_range_validation *range; @@ -368,6 +370,8 @@ struct nla_policy { (tp == NLA_S8 || tp == NLA_S16 || tp == NLA_S32 || tp == NLA_S64) #define __NLA_ENSURE(condition) BUILD_BUG_ON_ZERO(!(condition)) +#define NLA_ENSURE_UINT_TYPE(tp) \ + (__NLA_ENSURE(__NLA_IS_UINT_TYPE(tp)) + tp) #define NLA_ENSURE_UINT_OR_BINARY_TYPE(tp) \ (__NLA_ENSURE(__NLA_IS_UINT_TYPE(tp) || \ tp == NLA_MSECS || \ @@ -416,6 +420,12 @@ struct nla_policy { .max = _max, \ } +#define NLA_POLICY_MASK(tp, _mask) { \ + .type = NLA_ENSURE_UINT_TYPE(tp), \ + .validation_type = NLA_VALIDATE_MASK, \ + .mask = _mask, \ +} + #define NLA_POLICY_VALIDATE_FN(tp, fn, ...) { \ .type = NLA_ENSURE_NO_VALIDATION_PTR(tp), \ .validation_type = NLA_VALIDATE_FUNCTION, \ diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index eac8a6a648ea..d02e472ba54c 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -331,6 +331,7 @@ enum netlink_attribute_type { * the index, if limited inside the nesting (U32) * @NL_POLICY_TYPE_ATTR_BITFIELD32_MASK: valid mask for the * bitfield32 type (U32) + * @NL_POLICY_TYPE_ATTR_MASK: mask of valid bits for unsigned integers (U64) * @NL_POLICY_TYPE_ATTR_PAD: pad attribute for 64-bit alignment */ enum netlink_policy_type_attr { @@ -346,6 +347,7 @@ enum netlink_policy_type_attr { NL_POLICY_TYPE_ATTR_POLICY_MAXTYPE, NL_POLICY_TYPE_ATTR_BITFIELD32_MASK, NL_POLICY_TYPE_ATTR_PAD, + NL_POLICY_TYPE_ATTR_MASK, /* keep last */ __NL_POLICY_TYPE_ATTR_MAX, -- cgit v1.2.3 From eb88531bdbfaafb827192d1fc6c5a3fcc4fadd96 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Sun, 27 Sep 2020 01:24:31 +0900 Subject: can: raw: add missing error queue support Error queue are not yet implemented in CAN-raw sockets. The problem: a userland call to recvmsg(soc, msg, MSG_ERRQUEUE) on a CAN-raw socket would unqueue messages from the normal queue without any kind of error or warning. As such, it prevented CAN drivers from using the functionalities that relies on the error queue such as skb_tx_timestamp(). SCM_CAN_RAW_ERRQUEUE is defined as the type for the CAN raw error queue. SCM stands for "Socket control messages". The name is inspired from SCM_J1939_ERRQUEUE of include/uapi/linux/can/j1939.h. Signed-off-by: Vincent Mailhol Link: https://lore.kernel.org/r/20200926162527.270030-1-mailhol.vincent@wanadoo.fr Acked-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- include/uapi/linux/can/raw.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/can/raw.h b/include/uapi/linux/can/raw.h index 6a11d308eb5c..3386aa81fdf2 100644 --- a/include/uapi/linux/can/raw.h +++ b/include/uapi/linux/can/raw.h @@ -49,6 +49,9 @@ #include #define SOL_CAN_RAW (SOL_CAN_BASE + CAN_RAW) +enum { + SCM_CAN_RAW_ERRQUEUE = 1, +}; /* for socket options affecting the socket (not the global system) */ -- cgit v1.2.3 From f55a52bb2cdbc5a92ca209d0ada90a490a188f58 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Sat, 3 Oct 2020 00:41:46 +0900 Subject: can: dev: fix type of get_can_dlc() and get_canfd_dlc() macros The macros get_can_dlc() and get_canfd_dlc() are not visible in userland. As such, type u8 should be preferred over type __u8. Reference: https://lkml.org/lkml/2020/10/1/708 Signed-off-by: Vincent Mailhol Link: https://lore.kernel.org/r/20201002154219.4887-3-mailhol.vincent@wanadoo.fr Signed-off-by: Marc Kleine-Budde --- include/linux/can/dev.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index ed0482b2f4b2..f8975d0b90bb 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -84,13 +84,13 @@ struct can_priv { /* * get_can_dlc(value) - helper macro to cast a given data length code (dlc) - * to __u8 and ensure the dlc value to be max. 8 bytes. + * to u8 and ensure the dlc value to be max. 8 bytes. * * To be used in the CAN netdriver receive path to ensure conformance with * ISO 11898-1 Chapter 8.4.2.3 (DLC field) */ -#define get_can_dlc(i) (min_t(__u8, (i), CAN_MAX_DLC)) -#define get_canfd_dlc(i) (min_t(__u8, (i), CANFD_MAX_DLC)) +#define get_can_dlc(i) (min_t(u8, (i), CAN_MAX_DLC)) +#define get_canfd_dlc(i) (min_t(u8, (i), CANFD_MAX_DLC)) /* Check for outgoing skbs that have not been created by the CAN subsystem */ static inline bool can_skb_headroom_valid(struct net_device *dev, -- cgit v1.2.3 From 49f3d12b0f70ea867b891ad2a97f6e51bb564e18 Mon Sep 17 00:00:00 2001 From: Jakub Wilk Date: Wed, 7 Oct 2020 07:57:17 +0200 Subject: bpf: Fix typo in uapi/linux/bpf.h Reported-by: Samanta Navarro Signed-off-by: Jakub Wilk Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20201007055717.7319-1-jwilk@jwilk.net --- include/uapi/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c446394135be..d83561e8cd2c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2253,7 +2253,7 @@ union bpf_attr { * Description * This helper is used in programs implementing policies at the * skb socket level. If the sk_buff *skb* is allowed to pass (i.e. - * if the verdeict eBPF program returns **SK_PASS**), redirect it + * if the verdict eBPF program returns **SK_PASS**), redirect it * to the socket referenced by *map* (of type * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and * egress interfaces can be used for redirection. The -- cgit v1.2.3 From 1c47fa6b31c2683f03bc2f9174902bb7dcd35d83 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Sat, 3 Oct 2020 00:41:49 +0900 Subject: can: dev: add a helper function to calculate the duration of one bit Rename macro CAN_CALC_SYNC_SEG to CAN_SYNC_SEG and make it available through include/linux/can/dev.h Add an helper function can_bit_time() which returns the duration (in time quanta) of one CAN bit. Rationale for this patch: the sync segment and the bit time are two concepts which are defined in the CAN ISO standard. Device drivers for CAN might need those. Please refer to ISO 11898-1:2015, section 11.3.1.1 "Bit time" for additional information. Signed-off-by: Vincent Mailhol Link: https://lore.kernel.org/r/20201002154219.4887-6-mailhol.vincent@wanadoo.fr [mkl: Let can_bit_time() return an unsinged int, make argument const] Signed-off-by: Marc Kleine-Budde --- include/linux/can/dev.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index f8975d0b90bb..41ff31795320 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -82,6 +82,21 @@ struct can_priv { #endif }; +#define CAN_SYNC_SEG 1 + +/* + * can_bit_time() - Duration of one bit + * + * Please refer to ISO 11898-1:2015, section 11.3.1.1 "Bit time" for + * additional information. + * + * Return: the number of time quanta in one bit. + */ +static inline unsigned int can_bit_time(const struct can_bittiming *bt) +{ + return CAN_SYNC_SEG + bt->prop_seg + bt->phase_seg1 + bt->phase_seg2; +} + /* * get_can_dlc(value) - helper macro to cast a given data length code (dlc) * to u8 and ensure the dlc value to be max. 8 bytes. -- cgit v1.2.3 From e057dd3fc20ffb3d7f150af46542a51b59b90127 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Mon, 28 Sep 2020 22:04:04 +0200 Subject: can: add ISO 15765-2:2016 transport protocol CAN Transport Protocols offer support for segmented Point-to-Point communication between CAN nodes via two defined CAN Identifiers. As CAN frames can only transport a small amount of data bytes (max. 8 bytes for 'classic' CAN and max. 64 bytes for CAN FD) this segmentation is needed to transport longer PDUs as needed e.g. for vehicle diagnosis (UDS, ISO 14229) or IP-over-CAN traffic. This protocol driver implements data transfers according to ISO 15765-2:2016 for 'classic' CAN and CAN FD frame types. Signed-off-by: Oliver Hartkopp Link: https://lore.kernel.org/r/20200928200404.82229-1-socketcan@hartkopp.net [mkl: Removed "WITH Linux-syscall-note" from isotp.c. Fixed indention, a checkpatch warning and typos. Replaced __u{8,32} by u{8,32}. Removed always false (optlen < 0) check in isotp_setsockopt().] Signed-off-by: Marc Kleine-Budde --- include/uapi/linux/can/isotp.h | 166 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 166 insertions(+) create mode 100644 include/uapi/linux/can/isotp.h (limited to 'include') diff --git a/include/uapi/linux/can/isotp.h b/include/uapi/linux/can/isotp.h new file mode 100644 index 000000000000..553006509f4e --- /dev/null +++ b/include/uapi/linux/can/isotp.h @@ -0,0 +1,166 @@ +/* SPDX-License-Identifier: ((GPL-2.0-only WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* + * linux/can/isotp.h + * + * Definitions for isotp CAN sockets (ISO 15765-2:2016) + * + * Copyright (c) 2020 Volkswagen Group Electronic Research + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of Volkswagen nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * Alternatively, provided that this notice is retained in full, this + * software may be distributed under the terms of the GNU General + * Public License ("GPL") version 2, in which case the provisions of the + * GPL apply INSTEAD OF those given above. + * + * The provided data structures and external interfaces from this code + * are not restricted to be used by modules with a GPL compatible license. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + */ + +#ifndef _UAPI_CAN_ISOTP_H +#define _UAPI_CAN_ISOTP_H + +#include +#include + +#define SOL_CAN_ISOTP (SOL_CAN_BASE + CAN_ISOTP) + +/* for socket options affecting the socket (not the global system) */ + +#define CAN_ISOTP_OPTS 1 /* pass struct can_isotp_options */ + +#define CAN_ISOTP_RECV_FC 2 /* pass struct can_isotp_fc_options */ + +/* sockopts to force stmin timer values for protocol regression tests */ + +#define CAN_ISOTP_TX_STMIN 3 /* pass __u32 value in nano secs */ + /* use this time instead of value */ + /* provided in FC from the receiver */ + +#define CAN_ISOTP_RX_STMIN 4 /* pass __u32 value in nano secs */ + /* ignore received CF frames which */ + /* timestamps differ less than val */ + +#define CAN_ISOTP_LL_OPTS 5 /* pass struct can_isotp_ll_options */ + +struct can_isotp_options { + + __u32 flags; /* set flags for isotp behaviour. */ + /* __u32 value : flags see below */ + + __u32 frame_txtime; /* frame transmission time (N_As/N_Ar) */ + /* __u32 value : time in nano secs */ + + __u8 ext_address; /* set address for extended addressing */ + /* __u8 value : extended address */ + + __u8 txpad_content; /* set content of padding byte (tx) */ + /* __u8 value : content on tx path */ + + __u8 rxpad_content; /* set content of padding byte (rx) */ + /* __u8 value : content on rx path */ + + __u8 rx_ext_address; /* set address for extended addressing */ + /* __u8 value : extended address (rx) */ +}; + +struct can_isotp_fc_options { + + __u8 bs; /* blocksize provided in FC frame */ + /* __u8 value : blocksize. 0 = off */ + + __u8 stmin; /* separation time provided in FC frame */ + /* __u8 value : */ + /* 0x00 - 0x7F : 0 - 127 ms */ + /* 0x80 - 0xF0 : reserved */ + /* 0xF1 - 0xF9 : 100 us - 900 us */ + /* 0xFA - 0xFF : reserved */ + + __u8 wftmax; /* max. number of wait frame transmiss. */ + /* __u8 value : 0 = omit FC N_PDU WT */ +}; + +struct can_isotp_ll_options { + + __u8 mtu; /* generated & accepted CAN frame type */ + /* __u8 value : */ + /* CAN_MTU (16) -> standard CAN 2.0 */ + /* CANFD_MTU (72) -> CAN FD frame */ + + __u8 tx_dl; /* tx link layer data length in bytes */ + /* (configured maximum payload length) */ + /* __u8 value : 8,12,16,20,24,32,48,64 */ + /* => rx path supports all LL_DL values */ + + __u8 tx_flags; /* set into struct canfd_frame.flags */ + /* at frame creation: e.g. CANFD_BRS */ + /* Obsolete when the BRS flag is fixed */ + /* by the CAN netdriver configuration */ +}; + +/* flags for isotp behaviour */ + +#define CAN_ISOTP_LISTEN_MODE 0x001 /* listen only (do not send FC) */ +#define CAN_ISOTP_EXTEND_ADDR 0x002 /* enable extended addressing */ +#define CAN_ISOTP_TX_PADDING 0x004 /* enable CAN frame padding tx path */ +#define CAN_ISOTP_RX_PADDING 0x008 /* enable CAN frame padding rx path */ +#define CAN_ISOTP_CHK_PAD_LEN 0x010 /* check received CAN frame padding */ +#define CAN_ISOTP_CHK_PAD_DATA 0x020 /* check received CAN frame padding */ +#define CAN_ISOTP_HALF_DUPLEX 0x040 /* half duplex error state handling */ +#define CAN_ISOTP_FORCE_TXSTMIN 0x080 /* ignore stmin from received FC */ +#define CAN_ISOTP_FORCE_RXSTMIN 0x100 /* ignore CFs depending on rx stmin */ +#define CAN_ISOTP_RX_EXT_ADDR 0x200 /* different rx extended addressing */ +#define CAN_ISOTP_WAIT_TX_DONE 0x400 /* wait for tx completion */ + + +/* default values */ + +#define CAN_ISOTP_DEFAULT_FLAGS 0 +#define CAN_ISOTP_DEFAULT_EXT_ADDRESS 0x00 +#define CAN_ISOTP_DEFAULT_PAD_CONTENT 0xCC /* prevent bit-stuffing */ +#define CAN_ISOTP_DEFAULT_FRAME_TXTIME 0 +#define CAN_ISOTP_DEFAULT_RECV_BS 0 +#define CAN_ISOTP_DEFAULT_RECV_STMIN 0x00 +#define CAN_ISOTP_DEFAULT_RECV_WFTMAX 0 + +#define CAN_ISOTP_DEFAULT_LL_MTU CAN_MTU +#define CAN_ISOTP_DEFAULT_LL_TX_DL CAN_MAX_DLEN +#define CAN_ISOTP_DEFAULT_LL_TX_FLAGS 0 + +/* + * Remark on CAN_ISOTP_DEFAULT_RECV_* values: + * + * We can strongly assume, that the Linux Kernel implementation of + * CAN_ISOTP is capable to run with BS=0, STmin=0 and WFTmax=0. + * But as we like to be able to behave as a commonly available ECU, + * these default settings can be changed via sockopts. + * For that reason the STmin value is intentionally _not_ checked for + * consistency and copied directly into the flow control (FC) frame. + * + */ + +#endif /* !_UAPI_CAN_ISOTP_H */ -- cgit v1.2.3 From ba6ff70a3bb76c1ff440d3a0044b82e97abb648f Mon Sep 17 00:00:00 2001 From: Rajkumar Manoharan Date: Sat, 3 Oct 2020 15:04:18 -0700 Subject: mac80211: copy configured beacon tx rate to driver The user is allowed to change beacon tx rate (HT/VHT/HE) from hostapd. This information needs to be passed to the driver when the rate control is offloaded to the firmware. The driver capability of allowing beacon rate is already validated in cfg80211, so simply passing the rate information to the driver is enough. Signed-off-by: Rajkumar Manoharan Link: https://lore.kernel.org/r/1601762658-15627-1-git-send-email-rmanohar@codeaurora.org [adjust commit message slightly] Signed-off-by: Johannes Berg --- include/net/mac80211.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 4747d446179a..e8e295dae744 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -628,6 +628,8 @@ struct ieee80211_fils_discovery { * @unsol_bcast_probe_resp_interval: Unsolicited broadcast probe response * interval. * @s1g: BSS is S1G BSS (affects Association Request format). + * @beacon_tx_rate: The configured beacon transmit rate that needs to be passed + * to driver when rate control is offloaded to firmware. */ struct ieee80211_bss_conf { const u8 *bssid; @@ -698,6 +700,7 @@ struct ieee80211_bss_conf { struct ieee80211_fils_discovery fils_discovery; u32 unsol_bcast_probe_resp_interval; bool s1g; + struct cfg80211_bitrate_mask beacon_tx_rate; }; /** -- cgit v1.2.3 From 846e463a70e910f2a831aea19f9a361422a2ff5b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 7 Oct 2020 09:51:11 -0700 Subject: net/sched: get rid of qdisc->padded kmalloc() of sufficiently big portion of memory is cache-aligned in regular conditions. If some debugging options are used, there is no reason qdisc structures would need 64-byte alignment if most other kernel structures are not aligned. This get rid of QDISC_ALIGN and QDISC_ALIGNTO. Addition of privdata field will help implementing the reverse of qdisc_priv() and documents where the private data is. Signed-off-by: Eric Dumazet Cc: Allen Pais Acked-by: Cong Wang Signed-off-by: Jakub Kicinski --- include/net/pkt_sched.h | 5 +---- include/net/sch_generic.h | 5 ++++- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index ac8c890a2657..4ed32e6b0201 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -19,12 +19,9 @@ struct qdisc_walker { int (*fn)(struct Qdisc *, unsigned long cl, struct qdisc_walker *); }; -#define QDISC_ALIGNTO 64 -#define QDISC_ALIGN(len) (((len) + QDISC_ALIGNTO-1) & ~(QDISC_ALIGNTO-1)) - static inline void *qdisc_priv(struct Qdisc *q) { - return (char *) q + QDISC_ALIGN(sizeof(struct Qdisc)); + return &q->privdata; } /* diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 6c762457122f..d8fd8676fc72 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -91,7 +91,7 @@ struct Qdisc { struct net_rate_estimator __rcu *rate_est; struct gnet_stats_basic_cpu __percpu *cpu_bstats; struct gnet_stats_queue __percpu *cpu_qstats; - int padded; + int pad; refcount_t refcnt; /* @@ -112,6 +112,9 @@ struct Qdisc { /* for NOLOCK qdisc, true if there are no enqueued skbs */ bool empty; struct rcu_head rcu; + + /* private data */ + long privdata[] ____cacheline_aligned; }; static inline void qdisc_refcount_inc(struct Qdisc *qdisc) -- cgit v1.2.3 From eca43ee6c46db92dd850ce659316b0680d70e137 Mon Sep 17 00:00:00 2001 From: "Nikita V. Shirokov" Date: Fri, 9 Oct 2020 07:03:25 +0000 Subject: bpf: Add tcp_notsent_lowat bpf setsockopt Adding support for TCP_NOTSENT_LOWAT sockoption (https://lwn.net/Articles/560082/) in tcp bpf programs. Signed-off-by: Nikita V. Shirokov Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20201009070325.226855-1-tehnerd@tehnerd.com --- include/uapi/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d83561e8cd2c..42d2df799397 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1698,7 +1698,7 @@ union bpf_attr { * **TCP_CONGESTION**, **TCP_BPF_IW**, * **TCP_BPF_SNDCWND_CLAMP**, **TCP_SAVE_SYN**, * **TCP_KEEPIDLE**, **TCP_KEEPINTVL**, **TCP_KEEPCNT**, - * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**. + * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**. * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**. * Return -- cgit v1.2.3 From ccdf07219da6bd1f43c6ddcde4c0e36993c7365a Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Wed, 7 Oct 2020 09:00:43 +0300 Subject: devlink: Add reload action option to devlink reload command Add devlink reload action to allow the user to request a specific reload action. The action parameter is optional, if not specified then devlink driver re-init action is used (backward compatible). Note that when required to do firmware activation some drivers may need to reload the driver. On the other hand some drivers may need to reset the firmware to reinitialize the driver entities. Therefore, the devlink reload command returns the actions which were actually performed. Reload actions supported are: driver_reinit: driver entities re-initialization, applying devlink-param and devlink-resource values. fw_activate: firmware activate. command examples: $devlink dev reload pci/0000:82:00.0 action driver_reinit reload_actions_performed: driver_reinit $devlink dev reload pci/0000:82:00.0 action fw_activate reload_actions_performed: driver_reinit fw_activate Signed-off-by: Moshe Shemesh Reviewed-by: Jakub Kicinski Reviewed-by: Jacob Keller Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 7 ++++--- include/uapi/linux/devlink.h | 13 +++++++++++++ 2 files changed, 17 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 237ba5e29a3b..93c535ae5a4b 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1150,10 +1150,11 @@ struct devlink_ops { * implemementation. */ u32 supported_flash_update_params; + unsigned long reload_actions; int (*reload_down)(struct devlink *devlink, bool netns_change, - struct netlink_ext_ack *extack); - int (*reload_up)(struct devlink *devlink, - struct netlink_ext_ack *extack); + enum devlink_reload_action action, struct netlink_ext_ack *extack); + int (*reload_up)(struct devlink *devlink, enum devlink_reload_action action, + u32 *actions_performed, struct netlink_ext_ack *extack); int (*port_type_set)(struct devlink_port *devlink_port, enum devlink_port_type port_type); int (*port_split)(struct devlink *devlink, unsigned int port_index, diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 5f1d6c327670..74bdad252c36 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -301,6 +301,16 @@ enum { DEVLINK_ATTR_TRAP_METADATA_TYPE_FA_COOKIE, }; +enum devlink_reload_action { + DEVLINK_RELOAD_ACTION_UNSPEC, + DEVLINK_RELOAD_ACTION_DRIVER_REINIT, /* Driver entities re-instantiation */ + DEVLINK_RELOAD_ACTION_FW_ACTIVATE, /* FW activate */ + + /* Add new reload actions above */ + __DEVLINK_RELOAD_ACTION_MAX, + DEVLINK_RELOAD_ACTION_MAX = __DEVLINK_RELOAD_ACTION_MAX - 1 +}; + enum devlink_attr { /* don't change the order or add anything between, this is ABI! */ DEVLINK_ATTR_UNSPEC, @@ -493,6 +503,9 @@ enum devlink_attr { DEVLINK_ATTR_FLASH_UPDATE_STATUS_TIMEOUT, /* u64 */ DEVLINK_ATTR_FLASH_UPDATE_OVERWRITE_MASK, /* bitfield32 */ + DEVLINK_ATTR_RELOAD_ACTION, /* u8 */ + DEVLINK_ATTR_RELOAD_ACTIONS_PERFORMED, /* bitfield32 */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, -- cgit v1.2.3 From dc64cc7c63102ac78bac3cfbc00ef3abd7a3fdf3 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Wed, 7 Oct 2020 09:00:44 +0300 Subject: devlink: Add devlink reload limit option Add reload limit to demand restrictions on reload actions. Reload limits supported: no_reset: No reset allowed, no down time allowed, no link flap and no configuration is lost. By default reload limit is unspecified and so no constraints on reload actions are required. Some combinations of action and limit are invalid. For example, driver can not reinitialize its entities without any downtime. The no_reset reload limit will have usecase in this patchset to implement restricted fw_activate on mlx5. Have the uapi parameter of reload limit ready for future support of multiselection. Signed-off-by: Moshe Shemesh Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 8 ++++++-- include/uapi/linux/devlink.h | 14 ++++++++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 93c535ae5a4b..9f5c37c391f8 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1151,10 +1151,14 @@ struct devlink_ops { */ u32 supported_flash_update_params; unsigned long reload_actions; + unsigned long reload_limits; int (*reload_down)(struct devlink *devlink, bool netns_change, - enum devlink_reload_action action, struct netlink_ext_ack *extack); + enum devlink_reload_action action, + enum devlink_reload_limit limit, + struct netlink_ext_ack *extack); int (*reload_up)(struct devlink *devlink, enum devlink_reload_action action, - u32 *actions_performed, struct netlink_ext_ack *extack); + enum devlink_reload_limit limit, u32 *actions_performed, + struct netlink_ext_ack *extack); int (*port_type_set)(struct devlink_port *devlink_port, enum devlink_port_type port_type); int (*port_split)(struct devlink *devlink, unsigned int port_index, diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 74bdad252c36..82a5e66c1518 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -311,6 +311,19 @@ enum devlink_reload_action { DEVLINK_RELOAD_ACTION_MAX = __DEVLINK_RELOAD_ACTION_MAX - 1 }; +enum devlink_reload_limit { + DEVLINK_RELOAD_LIMIT_UNSPEC, /* unspecified, no constraints */ + DEVLINK_RELOAD_LIMIT_NO_RESET, /* No reset allowed, no down time allowed, + * no link flap and no configuration is lost. + */ + + /* Add new reload limit above */ + __DEVLINK_RELOAD_LIMIT_MAX, + DEVLINK_RELOAD_LIMIT_MAX = __DEVLINK_RELOAD_LIMIT_MAX - 1 +}; + +#define DEVLINK_RELOAD_LIMITS_VALID_MASK (BIT(__DEVLINK_RELOAD_LIMIT_MAX) - 1) + enum devlink_attr { /* don't change the order or add anything between, this is ABI! */ DEVLINK_ATTR_UNSPEC, @@ -505,6 +518,7 @@ enum devlink_attr { DEVLINK_ATTR_RELOAD_ACTION, /* u8 */ DEVLINK_ATTR_RELOAD_ACTIONS_PERFORMED, /* bitfield32 */ + DEVLINK_ATTR_RELOAD_LIMITS, /* bitfield32 */ /* add new attributes above here, update the policy in devlink.c */ -- cgit v1.2.3 From a254c264267e8746fb257806c166e54375cf9c06 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Wed, 7 Oct 2020 09:00:45 +0300 Subject: devlink: Add reload stats Add reload stats to hold the history per reload action type and limit. For example, the number of times fw_activate has been performed on this device since the driver module was added or if the firmware activation was performed with or without reset. Add devlink notification on stats update. Expose devlink reload stats to the user through devlink dev get command. Examples: $ devlink dev show pci/0000:82:00.0: stats: reload: driver_reinit 2 fw_activate 1 fw_activate_no_reset 0 pci/0000:82:00.1: stats: reload: driver_reinit 1 fw_activate 0 fw_activate_no_reset 0 $ devlink dev show -jp { "dev": { "pci/0000:82:00.0": { "stats": { "reload": { "driver_reinit": 2, "fw_activate": 1, "fw_activate_no_reset": 0 } } }, "pci/0000:82:00.1": { "stats": { "reload": { "driver_reinit": 1, "fw_activate": 0, "fw_activate_no_reset": 0 } } } } } Signed-off-by: Moshe Shemesh Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 8 ++++++++ include/uapi/linux/devlink.h | 6 ++++++ 2 files changed, 14 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 9f5c37c391f8..d091c6ba82ce 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -20,6 +20,13 @@ #include #include +#define DEVLINK_RELOAD_STATS_ARRAY_SIZE \ + (__DEVLINK_RELOAD_LIMIT_MAX * __DEVLINK_RELOAD_ACTION_MAX) + +struct devlink_dev_stats { + u32 reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE]; +}; + struct devlink_ops; struct devlink { @@ -38,6 +45,7 @@ struct devlink { struct list_head trap_policer_list; const struct devlink_ops *ops; struct xarray snapshot_ids; + struct devlink_dev_stats stats; struct device *dev; possible_net_t _net; struct mutex lock; /* Serializes access to devlink instance specific objects such as diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 82a5e66c1518..ab15fc597b74 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -520,6 +520,12 @@ enum devlink_attr { DEVLINK_ATTR_RELOAD_ACTIONS_PERFORMED, /* bitfield32 */ DEVLINK_ATTR_RELOAD_LIMITS, /* bitfield32 */ + DEVLINK_ATTR_DEV_STATS, /* nested */ + DEVLINK_ATTR_RELOAD_STATS, /* nested */ + DEVLINK_ATTR_RELOAD_STATS_ENTRY, /* nested */ + DEVLINK_ATTR_RELOAD_STATS_LIMIT, /* u8 */ + DEVLINK_ATTR_RELOAD_STATS_VALUE, /* u32 */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, -- cgit v1.2.3 From 77069ba2e3adf48c472fbbd9cbd7a4f5370b17df Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Wed, 7 Oct 2020 09:00:46 +0300 Subject: devlink: Add remote reload stats Add remote reload stats to hold the history of actions performed due devlink reload commands initiated by remote host. For example, in case firmware activation with reset finished successfully but was initiated by remote host. The function devlink_remote_reload_actions_performed() is exported to enable drivers update on remote reload actions performed as it was not initiated by their own devlink instance. Expose devlink remote reload stats to the user through devlink dev get command. Examples: $ devlink dev show pci/0000:82:00.0: stats: reload: driver_reinit 2 fw_activate 1 fw_activate_no_reset 0 remote_reload: driver_reinit 0 fw_activate 0 fw_activate_no_reset 0 pci/0000:82:00.1: stats: reload: driver_reinit 1 fw_activate 0 fw_activate_no_reset 0 remote_reload: driver_reinit 1 fw_activate 1 fw_activate_no_reset 0 $ devlink dev show -jp { "dev": { "pci/0000:82:00.0": { "stats": { "reload": { "driver_reinit": 2, "fw_activate": 1, "fw_activate_no_reset": 0 }, "remote_reload": { "driver_reinit": 0, "fw_activate": 0, "fw_activate_no_reset": 0 } } }, "pci/0000:82:00.1": { "stats": { "reload": { "driver_reinit": 1, "fw_activate": 0, "fw_activate_no_reset": 0 }, "remote_reload": { "driver_reinit": 1, "fw_activate": 1, "fw_activate_no_reset": 0 } } } } } Signed-off-by: Moshe Shemesh Reviewed-by: Jakub Kicinski Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 4 ++++ include/uapi/linux/devlink.h | 1 + 2 files changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index d091c6ba82ce..d2771e57a278 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -25,6 +25,7 @@ struct devlink_dev_stats { u32 reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE]; + u32 remote_reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE]; }; struct devlink_ops; @@ -1567,6 +1568,9 @@ void devlink_health_reporter_recovery_done(struct devlink_health_reporter *reporter); bool devlink_is_reload_failed(const struct devlink *devlink); +void devlink_remote_reload_actions_performed(struct devlink *devlink, + enum devlink_reload_limit limit, + u32 actions_performed); void devlink_flash_update_begin_notify(struct devlink *devlink); void devlink_flash_update_end_notify(struct devlink *devlink); diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index ab15fc597b74..0113bc4db9f5 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -525,6 +525,7 @@ enum devlink_attr { DEVLINK_ATTR_RELOAD_STATS_ENTRY, /* nested */ DEVLINK_ATTR_RELOAD_STATS_LIMIT, /* u8 */ DEVLINK_ATTR_RELOAD_STATS_VALUE, /* u32 */ + DEVLINK_ATTR_REMOTE_RELOAD_STATS, /* nested */ /* add new attributes above here, update the policy in devlink.c */ -- cgit v1.2.3 From 38b9f903f22b9baa5c4b9bfb07c8bbc49f5efbba Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Wed, 7 Oct 2020 09:00:49 +0300 Subject: net/mlx5: Handle sync reset request event Once the driver gets sync_reset_request from firmware it prepares for the coming reset and sends acknowledge. After getting this event the driver expects device reset, either it will trigger PCI reset on sync_reset_now event or such PCI reset will be triggered by another PF of the same device. So it moves to reset requested mode and if it gets PCI reset triggered by the other PF it detect the reset and reloads. Signed-off-by: Moshe Shemesh Reviewed-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- include/linux/mlx5/driver.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 239e5348ee09..add85094f9a5 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -501,6 +501,7 @@ struct mlx5_mpfs; struct mlx5_eswitch; struct mlx5_lag; struct mlx5_devcom; +struct mlx5_fw_reset; struct mlx5_eq_table; struct mlx5_irq_table; @@ -578,6 +579,7 @@ struct mlx5_priv { struct mlx5_core_sriov sriov; struct mlx5_lag *lag; struct mlx5_devcom *devcom; + struct mlx5_fw_reset *fw_reset; struct mlx5_core_roce roce; struct mlx5_fc_stats fc_stats; struct mlx5_rl_table rl_table; -- cgit v1.2.3 From 195d9dece1686576ad1c7b45942b5cf9eacb3fbf Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Wed, 7 Oct 2020 09:00:53 +0300 Subject: devlink: Add enable_remote_dev_reset generic parameter The enable_remote_dev_reset devlink param flags that the host admin allows device resets that can be initiated by other hosts. This parameter is useful for setups where a device is shared by different hosts, such as multi-host setup. Once the user set this parameter to false, the driver should NACK any attempt to reset the device while the driver is loaded. Signed-off-by: Moshe Shemesh Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index d2771e57a278..b01bb9bca5a2 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -469,6 +469,7 @@ enum devlink_param_generic_id { DEVLINK_PARAM_GENERIC_ID_FW_LOAD_POLICY, DEVLINK_PARAM_GENERIC_ID_RESET_DEV_ON_DRV_PROBE, DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE, + DEVLINK_PARAM_GENERIC_ID_ENABLE_REMOTE_DEV_RESET, /* add new param generic ids above here*/ __DEVLINK_PARAM_GENERIC_ID_MAX, @@ -506,6 +507,9 @@ enum devlink_param_generic_id { #define DEVLINK_PARAM_GENERIC_ENABLE_ROCE_NAME "enable_roce" #define DEVLINK_PARAM_GENERIC_ENABLE_ROCE_TYPE DEVLINK_PARAM_TYPE_BOOL +#define DEVLINK_PARAM_GENERIC_ENABLE_REMOTE_DEV_RESET_NAME "enable_remote_dev_reset" +#define DEVLINK_PARAM_GENERIC_ENABLE_REMOTE_DEV_RESET_TYPE DEVLINK_PARAM_TYPE_BOOL + #define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate) \ { \ .id = DEVLINK_PARAM_GENERIC_ID_##_id, \ -- cgit v1.2.3 From 2d69356752ff862dbb0c7e6725874740799d7708 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Wed, 7 Oct 2020 09:00:55 +0300 Subject: net/mlx5: Add support for fw live patch event Firmware live patch event notifies the driver that the firmware was just updated using live patch. In such case the driver should not reload or re-initiate entities, part to updating the firmware version and re-initiate the firmware tracer which can be updated by live patch with new strings database to help debugging an issue. Signed-off-by: Moshe Shemesh Reviewed-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- include/linux/mlx5/device.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 81ca5989009b..cf824366a7d1 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -366,6 +366,7 @@ enum { enum { MLX5_GENERAL_SUBTYPE_DELAY_DROP_TIMEOUT = 0x1, MLX5_GENERAL_SUBTYPE_PCI_POWER_CHANGE_EVENT = 0x5, + MLX5_GENERAL_SUBTYPE_FW_LIVE_PATCH_EVENT = 0x7, MLX5_GENERAL_SUBTYPE_PCI_SYNC_FOR_FW_UPDATE_EVENT = 0x8, }; -- cgit v1.2.3 From 923527dcb4d164925a2fed0b53c6a1625a60a472 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 8 Oct 2020 22:49:00 -0700 Subject: net/tls: remove a duplicate function prototype Remove one of the two instances of the function prototype for tls_validate_xmit_skb(). Signed-off-by: Randy Dunlap Cc: Boris Pismenny Cc: Aviad Yehezkel Cc: John Fastabend Cc: Daniel Borkmann Signed-off-by: Jakub Kicinski --- include/net/tls.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/net/tls.h b/include/net/tls.h index e5dac7e74e79..baf1e99d8193 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -679,10 +679,6 @@ int decrypt_skb(struct sock *sk, struct sk_buff *skb, struct scatterlist *sgout); struct sk_buff *tls_encrypt_skb(struct sk_buff *skb); -struct sk_buff *tls_validate_xmit_skb(struct sock *sk, - struct net_device *dev, - struct sk_buff *skb); - int tls_sw_fallback_init(struct sock *sk, struct tls_offload_context_tx *offload_ctx, struct tls_crypto_info *crypto_info); -- cgit v1.2.3 From 44f3625bc61653ea3bde9960298faf2f5518fda5 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 8 Oct 2020 12:45:17 +0200 Subject: netlink: export policy in extended ACK Add a new attribute NLMSGERR_ATTR_POLICY to the extended ACK to advertise the policy, e.g. if an attribute was out of range, you'll know the range that's permissible. Add new NL_SET_ERR_MSG_ATTR_POL() and NL_SET_ERR_MSG_ATTR_POL() macros to set this, since realistically it's only useful to do this when the bad attribute (offset) is also returned. Use it in lib/nlattr.c which practically does all the policy validation. v2: - add and use netlink_policy_dump_attr_size_estimate() v3: - remove redundant break v4: - really remove redundant break ... sorry Reviewed-by: Jakub Kicinski Signed-off-by: Johannes Berg Signed-off-by: Jakub Kicinski --- include/linux/netlink.h | 30 ++++++++++++++++++++---------- include/net/netlink.h | 4 ++++ include/uapi/linux/netlink.h | 2 ++ 3 files changed, 26 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index e3e49f0e5c13..666cd0390699 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -68,12 +68,14 @@ netlink_kernel_create(struct net *net, int unit, struct netlink_kernel_cfg *cfg) * @_msg: message string to report - don't access directly, use * %NL_SET_ERR_MSG * @bad_attr: attribute with error + * @policy: policy for a bad attribute * @cookie: cookie data to return to userspace (for success) * @cookie_len: actual cookie data length */ struct netlink_ext_ack { const char *_msg; const struct nlattr *bad_attr; + const struct nla_policy *policy; u8 cookie[NETLINK_MAX_COOKIE_LEN]; u8 cookie_len; }; @@ -95,21 +97,29 @@ struct netlink_ext_ack { #define NL_SET_ERR_MSG_MOD(extack, msg) \ NL_SET_ERR_MSG((extack), KBUILD_MODNAME ": " msg) -#define NL_SET_BAD_ATTR(extack, attr) do { \ - if ((extack)) \ +#define NL_SET_BAD_ATTR_POLICY(extack, attr, pol) do { \ + if ((extack)) { \ (extack)->bad_attr = (attr); \ + (extack)->policy = (pol); \ + } \ } while (0) -#define NL_SET_ERR_MSG_ATTR(extack, attr, msg) do { \ - static const char __msg[] = msg; \ - struct netlink_ext_ack *__extack = (extack); \ - \ - if (__extack) { \ - __extack->_msg = __msg; \ - __extack->bad_attr = (attr); \ - } \ +#define NL_SET_BAD_ATTR(extack, attr) NL_SET_BAD_ATTR_POLICY(extack, attr, NULL) + +#define NL_SET_ERR_MSG_ATTR_POL(extack, attr, pol, msg) do { \ + static const char __msg[] = msg; \ + struct netlink_ext_ack *__extack = (extack); \ + \ + if (__extack) { \ + __extack->_msg = __msg; \ + __extack->bad_attr = (attr); \ + __extack->policy = (pol); \ + } \ } while (0) +#define NL_SET_ERR_MSG_ATTR(extack, attr, msg) \ + NL_SET_ERR_MSG_ATTR_POL(extack, attr, NULL, msg) + static inline void nl_set_extack_cookie_u64(struct netlink_ext_ack *extack, u64 cookie) { diff --git a/include/net/netlink.h b/include/net/netlink.h index 2b9e41075f19..7356f41d23ba 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -1957,6 +1957,10 @@ int netlink_policy_dump_get_policy_idx(struct netlink_policy_dump_state *state, bool netlink_policy_dump_loop(struct netlink_policy_dump_state *state); int netlink_policy_dump_write(struct sk_buff *skb, struct netlink_policy_dump_state *state); +int netlink_policy_dump_attr_size_estimate(const struct nla_policy *pt); +int netlink_policy_dump_write_attr(struct sk_buff *skb, + const struct nla_policy *pt, + int nestattr); void netlink_policy_dump_free(struct netlink_policy_dump_state *state); #endif diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index d02e472ba54c..c3816ff7bfc3 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -129,6 +129,7 @@ struct nlmsgerr { * @NLMSGERR_ATTR_COOKIE: arbitrary subsystem specific cookie to * be used - in the success case - to identify a created * object or operation or similar (binary) + * @NLMSGERR_ATTR_POLICY: policy for a rejected attribute * @__NLMSGERR_ATTR_MAX: number of attributes * @NLMSGERR_ATTR_MAX: highest attribute number */ @@ -137,6 +138,7 @@ enum nlmsgerr_attrs { NLMSGERR_ATTR_MSG, NLMSGERR_ATTR_OFFS, NLMSGERR_ATTR_COOKIE, + NLMSGERR_ATTR_POLICY, __NLMSGERR_ATTR_MAX, NLMSGERR_ATTR_MAX = __NLMSGERR_ATTR_MAX - 1 -- cgit v1.2.3 From dd2ce6a5373c6f5c830be54be10775458a8bd312 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 11 Oct 2020 01:40:01 +0200 Subject: bpf: Improve bpf_redirect_neigh helper description Follow-up to address David's feedback that we should better describe internals of the bpf_redirect_neigh() helper. Suggested-by: David Ahern Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Reviewed-by: David Ahern Link: https://lore.kernel.org/bpf/20201010234006.7075-2-daniel@iogearbox.net --- include/uapi/linux/bpf.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 42d2df799397..4272cc53d478 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3679,10 +3679,14 @@ union bpf_attr { * Redirect the packet to another net device of index *ifindex* * and fill in L2 addresses from neighboring subsystem. This helper * is somewhat similar to **bpf_redirect**\ (), except that it - * fills in e.g. MAC addresses based on the L3 information from - * the packet. This helper is supported for IPv4 and IPv6 protocols. + * populates L2 addresses as well, meaning, internally, the helper + * performs a FIB lookup based on the skb's networking header to + * get the address of the next hop and then relies on the neighbor + * lookup for the L2 address of the nexthop. + * * The *flags* argument is reserved and must be 0. The helper is - * currently only supported for tc BPF program types. + * currently only supported for tc BPF program types, and enabled + * for IPv4 and IPv6 protocols. * Return * The helper returns **TC_ACT_REDIRECT** on success or * **TC_ACT_SHOT** on error. -- cgit v1.2.3 From 9aa1206e8f48222f35a0c809f33b2f4aaa1e2661 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 11 Oct 2020 01:40:02 +0200 Subject: bpf: Add redirect_peer helper Add an efficient ingress to ingress netns switch that can be used out of tc BPF programs in order to redirect traffic from host ns ingress into a container veth device ingress without having to go via CPU backlog queue [0]. For local containers this can also be utilized and path via CPU backlog queue only needs to be taken once, not twice. On a high level this borrows from ipvlan which does similar switch in __netif_receive_skb_core() and then iterates via another_round. This helps to reduce latency for mentioned use cases. Pod to remote pod with redirect(), TCP_RR [1]: # percpu_netperf 10.217.1.33 RT_LATENCY: 122.450 (per CPU: 122.666 122.401 122.333 122.401 ) MEAN_LATENCY: 121.210 (per CPU: 121.100 121.260 121.320 121.160 ) STDDEV_LATENCY: 120.040 (per CPU: 119.420 119.910 125.460 115.370 ) MIN_LATENCY: 46.500 (per CPU: 47.000 47.000 47.000 45.000 ) P50_LATENCY: 118.500 (per CPU: 118.000 119.000 118.000 119.000 ) P90_LATENCY: 127.500 (per CPU: 127.000 128.000 127.000 128.000 ) P99_LATENCY: 130.750 (per CPU: 131.000 131.000 129.000 132.000 ) TRANSACTION_RATE: 32666.400 (per CPU: 8152.200 8169.842 8174.439 8169.897 ) Pod to remote pod with redirect_peer(), TCP_RR: # percpu_netperf 10.217.1.33 RT_LATENCY: 44.449 (per CPU: 43.767 43.127 45.279 45.622 ) MEAN_LATENCY: 45.065 (per CPU: 44.030 45.530 45.190 45.510 ) STDDEV_LATENCY: 84.823 (per CPU: 66.770 97.290 84.380 90.850 ) MIN_LATENCY: 33.500 (per CPU: 33.000 33.000 34.000 34.000 ) P50_LATENCY: 43.250 (per CPU: 43.000 43.000 43.000 44.000 ) P90_LATENCY: 46.750 (per CPU: 46.000 47.000 47.000 47.000 ) P99_LATENCY: 52.750 (per CPU: 51.000 54.000 53.000 53.000 ) TRANSACTION_RATE: 90039.500 (per CPU: 22848.186 23187.089 22085.077 21919.130 ) [0] https://linuxplumbersconf.org/event/7/contributions/674/attachments/568/1002/plumbers_2020_cilium_load_balancer.pdf [1] https://github.com/borkmann/netperf_scripts/blob/master/percpu_netperf Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201010234006.7075-3-daniel@iogearbox.net --- include/linux/netdevice.h | 4 ++++ include/uapi/linux/bpf.h | 17 +++++++++++++++++ 2 files changed, 21 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 28cfa53daf72..0533f86018dd 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1277,6 +1277,9 @@ struct netdev_net_notifier { * int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm *p, * int cmd); * Add, change, delete or get information on an IPv4 tunnel. + * struct net_device *(*ndo_get_peer_dev)(struct net_device *dev); + * If a device is paired with a peer device, return the peer instance. + * The caller must be under RCU read context. */ struct net_device_ops { int (*ndo_init)(struct net_device *dev); @@ -1484,6 +1487,7 @@ struct net_device_ops { struct devlink_port * (*ndo_get_devlink_port)(struct net_device *dev); int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm *p, int cmd); + struct net_device * (*ndo_get_peer_dev)(struct net_device *dev); }; /** diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4272cc53d478..b97bc5abb3b8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3719,6 +3719,22 @@ union bpf_attr { * never return NULL. * Return * A pointer pointing to the kernel percpu variable on this cpu. + * + * long bpf_redirect_peer(u32 ifindex, u64 flags) + * Description + * Redirect the packet to another net device of index *ifindex*. + * This helper is somewhat similar to **bpf_redirect**\ (), except + * that the redirection happens to the *ifindex*' peer device and + * the netns switch takes place from ingress to ingress without + * going through the CPU's backlog queue. + * + * The *flags* argument is reserved and must be 0. The helper is + * currently only supported for tc BPF program types at the ingress + * hook and for veth device types. The peer device must reside in a + * different network namespace. + * Return + * The helper returns **TC_ACT_REDIRECT** on success or + * **TC_ACT_SHOT** on error. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3876,6 +3892,7 @@ union bpf_attr { FN(redirect_neigh), \ FN(bpf_per_cpu_ptr), \ FN(bpf_this_cpu_ptr), \ + FN(redirect_peer), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 4a8f87e60f6db40e640f1db555d063b2c4dea5f1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 11 Oct 2020 01:40:03 +0200 Subject: bpf: Allow for map-in-map with dynamic inner array map entries Recent work in f4d05259213f ("bpf: Add map_meta_equal map ops") and 134fede4eecf ("bpf: Relax max_entries check for most of the inner map types") added support for dynamic inner max elements for most map-in-map types. Exceptions were maps like array or prog array where the map_gen_lookup() callback uses the maps' max_entries field as a constant when emitting instructions. We recently implemented Maglev consistent hashing into Cilium's load balancer which uses map-in-map with an outer map being hash and inner being array holding the Maglev backend table for each service. This has been designed this way in order to reduce overall memory consumption given the outer hash map allows to avoid preallocating a large, flat memory area for all services. Also, the number of service mappings is not always known a-priori. The use case for dynamic inner array map entries is to further reduce memory overhead, for example, some services might just have a small number of back ends while others could have a large number. Right now the Maglev backend table for small and large number of backends would need to have the same inner array map entries which adds a lot of unneeded overhead. Dynamic inner array map entries can be realized by avoiding the inlined code generation for their lookup. The lookup will still be efficient since it will be calling into array_map_lookup_elem() directly and thus avoiding retpoline. The patch adds a BPF_F_INNER_MAP flag to map creation which therefore skips inline code generation and relaxes array_map_meta_equal() check to ignore both maps' max_entries. This also still allows to have faster lookups for map-in-map when BPF_F_INNER_MAP is not specified and hence dynamic max_entries not needed. Example code generation where inner map is dynamic sized array: # bpftool p d x i 125 int handle__sys_enter(void * ctx): ; int handle__sys_enter(void *ctx) 0: (b4) w1 = 0 ; int key = 0; 1: (63) *(u32 *)(r10 -4) = r1 2: (bf) r2 = r10 ; 3: (07) r2 += -4 ; inner_map = bpf_map_lookup_elem(&outer_arr_dyn, &key); 4: (18) r1 = map[id:468] 6: (07) r1 += 272 7: (61) r0 = *(u32 *)(r2 +0) 8: (35) if r0 >= 0x3 goto pc+5 9: (67) r0 <<= 3 10: (0f) r0 += r1 11: (79) r0 = *(u64 *)(r0 +0) 12: (15) if r0 == 0x0 goto pc+1 13: (05) goto pc+1 14: (b7) r0 = 0 15: (b4) w6 = -1 ; if (!inner_map) 16: (15) if r0 == 0x0 goto pc+6 17: (bf) r2 = r10 ; 18: (07) r2 += -4 ; val = bpf_map_lookup_elem(inner_map, &key); 19: (bf) r1 = r0 | No inlining but instead 20: (85) call array_map_lookup_elem#149280 | call to array_map_lookup_elem() ; return val ? *val : -1; | for inner array lookup. 21: (15) if r0 == 0x0 goto pc+1 ; return val ? *val : -1; 22: (61) r6 = *(u32 *)(r0 +0) ; } 23: (bc) w0 = w6 24: (95) exit Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201010234006.7075-4-daniel@iogearbox.net --- include/linux/bpf.h | 2 +- include/uapi/linux/bpf.h | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index dc63eeed4fd9..2b16bf48aab6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -82,7 +82,7 @@ struct bpf_map_ops { void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file, int fd); void (*map_fd_put_ptr)(void *ptr); - u32 (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf); + int (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf); u32 (*map_fd_sys_lookup_elem)(void *ptr); void (*map_seq_show_elem)(struct bpf_map *map, void *key, struct seq_file *m); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b97bc5abb3b8..bf5a99d803e4 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -435,6 +435,9 @@ enum { /* Share perf_event among processes */ BPF_F_PRESERVE_ELEMS = (1U << 11), + +/* Create a map that is suitable to be an inner map with dynamic max entries */ + BPF_F_INNER_MAP = (1U << 12), }; /* Flags for BPF_PROG_QUERY. */ -- cgit v1.2.3 From 60a3815da702fd9e4759945f26cce5c47d3967ad Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 8 Oct 2020 01:14:47 +0200 Subject: netfilter: add inet ingress support This patch adds the NF_INET_INGRESS pseudohook for the NFPROTO_INET family. This is a mapping this new hook to the existing NFPROTO_NETDEV and NF_NETDEV_INGRESS hook. The hook does not guarantee that packets are inet only, users must filter out non-ip traffic explicitly. This infrastructure makes it easier to support this new hook in nf_tables. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/netfilter.h b/include/uapi/linux/netfilter.h index ca9e63d6e0e4..6a6179af0d7c 100644 --- a/include/uapi/linux/netfilter.h +++ b/include/uapi/linux/netfilter.h @@ -45,6 +45,7 @@ enum nf_inet_hooks { NF_INET_FORWARD, NF_INET_LOCAL_OUT, NF_INET_POST_ROUTING, + NF_INET_INGRESS, NF_INET_NUMHOOKS }; -- cgit v1.2.3 From d3519cb89f6d5949481afa5de3ee0fc6a051e231 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 8 Oct 2020 01:14:48 +0200 Subject: netfilter: nf_tables: add inet ingress support This patch adds a new ingress hook for the inet family. The inet ingress hook emulates the IP receive path code, therefore, unclean packets are drop before walking over the ruleset in this basechain. This patch also introduces the nft_base_chain_netdev() helper function to check if this hook is bound to one or more devices (through the hook list infrastructure). This check allows to perform the same handling for the inet ingress as it would be a netdev ingress chain from the control plane. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 6 +++++ include/net/netfilter/nf_tables_ipv4.h | 33 ++++++++++++++++++++++++ include/net/netfilter/nf_tables_ipv6.h | 46 ++++++++++++++++++++++++++++++++++ 3 files changed, 85 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 0bd2a081ae39..3965ce18226f 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1081,6 +1081,12 @@ struct nft_table { u8 *udata; }; +static inline bool nft_base_chain_netdev(int family, u32 hooknum) +{ + return family == NFPROTO_NETDEV || + (family == NFPROTO_INET && hooknum == NF_INET_INGRESS); +} + void nft_register_chain_type(const struct nft_chain_type *); void nft_unregister_chain_type(const struct nft_chain_type *); diff --git a/include/net/netfilter/nf_tables_ipv4.h b/include/net/netfilter/nf_tables_ipv4.h index ed7b511f0a59..1f7bea39ad1b 100644 --- a/include/net/netfilter/nf_tables_ipv4.h +++ b/include/net/netfilter/nf_tables_ipv4.h @@ -53,4 +53,37 @@ static inline void nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt, nft_set_pktinfo_unspec(pkt, skb); } +static inline int nft_set_pktinfo_ipv4_ingress(struct nft_pktinfo *pkt, + struct sk_buff *skb) +{ + struct iphdr *iph; + u32 len, thoff; + + if (!pskb_may_pull(skb, sizeof(*iph))) + return -1; + + iph = ip_hdr(skb); + if (iph->ihl < 5 || iph->version != 4) + goto inhdr_error; + + len = ntohs(iph->tot_len); + thoff = iph->ihl * 4; + if (skb->len < len) { + __IP_INC_STATS(nft_net(pkt), IPSTATS_MIB_INTRUNCATEDPKTS); + return -1; + } else if (len < thoff) { + goto inhdr_error; + } + + pkt->tprot_set = true; + pkt->tprot = iph->protocol; + pkt->xt.thoff = thoff; + pkt->xt.fragoff = ntohs(iph->frag_off) & IP_OFFSET; + + return 0; + +inhdr_error: + __IP_INC_STATS(nft_net(pkt), IPSTATS_MIB_INHDRERRORS); + return -1; +} #endif diff --git a/include/net/netfilter/nf_tables_ipv6.h b/include/net/netfilter/nf_tables_ipv6.h index d0f1c537b017..867de29f3f7a 100644 --- a/include/net/netfilter/nf_tables_ipv6.h +++ b/include/net/netfilter/nf_tables_ipv6.h @@ -70,4 +70,50 @@ static inline void nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt, nft_set_pktinfo_unspec(pkt, skb); } +static inline int nft_set_pktinfo_ipv6_ingress(struct nft_pktinfo *pkt, + struct sk_buff *skb) +{ +#if IS_ENABLED(CONFIG_IPV6) + unsigned int flags = IP6_FH_F_AUTH; + unsigned short frag_off; + unsigned int thoff = 0; + struct inet6_dev *idev; + struct ipv6hdr *ip6h; + int protohdr; + u32 pkt_len; + + if (!pskb_may_pull(skb, sizeof(*ip6h))) + return -1; + + ip6h = ipv6_hdr(skb); + if (ip6h->version != 6) + goto inhdr_error; + + pkt_len = ntohs(ip6h->payload_len); + if (pkt_len + sizeof(*ip6h) > skb->len) { + idev = __in6_dev_get(nft_in(pkt)); + __IP6_INC_STATS(nft_net(pkt), idev, IPSTATS_MIB_INTRUNCATEDPKTS); + return -1; + } + + protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, &flags); + if (protohdr < 0) + goto inhdr_error; + + pkt->tprot_set = true; + pkt->tprot = protohdr; + pkt->xt.thoff = thoff; + pkt->xt.fragoff = frag_off; + + return 0; + +inhdr_error: + idev = __in6_dev_get(nft_in(pkt)); + __IP6_INC_STATS(nft_net(pkt), idev, IPSTATS_MIB_INHDRERRORS); + return -1; +#else + return -1; +#endif +} + #endif -- cgit v1.2.3 From ef5659280eb13e8ac31c296f58cfdfa1684ac06b Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sat, 10 Oct 2020 22:09:38 -0700 Subject: bpf, sockmap: Allow skipping sk_skb parser program Currently, we often run with a nop parser namely one that just does this, 'return skb->len'. This happens when either our verdict program can handle streaming data or it is only looking at socket data such as IP addresses and other metadata associated with the flow. The second case is common for a L3/L4 proxy for instance. So lets allow loading programs without the parser then we can skip the stream parser logic and avoid having to add a BPF program that is effectively a nop. Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/160239297866.8495.13345662302749219672.stgit@john-Precision-5820-Tower --- include/linux/skmsg.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 3119928fc103..fec0c5ac1c4f 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -308,6 +308,8 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node); int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock); void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock); void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock); +void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock); +void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock); int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg); -- cgit v1.2.3 From ac911bfeb34b5d79fb4e23a08b8db0b89c529b53 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Mon, 12 Oct 2020 09:43:53 +0200 Subject: can: isotp: implement cleanups / improvements from review As pointed out by Jakub Kicinski here: http://lore.kernel.org/r/20201009175751.5c54097f@kicinski-fedora-pc1c0hjn.dhcp.thefacebook.com this patch addresses the remarked issues: - remove empty line in comment - remove default=y for CAN_ISOTP in Kconfig - make use of pr_notice_once() - use GFP_ATOMIC instead of gfp_any() in soft hrtimer context The version strings in the CAN subsystem are removed by a separate patch. Signed-off-by: Oliver Hartkopp Link: https://lore.kernel.org/r/20201012074354.25839-1-socketcan@hartkopp.net Signed-off-by: Marc Kleine-Budde --- include/uapi/linux/can/isotp.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/can/isotp.h b/include/uapi/linux/can/isotp.h index 553006509f4e..7793b26aa154 100644 --- a/include/uapi/linux/can/isotp.h +++ b/include/uapi/linux/can/isotp.h @@ -160,7 +160,6 @@ struct can_isotp_ll_options { * these default settings can be changed via sockopts. * For that reason the STmin value is intentionally _not_ checked for * consistency and copied directly into the flow control (FC) frame. - * */ #endif /* !_UAPI_CAN_ISOTP_H */ -- cgit v1.2.3 From f726f3d37163f714034aa5fd1f92a1a73df4297f Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Mon, 12 Oct 2020 09:43:54 +0200 Subject: can: remove obsolete version strings As pointed out by Jakub Kicinski here: http://lore.kernel.org/r/20201009175751.5c54097f@kicinski-fedora-pc1c0hjn.dhcp.thefacebook.com this patch removes the obsolete version information of the different CAN protocols and the AF_CAN core module. Signed-off-by: Oliver Hartkopp Link: https://lore.kernel.org/r/20201012074354.25839-2-socketcan@hartkopp.net Signed-off-by: Marc Kleine-Budde --- include/linux/can/core.h | 7 ------- include/net/netns/can.h | 1 - 2 files changed, 8 deletions(-) (limited to 'include') diff --git a/include/linux/can/core.h b/include/linux/can/core.h index 7da9f1f82e8e..5fb8d0e3f9c1 100644 --- a/include/linux/can/core.h +++ b/include/linux/can/core.h @@ -18,13 +18,6 @@ #include #include -#define CAN_VERSION "20170425" - -/* increment this number each time you change some user-space interface */ -#define CAN_ABI_VERSION "9" - -#define CAN_VERSION_STRING "rev " CAN_VERSION " abi " CAN_ABI_VERSION - #define DNAME(dev) ((dev) ? (dev)->name : "any") /** diff --git a/include/net/netns/can.h b/include/net/netns/can.h index b6ab7d1530d7..52fbd8291a96 100644 --- a/include/net/netns/can.h +++ b/include/net/netns/can.h @@ -15,7 +15,6 @@ struct can_rcv_lists_stats; struct netns_can { #if IS_ENABLED(CONFIG_PROC_FS) struct proc_dir_entry *proc_dir; - struct proc_dir_entry *pde_version; struct proc_dir_entry *pde_stats; struct proc_dir_entry *pde_reset_stats; struct proc_dir_entry *pde_rcvlist_all; -- cgit v1.2.3 From ee92e4f1f95eb7b8820299f10fc5fba16d85cece Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Wed, 8 Apr 2020 14:47:39 -0500 Subject: net/mlx5: Add NIC TX domain namespace Add new namespace that represents the NIC TX domain. Signed-off-by: Huy Nguyen Signed-off-by: Raed Salem Signed-off-by: Saeed Mahameed --- include/linux/mlx5/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 92d991d93757..846d94ad04bc 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -76,6 +76,7 @@ enum mlx5_flow_namespace_type { MLX5_FLOW_NAMESPACE_SNIFFER_RX, MLX5_FLOW_NAMESPACE_SNIFFER_TX, MLX5_FLOW_NAMESPACE_EGRESS, + MLX5_FLOW_NAMESPACE_EGRESS_KERNEL, MLX5_FLOW_NAMESPACE_RDMA_RX, MLX5_FLOW_NAMESPACE_RDMA_RX_KERNEL, MLX5_FLOW_NAMESPACE_RDMA_TX, -- cgit v1.2.3 From 9b9d454ddbf0c41391ed68ea82bc3d8ff6a65074 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Fri, 5 Jun 2020 20:17:51 -0500 Subject: net/mlx5e: IPsec: Add TX steering rule per IPsec state Add new FTE in TX IPsec FT per IPsec state. It has the same matching criteria as the RX steering rule. The IPsec FT is created/destroyed when the first/last rule is added/deleted respectively. Signed-off-by: Huy Nguyen Reviewed-by: Boris Pismenny Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- include/linux/mlx5/qp.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 36492a1342cf..d75ef8aa8fac 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -245,6 +245,10 @@ enum { MLX5_ETH_WQE_SWP_OUTER_L4_UDP = 1 << 5, }; +enum { + MLX5_ETH_WQE_FT_META_IPSEC = BIT(0), +}; + struct mlx5_wqe_eth_seg { u8 swp_outer_l4_offset; u8 swp_outer_l3_offset; @@ -253,7 +257,7 @@ struct mlx5_wqe_eth_seg { u8 cs_flags; u8 swp_flags; __be16 mss; - __be32 rsvd2; + __be32 flow_table_metadata; union { struct { __be16 sz; -- cgit v1.2.3 From 0d9826bc18ce356e8909919ad681ad65d0a6061e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 12 Oct 2020 17:06:06 +0200 Subject: netfilter: nf_log: missing vlan offload tag and proto Dump vlan tag and proto for the usual vlan offload case if the NF_LOG_MACDECODE flag is set on. Without this information the logging is misleading as there is no reference to the VLAN header. [12716.993704] test: IN=veth0 OUT= MACSRC=86:6c:92:ea:d6:73 MACDST=0e:3b:eb:86:73:76 VPROTO=8100 VID=10 MACPROTO=0800 SRC=192.168.10.2 DST=172.217.168.163 LEN=52 TOS=0x00 PREC=0x00 TTL=64 ID=2548 DF PROTO=TCP SPT=55848 DPT=80 WINDOW=501 RES=0x00 ACK FIN URGP=0 [12721.157643] test: IN=veth0 OUT= MACSRC=86:6c:92:ea:d6:73 MACDST=0e:3b:eb:86:73:76 VPROTO=8100 VID=10 MACPROTO=0806 ARP HTYPE=1 PTYPE=0x0800 OPCODE=2 MACSRC=86:6c:92:ea:d6:73 IPSRC=192.168.10.2 MACDST=0e:3b:eb:86:73:76 IPDST=192.168.10.1 Fixes: 83e96d443b37 ("netfilter: log: split family specific code to nf_log_{ip,ip6,common}.c files") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_log.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/netfilter/nf_log.h b/include/net/netfilter/nf_log.h index 0d3920896d50..716db4a0fed8 100644 --- a/include/net/netfilter/nf_log.h +++ b/include/net/netfilter/nf_log.h @@ -108,6 +108,7 @@ int nf_log_dump_tcp_header(struct nf_log_buf *m, const struct sk_buff *skb, unsigned int logflags); void nf_log_dump_sk_uid_gid(struct net *net, struct nf_log_buf *m, struct sock *sk); +void nf_log_dump_vlan(struct nf_log_buf *m, const struct sk_buff *skb); void nf_log_dump_packet_common(struct nf_log_buf *m, u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, -- cgit v1.2.3 From 44fa32f008ab7092842095a7a474c48303a2f186 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 12 Oct 2020 10:01:27 +0200 Subject: net: add function dev_fetch_sw_netstats for fetching pcpu_sw_netstats In several places the same code is used to populate rtnl_link_stats64 fields with data from pcpu_sw_netstats. Therefore factor out this code to a new function dev_fetch_sw_netstats(). v2: - constify argument netstats - don't ignore netstats being NULL or an ERRPTR - switch to EXPORT_SYMBOL_GPL Signed-off-by: Heiner Kallweit Link: https://lore.kernel.org/r/6d16a338-52f5-df69-0020-6bc771a7d498@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 948d7105e91a..964b494b0e8d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4499,6 +4499,8 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, struct rtnl_link_stats64 *storage); void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, const struct net_device_stats *netdev_stats); +void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, + const struct pcpu_sw_netstats __percpu *netstats); extern int netdev_max_backlog; extern int netdev_tstamp_prequeue; -- cgit v1.2.3 From d25e2e9388eda61b6e298585024ee3355f50c493 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 14 Oct 2020 21:34:32 +0200 Subject: netfilter: restore NF_INET_NUMHOOKS This definition is used by the iptables legacy UAPI, restore it. Fixes: d3519cb89f6d ("netfilter: nf_tables: add inet ingress support") Reported-by: Jason A. Donenfeld Tested-by: Jason A. Donenfeld Signed-off-by: Pablo Neira Ayuso Signed-off-by: Jakub Kicinski --- include/net/netfilter/nf_tables.h | 4 +++- include/uapi/linux/netfilter.h | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 3965ce18226f..3f7e56b1171e 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -14,6 +14,8 @@ #include #include +#define NFT_MAX_HOOKS (NF_INET_INGRESS + 1) + struct module; #define NFT_JUMP_STACK_SIZE 16 @@ -979,7 +981,7 @@ struct nft_chain_type { int family; struct module *owner; unsigned int hook_mask; - nf_hookfn *hooks[NF_MAX_HOOKS]; + nf_hookfn *hooks[NFT_MAX_HOOKS]; int (*ops_register)(struct net *net, const struct nf_hook_ops *ops); void (*ops_unregister)(struct net *net, const struct nf_hook_ops *ops); }; diff --git a/include/uapi/linux/netfilter.h b/include/uapi/linux/netfilter.h index 6a6179af0d7c..ef9a44286e23 100644 --- a/include/uapi/linux/netfilter.h +++ b/include/uapi/linux/netfilter.h @@ -45,8 +45,8 @@ enum nf_inet_hooks { NF_INET_FORWARD, NF_INET_LOCAL_OUT, NF_INET_POST_ROUTING, - NF_INET_INGRESS, - NF_INET_NUMHOOKS + NF_INET_NUMHOOKS, + NF_INET_INGRESS = NF_INET_NUMHOOKS, }; enum nf_dev_hooks { -- cgit v1.2.3 From d086a1c65aabb5a4e1edc580ca583e2964c62b44 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 14 Oct 2020 11:56:42 +0300 Subject: net: sched: Fix suspicious RCU usage while accessing tcf_tunnel_info The access of tcf_tunnel_info() produces the following splat, so fix it by dereferencing the tcf_tunnel_key_params pointer with marker that internal tcfa_liock is held. ============================= WARNING: suspicious RCU usage 5.9.0+ #1 Not tainted ----------------------------- include/net/tc_act/tc_tunnel_key.h:59 suspicious rcu_dereference_protected() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 1 lock held by tc/34839: #0: ffff88828572c2a0 (&p->tcfa_lock){+...}-{2:2}, at: tc_setup_flow_action+0xb3/0x48b5 stack backtrace: CPU: 1 PID: 34839 Comm: tc Not tainted 5.9.0+ #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack+0x9a/0xd0 tc_setup_flow_action+0x14cb/0x48b5 fl_hw_replace_filter+0x347/0x690 [cls_flower] fl_change+0x2bad/0x4875 [cls_flower] tc_new_tfilter+0xf6f/0x1ba0 rtnetlink_rcv_msg+0x5f2/0x870 netlink_rcv_skb+0x124/0x350 netlink_unicast+0x433/0x700 netlink_sendmsg+0x6f1/0xbd0 sock_sendmsg+0xb0/0xe0 ____sys_sendmsg+0x4fa/0x6d0 ___sys_sendmsg+0x12e/0x1b0 __sys_sendmsg+0xa4/0x120 do_syscall_64+0x2d/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f1f8cd4fe57 Code: 0c 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 89 54 24 1c 48 89 74 24 10 RSP: 002b:00007ffdc1e193b8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f1f8cd4fe57 RDX: 0000000000000000 RSI: 00007ffdc1e19420 RDI: 0000000000000003 RBP: 000000005f85aafa R08: 0000000000000001 R09: 00007ffdc1e1936c R10: 000000000040522d R11: 0000000000000246 R12: 0000000000000001 R13: 0000000000000000 R14: 00007ffdc1e1d6f0 R15: 0000000000482420 Fixes: 3ebaf6da0716 ("net: sched: Do not assume RTNL is held in tunnel key action helpers") Fixes: 7a47281439ba ("net: sched: lock action when translating it to flow_action infra") Signed-off-by: Leon Romanovsky Acked-by: Cong Wang Signed-off-by: Jakub Kicinski --- include/net/tc_act/tc_tunnel_key.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/tc_act/tc_tunnel_key.h b/include/net/tc_act/tc_tunnel_key.h index e1057b255f69..879fe8cff581 100644 --- a/include/net/tc_act/tc_tunnel_key.h +++ b/include/net/tc_act/tc_tunnel_key.h @@ -56,7 +56,10 @@ static inline struct ip_tunnel_info *tcf_tunnel_info(const struct tc_action *a) { #ifdef CONFIG_NET_CLS_ACT struct tcf_tunnel_key *t = to_tunnel_key(a); - struct tcf_tunnel_key_params *params = rtnl_dereference(t->params); + struct tcf_tunnel_key_params *params; + + params = rcu_dereference_protected(t->params, + lockdep_is_held(&a->tcfa_lock)); return ¶ms->tcft_enc_metadata->u.tun_info; #else -- cgit v1.2.3 From 346e320cb2103edef709c4466a29140c4a8e527a Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Thu, 15 Oct 2020 18:39:27 +0200 Subject: netfilter: nftables: allow re-computing sctp CRC-32C in 'payload' statements nftables payload statements are used to mangle SCTP headers, but they can only replace the Internet Checksum. As a consequence, nftables rules that mangle sport/dport/vtag in SCTP headers potentially generate packets that are discarded by the receiver, unless the CRC-32C is "offloaded" (e.g the rule mangles a skb having 'ip_summed' equal to 'CHECKSUM_PARTIAL'. Fix this extending uAPI definitions and L4 checksum update function, in a way that userspace programs (e.g. nft) can instruct the kernel to compute CRC-32C in SCTP headers. Also ensure that LIBCRC32C is built if NF_TABLES is 'y' or 'm' in the kernel build configuration. Signed-off-by: Davide Caratti Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Signed-off-by: Jakub Kicinski --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 352ee51707a1..98272cb5f617 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -749,10 +749,12 @@ enum nft_payload_bases { * * @NFT_PAYLOAD_CSUM_NONE: no checksumming * @NFT_PAYLOAD_CSUM_INET: internet checksum (RFC 791) + * @NFT_PAYLOAD_CSUM_SCTP: CRC-32c, for use in SCTP header (RFC 3309) */ enum nft_payload_csum_types { NFT_PAYLOAD_CSUM_NONE, NFT_PAYLOAD_CSUM_INET, + NFT_PAYLOAD_CSUM_SCTP, }; enum nft_payload_csum_flags { -- cgit v1.2.3