From 5d5de3a431d87ac51d43da8d796891d014975ab7 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Thu, 16 Feb 2023 10:48:21 +0800 Subject: bpf: Only allocate one bpf_mem_cache for bpf_cpumask_ma The size of bpf_cpumask is fixed, so there is no need to allocate many bpf_mem_caches for bpf_cpumask_ma, just one bpf_mem_cache is enough. Also add comments for bpf_mem_alloc_init() in bpf_mem_alloc.h to prevent future miuse. Signed-off-by: Hou Tao Acked-by: Jiri Olsa Link: https://lore.kernel.org/r/20230216024821.2202916-1-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_mem_alloc.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h index 3e164b8efaa9..a7104af61ab4 100644 --- a/include/linux/bpf_mem_alloc.h +++ b/include/linux/bpf_mem_alloc.h @@ -14,6 +14,13 @@ struct bpf_mem_alloc { struct work_struct work; }; +/* 'size != 0' is for bpf_mem_alloc which manages fixed-size objects. + * Alloc and free are done with bpf_mem_cache_{alloc,free}(). + * + * 'size = 0' is for bpf_mem_alloc which manages many fixed-size objects. + * Alloc and free are done with bpf_mem_{alloc,free}() and the size of + * the returned object is given by the size argument of bpf_mem_alloc(). + */ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu); void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma); -- cgit v1.2.3 From 7e0dac2807e6c4ae8c56941d74971fdb0763b4f9 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Wed, 1 Mar 2023 07:49:45 -0800 Subject: bpf: Refactor process_dynptr_func This change cleans up process_dynptr_func's flow to be more intuitive and updates some comments with more context. Signed-off-by: Joanne Koong Link: https://lore.kernel.org/r/20230301154953.641654-3-joannelkoong@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index cf1bb1cf4a7b..b26ff2a8f63b 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -616,9 +616,6 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, enum bpf_arg_type arg_type); int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, u32 mem_size); -struct bpf_call_arg_meta; -int process_dynptr_func(struct bpf_verifier_env *env, int regno, - enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta); /* this lives here instead of in bpf.h because it needs to dereference tgt_prog */ static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog, -- cgit v1.2.3 From 8357b366cbb09b17c90e2cd758360a6bd2ea7507 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Wed, 1 Mar 2023 07:49:47 -0800 Subject: bpf: Define no-ops for externally called bpf dynptr functions Some bpf dynptr functions will be called from places where if CONFIG_BPF_SYSCALL is not set, then the dynptr function is undefined. For example, when skb type dynptrs are added in the next commit, dynptr functions are called from net/core/filter.c This patch defines no-op implementations of these dynptr functions so that they do not break compilation by being an undefined reference. Signed-off-by: Joanne Koong Link: https://lore.kernel.org/r/20230301154953.641654-5-joannelkoong@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 75 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 45 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 520b238abd5a..296841a31749 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1124,6 +1124,33 @@ static __always_inline __nocfi unsigned int bpf_dispatcher_nop_func( return bpf_func(ctx, insnsi); } +/* the implementation of the opaque uapi struct bpf_dynptr */ +struct bpf_dynptr_kern { + void *data; + /* Size represents the number of usable bytes of dynptr data. + * If for example the offset is at 4 for a local dynptr whose data is + * of type u64, the number of usable bytes is 4. + * + * The upper 8 bits are reserved. It is as follows: + * Bits 0 - 23 = size + * Bits 24 - 30 = dynptr type + * Bit 31 = whether dynptr is read-only + */ + u32 size; + u32 offset; +} __aligned(8); + +enum bpf_dynptr_type { + BPF_DYNPTR_TYPE_INVALID, + /* Points to memory that is local to the bpf program */ + BPF_DYNPTR_TYPE_LOCAL, + /* Underlying data is a ringbuf record */ + BPF_DYNPTR_TYPE_RINGBUF, +}; + +int bpf_dynptr_check_size(u32 size); +u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr); + #ifdef CONFIG_BPF_JIT int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr); int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr); @@ -2266,6 +2293,11 @@ static inline bool has_current_bpf_ctx(void) } void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog); + +void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, + enum bpf_dynptr_type type, u32 offset, u32 size); +void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr); +void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr); #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { @@ -2495,6 +2527,19 @@ static inline void bpf_prog_inc_misses_counter(struct bpf_prog *prog) static inline void bpf_cgrp_storage_free(struct cgroup *cgroup) { } + +static inline void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, + enum bpf_dynptr_type type, u32 offset, u32 size) +{ +} + +static inline void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr) +{ +} + +static inline void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr) +{ +} #endif /* CONFIG_BPF_SYSCALL */ void __bpf_free_used_btfs(struct bpf_prog_aux *aux, @@ -2913,36 +2958,6 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, u32 num_args, struct bpf_bprintf_data *data); void bpf_bprintf_cleanup(struct bpf_bprintf_data *data); -/* the implementation of the opaque uapi struct bpf_dynptr */ -struct bpf_dynptr_kern { - void *data; - /* Size represents the number of usable bytes of dynptr data. - * If for example the offset is at 4 for a local dynptr whose data is - * of type u64, the number of usable bytes is 4. - * - * The upper 8 bits are reserved. It is as follows: - * Bits 0 - 23 = size - * Bits 24 - 30 = dynptr type - * Bit 31 = whether dynptr is read-only - */ - u32 size; - u32 offset; -} __aligned(8); - -enum bpf_dynptr_type { - BPF_DYNPTR_TYPE_INVALID, - /* Points to memory that is local to the bpf program */ - BPF_DYNPTR_TYPE_LOCAL, - /* Underlying data is a kernel-produced ringbuf record */ - BPF_DYNPTR_TYPE_RINGBUF, -}; - -void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, - enum bpf_dynptr_type type, u32 offset, u32 size); -void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr); -int bpf_dynptr_check_size(u32 size); -u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr); - #ifdef CONFIG_BPF_LSM void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype); void bpf_cgroup_atype_put(int cgroup_atype); -- cgit v1.2.3 From b5964b968ac64c2ec2debee7518499113b27c34e Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Wed, 1 Mar 2023 07:49:50 -0800 Subject: bpf: Add skb dynptrs Add skb dynptrs, which are dynptrs whose underlying pointer points to a skb. The dynptr acts on skb data. skb dynptrs have two main benefits. One is that they allow operations on sizes that are not statically known at compile-time (eg variable-sized accesses). Another is that parsing the packet data through dynptrs (instead of through direct access of skb->data and skb->data_end) can be more ergonomic and less brittle (eg does not need manual if checking for being within bounds of data_end). For bpf prog types that don't support writes on skb data, the dynptr is read-only (bpf_dynptr_write() will return an error) For reads and writes through the bpf_dynptr_read() and bpf_dynptr_write() interfaces, reading and writing from/to data in the head as well as from/to non-linear paged buffers is supported. Data slices through the bpf_dynptr_data API are not supported; instead bpf_dynptr_slice() and bpf_dynptr_slice_rdwr() (added in subsequent commit) should be used. For examples of how skb dynptrs can be used, please see the attached selftests. Signed-off-by: Joanne Koong Link: https://lore.kernel.org/r/20230301154953.641654-8-joannelkoong@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 14 +++++++++++++- include/linux/filter.h | 18 ++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 296841a31749..e7436d7615b0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -607,11 +607,14 @@ enum bpf_type_flag { */ NON_OWN_REF = BIT(14 + BPF_BASE_TYPE_BITS), + /* DYNPTR points to sk_buff */ + DYNPTR_TYPE_SKB = BIT(15 + BPF_BASE_TYPE_BITS), + __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; -#define DYNPTR_TYPE_FLAG_MASK (DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF) +#define DYNPTR_TYPE_FLAG_MASK (DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF | DYNPTR_TYPE_SKB) /* Max number of base types. */ #define BPF_BASE_TYPE_LIMIT (1UL << BPF_BASE_TYPE_BITS) @@ -1146,6 +1149,8 @@ enum bpf_dynptr_type { BPF_DYNPTR_TYPE_LOCAL, /* Underlying data is a ringbuf record */ BPF_DYNPTR_TYPE_RINGBUF, + /* Underlying data is a sk_buff */ + BPF_DYNPTR_TYPE_SKB, }; int bpf_dynptr_check_size(u32 size); @@ -2846,6 +2851,8 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size); +int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags, + struct bpf_dynptr_kern *ptr); #else static inline bool bpf_sock_common_is_valid_access(int off, int size, enum bpf_access_type type, @@ -2867,6 +2874,11 @@ static inline u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, { return 0; } +static inline int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags, + struct bpf_dynptr_kern *ptr) +{ + return -EOPNOTSUPP; +} #endif #ifdef CONFIG_INET diff --git a/include/linux/filter.h b/include/linux/filter.h index 1727898f1641..de18e844d15e 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1542,4 +1542,22 @@ static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u64 index return XDP_REDIRECT; } +#ifdef CONFIG_NET +int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len); +int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, + u32 len, u64 flags); +#else /* CONFIG_NET */ +static inline int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, + void *to, u32 len) +{ + return -EOPNOTSUPP; +} + +static inline int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, + const void *from, u32 len, u64 flags) +{ + return -EOPNOTSUPP; +} +#endif /* CONFIG_NET */ + #endif /* __LINUX_FILTER_H__ */ -- cgit v1.2.3 From 05421aecd4ed65da0dc17b0c3c13779ef334e9e5 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Wed, 1 Mar 2023 07:49:51 -0800 Subject: bpf: Add xdp dynptrs Add xdp dynptrs, which are dynptrs whose underlying pointer points to a xdp_buff. The dynptr acts on xdp data. xdp dynptrs have two main benefits. One is that they allow operations on sizes that are not statically known at compile-time (eg variable-sized accesses). Another is that parsing the packet data through dynptrs (instead of through direct access of xdp->data and xdp->data_end) can be more ergonomic and less brittle (eg does not need manual if checking for being within bounds of data_end). For reads and writes on the dynptr, this includes reading/writing from/to and across fragments. Data slices through the bpf_dynptr_data API are not supported; instead bpf_dynptr_slice() and bpf_dynptr_slice_rdwr() should be used. For examples of how xdp dynptrs can be used, please see the attached selftests. Signed-off-by: Joanne Koong Link: https://lore.kernel.org/r/20230301154953.641654-9-joannelkoong@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 8 +++++++- include/linux/filter.h | 14 ++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e7436d7615b0..23ec684e660d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -610,11 +610,15 @@ enum bpf_type_flag { /* DYNPTR points to sk_buff */ DYNPTR_TYPE_SKB = BIT(15 + BPF_BASE_TYPE_BITS), + /* DYNPTR points to xdp_buff */ + DYNPTR_TYPE_XDP = BIT(16 + BPF_BASE_TYPE_BITS), + __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; -#define DYNPTR_TYPE_FLAG_MASK (DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF | DYNPTR_TYPE_SKB) +#define DYNPTR_TYPE_FLAG_MASK (DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF | DYNPTR_TYPE_SKB \ + | DYNPTR_TYPE_XDP) /* Max number of base types. */ #define BPF_BASE_TYPE_LIMIT (1UL << BPF_BASE_TYPE_BITS) @@ -1151,6 +1155,8 @@ enum bpf_dynptr_type { BPF_DYNPTR_TYPE_RINGBUF, /* Underlying data is a sk_buff */ BPF_DYNPTR_TYPE_SKB, + /* Underlying data is a xdp_buff */ + BPF_DYNPTR_TYPE_XDP, }; int bpf_dynptr_check_size(u32 size); diff --git a/include/linux/filter.h b/include/linux/filter.h index de18e844d15e..3f6992261ec5 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1546,6 +1546,8 @@ static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u64 index int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len); int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags); +int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len); +int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len); #else /* CONFIG_NET */ static inline int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len) @@ -1558,6 +1560,18 @@ static inline int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, { return -EOPNOTSUPP; } + +static inline int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, + void *buf, u32 len) +{ + return -EOPNOTSUPP; +} + +static inline int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, + void *buf, u32 len) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_NET */ #endif /* __LINUX_FILTER_H__ */ -- cgit v1.2.3 From 66e3a13e7c2c44d0c9dd6bb244680ca7529a8845 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Wed, 1 Mar 2023 07:49:52 -0800 Subject: bpf: Add bpf_dynptr_slice and bpf_dynptr_slice_rdwr Two new kfuncs are added, bpf_dynptr_slice and bpf_dynptr_slice_rdwr. The user must pass in a buffer to store the contents of the data slice if a direct pointer to the data cannot be obtained. For skb and xdp type dynptrs, these two APIs are the only way to obtain a data slice. However, for other types of dynptrs, there is no difference between bpf_dynptr_slice(_rdwr) and bpf_dynptr_data. For skb type dynptrs, the data is copied into the user provided buffer if any of the data is not in the linear portion of the skb. For xdp type dynptrs, the data is copied into the user provided buffer if the data is between xdp frags. If the skb is cloned and a call to bpf_dynptr_data_rdwr is made, then the skb will be uncloned (see bpf_unclone_prologue()). Please note that any bpf_dynptr_write() automatically invalidates any prior data slices of the skb dynptr. This is because the skb may be cloned or may need to pull its paged buffer into the head. As such, any bpf_dynptr_write() will automatically have its prior data slices invalidated, even if the write is to data in the skb head of an uncloned skb. Please note as well that any other helper calls that change the underlying packet buffer (eg bpf_skb_pull_data()) invalidates any data slices of the skb dynptr as well, for the same reasons. Signed-off-by: Joanne Koong Link: https://lore.kernel.org/r/20230301154953.641654-10-joannelkoong@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 3f6992261ec5..efa5d4a1677e 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1548,6 +1548,9 @@ int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags); int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len); int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len); +void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len); +void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, + void *buf, unsigned long len, bool flush); #else /* CONFIG_NET */ static inline int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len) @@ -1572,6 +1575,17 @@ static inline int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, { return -EOPNOTSUPP; } + +static inline void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len) +{ + return NULL; +} + +static inline void *bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, void *buf, + unsigned long len, bool flush) +{ + return NULL; +} #endif /* CONFIG_NET */ #endif /* __LINUX_FILTER_H__ */ -- cgit v1.2.3 From 9db44fdd8105da00669d425acab887c668df75f6 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 25 Feb 2023 16:40:09 +0100 Subject: bpf: Support kptrs in local storage maps Enable support for kptrs in local storage maps by wiring up the freeing of these kptrs from map value. Freeing of bpf_local_storage_map is only delayed in case there are special fields, therefore bpf_selem_free_* path can also only dereference smap safely in that case. This is recorded using a bool utilizing a hole in bpF_local_storage_elem. It could have been tagged in the pointer value smap using the lowest bit (since alignment > 1), but since there was already a hole I went with the simpler option. Only the map structure freeing is delayed using RCU barriers, as the buckets aren't used when selem is being freed, so they can be freed once all readers of the bucket lists can no longer access it. Cc: Martin KaFai Lau Cc: KP Singh Cc: Paul E. McKenney Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20230225154010.391965-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_local_storage.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 6d37a40cd90e..0fe92986412b 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -74,6 +74,12 @@ struct bpf_local_storage_elem { struct hlist_node snode; /* Linked to bpf_local_storage */ struct bpf_local_storage __rcu *local_storage; struct rcu_head rcu; + bool can_use_smap; /* Is it safe to access smap in bpf_selem_free_* RCU + * callbacks? bpf_local_storage_map_free only + * executes rcu_barrier when there are special + * fields, this field remembers that to ensure we + * don't access already freed smap in sdata. + */ /* 8 bytes hole */ /* The data is stored in another cacheline to minimize * the number of cachelines access during a cache hit. -- cgit v1.2.3 From 20c09d92faeefb8536f705d3a4629e0dc314c8a1 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 2 Mar 2023 20:14:43 -0800 Subject: bpf: Introduce kptr_rcu. The life time of certain kernel structures like 'struct cgroup' is protected by RCU. Hence it's safe to dereference them directly from __kptr tagged pointers in bpf maps. The resulting pointer is MEM_RCU and can be passed to kfuncs that expect KF_RCU. Derefrence of other kptr-s returns PTR_UNTRUSTED. For example: struct map_value { struct cgroup __kptr *cgrp; }; SEC("tp_btf/cgroup_mkdir") int BPF_PROG(test_cgrp_get_ancestors, struct cgroup *cgrp_arg, const char *path) { struct cgroup *cg, *cg2; cg = bpf_cgroup_acquire(cgrp_arg); // cg is PTR_TRUSTED and ref_obj_id > 0 bpf_kptr_xchg(&v->cgrp, cg); cg2 = v->cgrp; // This is new feature introduced by this patch. // cg2 is PTR_MAYBE_NULL | MEM_RCU. // When cg2 != NULL, it's a valid cgroup, but its percpu_ref could be zero if (cg2) bpf_cgroup_ancestor(cg2, level); // safe to do. } Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Tejun Heo Acked-by: David Vernet Link: https://lore.kernel.org/bpf/20230303041446.3630-4-alexei.starovoitov@gmail.com --- include/linux/btf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index 49e0fe6d8274..556b3e2e7471 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -70,7 +70,7 @@ #define KF_TRUSTED_ARGS (1 << 4) /* kfunc only takes trusted pointer arguments */ #define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */ #define KF_DESTRUCTIVE (1 << 6) /* kfunc performs destructive actions */ -#define KF_RCU (1 << 7) /* kfunc only takes rcu pointer arguments */ +#define KF_RCU (1 << 7) /* kfunc takes either rcu or trusted pointer arguments */ /* * Tag marking a kernel function as a kfunc. This is meant to minimize the -- cgit v1.2.3 From 6fcd486b3a0a628c41f12b3a7329a18a2c74b351 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 2 Mar 2023 20:14:46 -0800 Subject: bpf: Refactor RCU enforcement in the verifier. bpf_rcu_read_lock/unlock() are only available in clang compiled kernels. Lack of such key mechanism makes it impossible for sleepable bpf programs to use RCU pointers. Allow bpf_rcu_read_lock/unlock() in GCC compiled kernels (though GCC doesn't support btf_type_tag yet) and allowlist certain field dereferences in important data structures like tast_struct, cgroup, socket that are used by sleepable programs either as RCU pointer or full trusted pointer (which is valid outside of RCU CS). Use BTF_TYPE_SAFE_RCU and BTF_TYPE_SAFE_TRUSTED macros for such tagging. They will be removed once GCC supports btf_type_tag. With that refactor check_ptr_to_btf_access(). Make it strict in enforcing PTR_TRUSTED and PTR_UNTRUSTED while deprecating old PTR_TO_BTF_ID without modifier flags. There is a chance that this strict enforcement might break existing programs (especially on GCC compiled kernels), but this cleanup has to start sooner than later. Note PTR_TO_CTX access still yields old deprecated PTR_TO_BTF_ID. Once it's converted to strict PTR_TRUSTED or PTR_UNTRUSTED the kfuncs and helpers will be able to default to KF_TRUSTED_ARGS. KF_RCU will remain as a weaker version of KF_TRUSTED_ARGS where obj refcnt could be 0. Adjust rcu_read_lock selftest to run on gcc and clang compiled kernels. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: David Vernet Link: https://lore.kernel.org/bpf/20230303041446.3630-7-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 2 +- include/linux/bpf_verifier.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 23ec684e660d..d3456804f7aa 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2279,7 +2279,7 @@ struct bpf_core_ctx { bool btf_nested_type_is_trusted(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, - int off); + int off, const char *suffix); bool btf_type_ids_nocast_alias(struct bpf_verifier_log *log, const struct btf *reg_btf, u32 reg_id, diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index b26ff2a8f63b..18538bad2b8c 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -537,7 +537,6 @@ struct bpf_verifier_env { bool bypass_spec_v1; bool bypass_spec_v4; bool seen_direct_write; - bool rcu_tag_supported; struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ const struct bpf_line_info *prev_linfo; struct bpf_verifier_log log; -- cgit v1.2.3 From e768e3c5aab44ee63f58649d4c8cbbb3270e5c06 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 3 Mar 2023 15:15:42 +0100 Subject: bpf: Use separate RCU callbacks for freeing selem Martin suggested that instead of using a byte in the hole (which he has a use for in his future patch) in bpf_local_storage_elem, we can dispatch a different call_rcu callback based on whether we need to free special fields in bpf_local_storage_elem data. The free path, described in commit 9db44fdd8105 ("bpf: Support kptrs in local storage maps"), only waits for call_rcu callbacks when there are special (kptrs, etc.) fields in the map value, hence it is necessary that we only access smap in this case. Therefore, dispatch different RCU callbacks based on the BPF map has a valid btf_record, which dereference and use smap's btf_record only when it is valid. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20230303141542.300068-1-memxor@gmail.com Signed-off-by: Martin KaFai Lau --- include/linux/bpf_local_storage.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 0fe92986412b..6d37a40cd90e 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -74,12 +74,6 @@ struct bpf_local_storage_elem { struct hlist_node snode; /* Linked to bpf_local_storage */ struct bpf_local_storage __rcu *local_storage; struct rcu_head rcu; - bool can_use_smap; /* Is it safe to access smap in bpf_selem_free_* RCU - * callbacks? bpf_local_storage_map_free only - * executes rcu_barrier when there are special - * fields, this field remembers that to ensure we - * don't access already freed smap in sdata. - */ /* 8 bytes hole */ /* The data is stored in another cacheline to minimize * the number of cachelines access during a cache hit. -- cgit v1.2.3 From 2d5bcdcda8799cf21f9ab84598c946dd320207a2 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 7 Mar 2023 08:14:06 -0700 Subject: bpf: Increase size of BTF_ID_LIST without CONFIG_DEBUG_INFO_BTF again After commit 66e3a13e7c2c ("bpf: Add bpf_dynptr_slice and bpf_dynptr_slice_rdwr"), clang builds without CONFIG_DEBUG_INFO_BTF warn: kernel/bpf/verifier.c:10298:24: warning: array index 16 is past the end of the array (that has type 'u32[16]' (aka 'unsigned int[16]')) [-Warray-bounds] meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice_rdwr]) { ^ ~~~~~~~~~~~~~~~~~~~~~~~~ kernel/bpf/verifier.c:9150:1: note: array 'special_kfunc_list' declared here BTF_ID_LIST(special_kfunc_list) ^ include/linux/btf_ids.h:207:27: note: expanded from macro 'BTF_ID_LIST' #define BTF_ID_LIST(name) static u32 __maybe_unused name[16]; ^ 1 warning generated. A warning of this nature was previously addressed by commit beb3d47d1d3d ("bpf: Fix a BTF_ID_LIST bug with CONFIG_DEBUG_INFO_BTF not set") but there have been new kfuncs added since then. Quadruple the size of the CONFIG_DEBUG_INFO_BTF=n definition so that this problem is unlikely to show up for some time. Link: https://github.com/ClangBuiltLinux/linux/issues/1810 Signed-off-by: Nathan Chancellor Reviewed-by: Tom Rix Link: https://lore.kernel.org/r/20230307-bpf-kfuncs-warray-bounds-v1-1-00ad3191f3a6@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/btf_ids.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index 3a4f7cd882ca..00950cc03bff 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -204,7 +204,7 @@ extern struct btf_id_set8 name; #else -#define BTF_ID_LIST(name) static u32 __maybe_unused name[16]; +#define BTF_ID_LIST(name) static u32 __maybe_unused name[64]; #define BTF_ID(prefix, name) #define BTF_ID_FLAGS(prefix, name, ...) #define BTF_ID_UNUSED -- cgit v1.2.3 From 90a5527d7686d3ebe0dd2a831356a6c7d7dc31bc Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 5 Mar 2023 12:45:58 +0000 Subject: bpf: add new map ops ->map_mem_usage Add a new map ops ->map_mem_usage to print the memory usage of a bpf map. This is a preparation for the followup change. Signed-off-by: Yafang Shao Link: https://lore.kernel.org/r/20230305124615.12358-2-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d3456804f7aa..9059520bbb5e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -161,6 +161,8 @@ struct bpf_map_ops { bpf_callback_t callback_fn, void *callback_ctx, u64 flags); + u64 (*map_mem_usage)(const struct bpf_map *map); + /* BTF id of struct allocated by map_alloc */ int *map_btf_id; -- cgit v1.2.3 From 7490b7f1c02ef825ef98f7230662049d4a464a21 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 5 Mar 2023 12:46:11 +0000 Subject: bpf, net: bpf_local_storage memory usage A new helper is introduced into bpf_local_storage map to calculate the memory usage. This helper is also used by other maps like bpf_cgrp_storage, bpf_inode_storage, bpf_task_storage and etc. Note that currently the dynamically allocated storage elements are not counted in the usage, since it will take extra runtime overhead in the elements update or delete path. So let's put it aside now, and implement it in the future when someone really need it. Signed-off-by: Yafang Shao Link: https://lore.kernel.org/r/20230305124615.12358-15-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_local_storage.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 6d37a40cd90e..d934248b8e81 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -164,5 +164,6 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, void *value, u64 map_flags, gfp_t gfp_flags); void bpf_local_storage_free_rcu(struct rcu_head *rcu); +u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map); #endif /* _BPF_LOCAL_STORAGE_H */ -- cgit v1.2.3 From 9629363cd05642fe43aded44938adec067ad1da3 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 5 Mar 2023 12:46:14 +0000 Subject: bpf: offload map memory usage A new helper is introduced to calculate offload map memory usage. But currently the memory dynamically allocated in netdev dev_ops, like nsim_map_update_elem, is not counted. Let's just put it aside now. Signed-off-by: Yafang Shao Link: https://lore.kernel.org/r/20230305124615.12358-18-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9059520bbb5e..6792a7940e1e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2624,6 +2624,7 @@ static inline bool bpf_map_is_offloaded(struct bpf_map *map) struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr); void bpf_map_offload_map_free(struct bpf_map *map); +u64 bpf_map_offload_map_mem_usage(const struct bpf_map *map); int bpf_prog_test_run_syscall(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); @@ -2695,6 +2696,11 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map) { } +static inline u64 bpf_map_offload_map_mem_usage(const struct bpf_map *map) +{ + return 0; +} + static inline int bpf_prog_test_run_syscall(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) -- cgit v1.2.3 From 0194b64578e905dc8f112e641a71c306bd58ddde Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 6 Mar 2023 22:51:35 +0100 Subject: net: phy: improve phy_read_poll_timeout cond sometimes is (val & MASK) what may result in a false positive if val is a negative errno. We shouldn't evaluate cond if val < 0. This has no functional impact here, but it's not nice. Therefore switch order of the checks. Signed-off-by: Heiner Kallweit Reviewed-by: Simon Horman Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/6d8274ac-4344-23b4-d9a3-cad4c39517d4@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 36bf0bbc8efa..fefd5091bc24 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1130,16 +1130,15 @@ static inline int phy_read(struct phy_device *phydev, u32 regnum) #define phy_read_poll_timeout(phydev, regnum, val, cond, sleep_us, \ timeout_us, sleep_before_read) \ ({ \ - int __ret = read_poll_timeout(phy_read, val, (cond) || val < 0, \ + int __ret = read_poll_timeout(phy_read, val, val < 0 || (cond), \ sleep_us, timeout_us, sleep_before_read, phydev, regnum); \ - if (val < 0) \ + if (val < 0) \ __ret = val; \ if (__ret) \ phydev_err(phydev, "%s failed: %d\n", __func__, __ret); \ __ret; \ }) - /** * __phy_read - convenience function for reading a given PHY register * @phydev: the phy_device struct -- cgit v1.2.3 From 40bbae583ec38ea31e728bf42a4ea72bded22ab6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 6 Mar 2023 20:43:13 +0000 Subject: net: remove enum skb_free_reason enum skb_drop_reason is more generic, we can adopt it instead. Provide dev_kfree_skb_irq_reason() and dev_kfree_skb_any_reason(). This means drivers can use more precise drop reasons if they want to. Signed-off-by: Eric Dumazet Reviewed-by: Simon Horman Reviewed-by: Yunsheng Lin Link: https://lore.kernel.org/r/20230306204313.10492-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6a14b7b11766..ee483071cf59 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -52,6 +52,7 @@ #include #include #include +#include struct netpoll_info; struct device; @@ -3804,13 +3805,8 @@ static inline unsigned int get_netdev_rx_queue_index( int netif_get_num_default_rss_queues(void); -enum skb_free_reason { - SKB_REASON_CONSUMED, - SKB_REASON_DROPPED, -}; - -void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason); -void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason); +void dev_kfree_skb_irq_reason(struct sk_buff *skb, enum skb_drop_reason reason); +void dev_kfree_skb_any_reason(struct sk_buff *skb, enum skb_drop_reason reason); /* * It is not allowed to call kfree_skb() or consume_skb() from hardware @@ -3833,22 +3829,22 @@ void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason); */ static inline void dev_kfree_skb_irq(struct sk_buff *skb) { - __dev_kfree_skb_irq(skb, SKB_REASON_DROPPED); + dev_kfree_skb_irq_reason(skb, SKB_DROP_REASON_NOT_SPECIFIED); } static inline void dev_consume_skb_irq(struct sk_buff *skb) { - __dev_kfree_skb_irq(skb, SKB_REASON_CONSUMED); + dev_kfree_skb_irq_reason(skb, SKB_CONSUMED); } static inline void dev_kfree_skb_any(struct sk_buff *skb) { - __dev_kfree_skb_any(skb, SKB_REASON_DROPPED); + dev_kfree_skb_any_reason(skb, SKB_DROP_REASON_NOT_SPECIFIED); } static inline void dev_consume_skb_any(struct sk_buff *skb) { - __dev_kfree_skb_any(skb, SKB_REASON_CONSUMED); + dev_kfree_skb_any_reason(skb, SKB_CONSUMED); } u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, -- cgit v1.2.3 From 10369080454d87ee5b2db211ce947cb3118f0e13 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 7 Mar 2023 14:59:59 +0000 Subject: net: reclaim skb->scm_io_uring bit Commit 0091bfc81741 ("io_uring/af_unix: defer registered files gc to io_uring release") added one bit to struct sk_buff. This structure is critical for networking, and we try very hard to not add bloat on it, unless absolutely required. For instance, we can use a specific destructor as a wrapper around unix_destruct_scm(), to identify skbs that unix_gc() has to special case. Signed-off-by: Eric Dumazet Cc: Pavel Begunkov Cc: Thadeu Lima de Souza Cascardo Cc: Jens Axboe Reviewed-by: Jens Axboe Reviewed-by: Pavel Begunkov Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ff7ad331fb82..fe661011644b 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -810,7 +810,6 @@ typedef unsigned char *sk_buff_data_t; * @csum_level: indicates the number of consecutive checksums found in * the packet minus one that have been verified as * CHECKSUM_UNNECESSARY (max 3) - * @scm_io_uring: SKB holds io_uring registered files * @dst_pending_confirm: need to confirm neighbour * @decrypted: Decrypted SKB * @slow_gro: state present at GRO time, slower prepare step required @@ -989,7 +988,6 @@ struct sk_buff { #endif __u8 slow_gro:1; __u8 csum_not_inet:1; - __u8 scm_io_uring:1; #ifdef CONFIG_NET_SCHED __u16 tc_index; /* traffic control index */ -- cgit v1.2.3 From 28e144cf5f72ce1c304571bc448e37c27495903a Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 7 Mar 2023 16:31:30 -0500 Subject: netfilter: move br_nf_check_hbh_len to utils Rename br_nf_check_hbh_len() to nf_ip6_check_hbh_len() and move it to netfilter utils, so that it can be used by other modules, like ovs and tc. Signed-off-by: Xin Long Reviewed-by: Simon Horman Reviewed-by: Nikolay Aleksandrov Reviewed-by: Aaron Conole Signed-off-by: Florian Westphal --- include/linux/netfilter_ipv6.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h index 48314ade1506..7834c0be2831 100644 --- a/include/linux/netfilter_ipv6.h +++ b/include/linux/netfilter_ipv6.h @@ -197,6 +197,8 @@ static inline int nf_cookie_v6_check(const struct ipv6hdr *iph, __sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, u_int8_t protocol); +int nf_ip6_check_hbh_len(struct sk_buff *skb, u32 *plen); + int ipv6_netfilter_init(void); void ipv6_netfilter_fini(void); -- cgit v1.2.3 From 215bf4962f6c9605710012fad222a5fec001b3ad Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 8 Mar 2023 10:41:15 -0800 Subject: bpf: add iterator kfuncs registration and validation logic Add ability to register kfuncs that implement BPF open-coded iterator contract and enforce naming and function proto convention. Enforcement happens at the time of kfunc registration and significantly simplifies the rest of iterators logic in the verifier. More details follow in subsequent patches, but we enforce the following conditions. All kfuncs (constructor, next, destructor) have to be named consistenly as bpf_iter__{new,next,destroy}(), respectively. represents iterator type, and iterator state should be represented as a matching `struct bpf_iter_` state type. Also, all iter kfuncs should have a pointer to this `struct bpf_iter_` as the very first argument. Additionally: - Constructor, i.e., bpf_iter__new(), can have arbitrary extra number of arguments. Return type is not enforced either. - Next method, i.e., bpf_iter__next(), has to return a pointer type and should have exactly one argument: `struct bpf_iter_ *` (const/volatile/restrict and typedefs are ignored). - Destructor, i.e., bpf_iter__destroy(), should return void and should have exactly one argument, similar to the next method. - struct bpf_iter_ size is enforced to be positive and a multiple of 8 bytes (to fit stack slots correctly). Such strictness and consistency allows to build generic helpers abstracting important, but boilerplate, details to be able to use open-coded iterators effectively and ergonomically (see bpf_for_each() in subsequent patches). It also simplifies the verifier logic in some places. At the same time, this doesn't hurt generality of possible iterator implementations. Win-win. Constructor kfunc is marked with a new KF_ITER_NEW flags, next method is marked with KF_ITER_NEXT (and should also have KF_RET_NULL, of course), while destructor kfunc is marked as KF_ITER_DESTROY. Additionally, we add a trivial kfunc name validation: it should be a valid non-NULL and non-empty string. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20230308184121.1165081-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 ++ include/linux/btf.h | 4 ++++ 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 18538bad2b8c..e2dc7f064449 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -59,6 +59,8 @@ struct bpf_active_lock { u32 id; }; +#define ITER_PREFIX "bpf_iter_" + struct bpf_reg_state { /* Ordering of fields matters. See states_equal() */ enum bpf_reg_type type; diff --git a/include/linux/btf.h b/include/linux/btf.h index 556b3e2e7471..1bba0827e8c4 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -71,6 +71,10 @@ #define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */ #define KF_DESTRUCTIVE (1 << 6) /* kfunc performs destructive actions */ #define KF_RCU (1 << 7) /* kfunc takes either rcu or trusted pointer arguments */ +/* only one of KF_ITER_{NEW,NEXT,DESTROY} could be specified per kfunc */ +#define KF_ITER_NEW (1 << 8) /* kfunc implements BPF iter constructor */ +#define KF_ITER_NEXT (1 << 9) /* kfunc implements BPF iter next method */ +#define KF_ITER_DESTROY (1 << 10) /* kfunc implements BPF iter destructor */ /* * Tag marking a kernel function as a kfunc. This is meant to minimize the -- cgit v1.2.3 From 06accc8779c1d558a5b5a21f2ac82b0c95827ddd Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 8 Mar 2023 10:41:16 -0800 Subject: bpf: add support for open-coded iterator loops Teach verifier about the concept of the open-coded (or inline) iterators. This patch adds generic iterator loop verification logic, new STACK_ITER stack slot type to contain iterator state, and necessary kfunc plumbing for iterator's constructor, destructor and next methods. Next patch implements first specific iterator (numbers iterator for implementing for() loop logic). Such split allows to have more focused commits for verifier logic and separate commit that we could point later to demonstrating what does it take to add a new kind of iterator. Each kind of iterator has its own associated struct bpf_iter_, where denotes a specific type of iterator. struct bpf_iter_ state is supposed to live on BPF program stack, so there will be no way to change its size later on without breaking backwards compatibility, so choose wisely! But given this struct is specific to a given of iterator, this allows a lot of flexibility: simple iterators could be fine with just one stack slot (8 bytes), like numbers iterator in the next patch, while some other more complicated iterators might need way more to keep their iterator state. Either way, such design allows to avoid runtime memory allocations, which otherwise would be necessary if we fixed on-the-stack size and it turned out to be too small for a given iterator implementation. The way BPF verifier logic is implemented, there are no artificial restrictions on a number of active iterators, it should work correctly using multiple active iterators at the same time. This also means you can have multiple nested iteration loops. struct bpf_iter_ reference can be safely passed to subprograms as well. General flow is easiest to demonstrate with a simple example using number iterator implemented in next patch. Here's the simplest possible loop: struct bpf_iter_num it; int *v; bpf_iter_num_new(&it, 2, 5); while ((v = bpf_iter_num_next(&it))) { bpf_printk("X = %d", *v); } bpf_iter_num_destroy(&it); Above snippet should output "X = 2", "X = 3", "X = 4". Note that 5 is exclusive and is not returned. This matches similar APIs (e.g., slices in Go or Rust) that implement a range of elements, where end index is non-inclusive. In the above example, we see a trio of function: - constructor, bpf_iter_num_new(), which initializes iterator state (struct bpf_iter_num it) on the stack. If any of the input arguments are invalid, constructor should make sure to still initialize it such that subsequent bpf_iter_num_next() calls will return NULL. I.e., on error, return error and construct empty iterator. - next method, bpf_iter_num_next(), which accepts pointer to iterator state and produces an element. Next method should always return a pointer. The contract between BPF verifier is that next method will always eventually return NULL when elements are exhausted. Once NULL is returned, subsequent next calls should keep returning NULL. In the case of numbers iterator, bpf_iter_num_next() returns a pointer to an int (storage for this integer is inside the iterator state itself), which can be dereferenced after corresponding NULL check. - once done with the iterator, it's mandated that user cleans up its state with the call to destructor, bpf_iter_num_destroy() in this case. Destructor frees up any resources and marks stack space used by struct bpf_iter_num as usable for something else. Any other iterator implementation will have to implement at least these three methods. It is enforced that for any given type of iterator only applicable constructor/destructor/next are callable. I.e., verifier ensures you can't pass number iterator state into, say, cgroup iterator's next method. It is important to keep the naming pattern consistent to be able to create generic macros to help with BPF iter usability. E.g., one of the follow up patches adds generic bpf_for_each() macro to bpf_misc.h in selftests, which allows to utilize iterator "trio" nicely without having to code the above somewhat tedious loop explicitly every time. This is enforced at kfunc registration point by one of the previous patches in this series. At the implementation level, iterator state tracking for verification purposes is very similar to dynptr. We add STACK_ITER stack slot type, reserve necessary number of slots, depending on sizeof(struct bpf_iter_), and keep track of necessary extra state in the "main" slot, which is marked with non-zero ref_obj_id. Other slots are also marked as STACK_ITER, but have zero ref_obj_id. This is simpler than having a separate "is_first_slot" flag. Another big distinction is that STACK_ITER is *always refcounted*, which simplifies implementation without sacrificing usability. So no need for extra "iter_id", no need to anticipate reuse of STACK_ITER slots for new constructors, etc. Keeping it simple here. As far as the verification logic goes, there are two extensive comments: in process_iter_next_call() and iter_active_depths_differ() explaining some important and sometimes subtle aspects. Please refer to them for details. But from 10,000-foot point of view, next methods are the points of forking a verification state, which are conceptually similar to what verifier is doing when validating conditional jump. We branch out at a `call bpf_iter__next` instruction and simulate two outcomes: NULL (iteration is done) and non-NULL (new element is returned). NULL is simulated first and is supposed to reach exit without looping. After that non-NULL case is validated and it either reaches exit (for trivial examples with no real loop), or reaches another `call bpf_iter__next` instruction with the state equivalent to already (partially) validated one. State equivalency at that point means we technically are going to be looping forever without "breaking out" out of established "state envelope" (i.e., subsequent iterations don't add any new knowledge or constraints to the verifier state, so running 1, 2, 10, or a million of them doesn't matter). But taking into account the contract stating that iterator next method *has to* return NULL eventually, we can conclude that loop body is safe and will eventually terminate. Given we validated logic outside of the loop (NULL case), and concluded that loop body is safe (though potentially looping many times), verifier can claim safety of the overall program logic. The rest of the patch is necessary plumbing for state tracking, marking, validation, and necessary further kfunc plumbing to allow implementing iterator constructor, destructor, and next methods. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20230308184121.1165081-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index e2dc7f064449..0c052bc79940 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -61,6 +61,12 @@ struct bpf_active_lock { #define ITER_PREFIX "bpf_iter_" +enum bpf_iter_state { + BPF_ITER_STATE_INVALID, /* for non-first slot */ + BPF_ITER_STATE_ACTIVE, + BPF_ITER_STATE_DRAINED, +}; + struct bpf_reg_state { /* Ordering of fields matters. See states_equal() */ enum bpf_reg_type type; @@ -105,6 +111,18 @@ struct bpf_reg_state { bool first_slot; } dynptr; + /* For bpf_iter stack slots */ + struct { + /* BTF container and BTF type ID describing + * struct bpf_iter_ of an iterator state + */ + struct btf *btf; + u32 btf_id; + /* packing following two fields to fit iter state into 16 bytes */ + enum bpf_iter_state state:2; + int depth:30; + } iter; + /* Max size from any of the above. */ struct { unsigned long raw1; @@ -143,6 +161,8 @@ struct bpf_reg_state { * same reference to the socket, to determine proper reference freeing. * For stack slots that are dynptrs, this is used to track references to * the dynptr to determine proper reference freeing. + * Similarly to dynptrs, we use ID to track "belonging" of a reference + * to a specific instance of bpf_iter. */ u32 id; /* PTR_TO_SOCKET and PTR_TO_TCP_SOCK could be a ptr returned @@ -213,9 +233,11 @@ enum bpf_stack_slot_type { * is stored in bpf_stack_state->spilled_ptr.dynptr.type */ STACK_DYNPTR, + STACK_ITER, }; #define BPF_REG_SIZE 8 /* size of eBPF register in bytes */ + #define BPF_DYNPTR_SIZE sizeof(struct bpf_dynptr_kern) #define BPF_DYNPTR_NR_SLOTS (BPF_DYNPTR_SIZE / BPF_REG_SIZE) @@ -450,6 +472,7 @@ struct bpf_insn_aux_data { bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */ bool zext_dst; /* this insn zero extends dst reg */ bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */ + bool is_iter_next; /* bpf_iter__next() kfunc call */ u8 alu_state; /* used in combination with alu_limit */ /* below fields are initialized once */ -- cgit v1.2.3 From 6018e1f407cccf39b804d1f75ad4de7be4e6cc45 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 8 Mar 2023 10:41:17 -0800 Subject: bpf: implement numbers iterator Implement the first open-coded iterator type over a range of integers. It's public API consists of: - bpf_iter_num_new() constructor, which accepts [start, end) range (that is, start is inclusive, end is exclusive). - bpf_iter_num_next() which will keep returning read-only pointer to int until the range is exhausted, at which point NULL will be returned. If bpf_iter_num_next() is kept calling after this, NULL will be persistently returned. - bpf_iter_num_destroy() destructor, which needs to be called at some point to clean up iterator state. BPF verifier enforces that iterator destructor is called at some point before BPF program exits. Note that `start = end = X` is a valid combination to setup an empty iterator. bpf_iter_num_new() will return 0 (success) for any such combination. If bpf_iter_num_new() detects invalid combination of input arguments, it returns error, resets iterator state to, effectively, empty iterator, so any subsequent call to bpf_iter_num_next() will keep returning NULL. BPF verifier has no knowledge that returned integers are in the [start, end) value range, as both `start` and `end` are not statically known and enforced: they are runtime values. While the implementation is pretty trivial, some care needs to be taken to avoid overflows and underflows. Subsequent selftests will validate correctness of [start, end) semantics, especially around extremes (INT_MIN and INT_MAX). Similarly to bpf_loop(), we enforce that no more than BPF_MAX_LOOPS can be specified. bpf_iter_num_{new,next,destroy}() is a logical evolution from bounded BPF loops and bpf_loop() helper and is the basis for implementing ergonomic BPF loops with no statically known or verified bounds. Subsequent patches implement bpf_for() macro, demonstrating how this can be wrapped into something that works and feels like a normal for() loop in C language. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20230308184121.1165081-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6792a7940e1e..e64ff1e89fb2 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1617,8 +1617,12 @@ struct bpf_array { #define BPF_COMPLEXITY_LIMIT_INSNS 1000000 /* yes. 1M insns */ #define MAX_TAIL_CALL_CNT 33 -/* Maximum number of loops for bpf_loop */ -#define BPF_MAX_LOOPS BIT(23) +/* Maximum number of loops for bpf_loop and bpf_iter_num. + * It's enum to expose it (and thus make it discoverable) through BTF. + */ +enum { + BPF_MAX_LOOPS = 8 * 1024 * 1024, +}; #define BPF_F_ACCESS_MASK (BPF_F_RDONLY | \ BPF_F_RDONLY_PROG | \ -- cgit v1.2.3 From 6978052448f9eb19f7b03243ac0416104e5ee50d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 8 Mar 2023 15:20:06 +0100 Subject: netlink: remove unused 'compare' function No users in the tree. Tested with allmodconfig build. Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20230308142006.20879-1-fw@strlen.de Signed-off-by: Jakub Kicinski --- include/linux/netlink.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index c43ac7690eca..3e8743252167 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -50,7 +50,6 @@ struct netlink_kernel_cfg { struct mutex *cb_mutex; int (*bind)(struct net *net, int group); void (*unbind)(struct net *net, int group); - bool (*compare)(struct net *net, struct sock *sk); }; struct sock *__netlink_kernel_create(struct net *net, int unit, -- cgit v1.2.3 From 4b5ce570dbef57a20acdd71b0c65376009012354 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 9 Mar 2023 22:01:49 -0800 Subject: bpf: ensure state checkpointing at iter_next() call sites State equivalence check and checkpointing performed in is_state_visited() employs certain heuristics to try to save memory by avoiding state checkpoints if not enough jumps and instructions happened since last checkpoint. This leads to unpredictability of whether a particular instruction will be checkpointed and how regularly. While normally this is not causing much problems (except inconveniences for predictable verifier tests, which we overcome with BPF_F_TEST_STATE_FREQ flag), turns out it's not the case for open-coded iterators. Checking and saving state checkpoints at iter_next() call is crucial for fast convergence of open-coded iterator loop logic, so we need to force it. If we don't do that, is_state_visited() might skip saving a checkpoint, causing unnecessarily long sequence of not checkpointed instructions and jumps, leading to exhaustion of jump history buffer, and potentially other undesired outcomes. It is expected that with correct open-coded iterators convergence will happen quickly, so we don't run a risk of exhausting memory. This patch adds, in addition to prune and jump instruction marks, also a "forced checkpoint" mark, and makes sure that any iter_next() call instruction is marked as such. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20230310060149.625887-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 0c052bc79940..81d525d057c7 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -477,8 +477,12 @@ struct bpf_insn_aux_data { /* below fields are initialized once */ unsigned int orig_idx; /* original instruction index */ - bool prune_point; bool jmp_point; + bool prune_point; + /* ensure we check state equivalence and save state checkpoint and + * this instruction, regardless of any heuristics + */ + bool force_checkpoint; }; #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ -- cgit v1.2.3 From 4cbd23cc92c49173e402753cab62b8a7754ed18f Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 7 Mar 2023 22:59:20 -0800 Subject: bpf: Move a few bpf_local_storage functions to static scope This patch moves the bpf_local_storage_free_rcu() and bpf_selem_unlink_map() to static because they are not used outside of bpf_local_storage.c. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20230308065936.1550103-2-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_local_storage.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index d934248b8e81..502ad7093f13 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -147,8 +147,6 @@ void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu); void bpf_selem_link_map(struct bpf_local_storage_map *smap, struct bpf_local_storage_elem *selem); -void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem); - struct bpf_local_storage_elem * bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value, bool charge_mem, gfp_t gfp_flags); @@ -163,7 +161,6 @@ struct bpf_local_storage_data * bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, void *value, u64 map_flags, gfp_t gfp_flags); -void bpf_local_storage_free_rcu(struct rcu_head *rcu); u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map); #endif /* _BPF_LOCAL_STORAGE_H */ -- cgit v1.2.3 From 2ffcb6fc50174d1efc8f98633eb2647d84483c68 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 7 Mar 2023 22:59:21 -0800 Subject: bpf: Refactor codes into bpf_local_storage_destroy This patch first renames bpf_local_storage_unlink_nolock to bpf_local_storage_destroy(). It better reflects that it is only used when the storage's owner (sk/task/cgrp/inode) is being kfree(). All bpf_local_storage_destroy's caller is taking the spin lock and then free the storage. This patch also moves these two steps into the bpf_local_storage_destroy. This is a preparation work for a later patch that uses bpf_mem_cache_alloc/free in the bpf_local_storage. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20230308065936.1550103-3-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_local_storage.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 502ad7093f13..5908a954ddc2 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -128,7 +128,7 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage, struct bpf_local_storage_map *smap, bool cacheit_lockit); -bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage); +void bpf_local_storage_destroy(struct bpf_local_storage *local_storage); void bpf_local_storage_map_free(struct bpf_map *map, struct bpf_local_storage_cache *cache, -- cgit v1.2.3 From fc6652aab6ad545de70b772550da9043d0b47f1c Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 7 Mar 2023 22:59:24 -0800 Subject: bpf: Remember smap in bpf_local_storage This patch remembers which smap triggers the allocation of a 'struct bpf_local_storage' object. The local_storage is allocated during the very first selem added to the owner. The smap pointer is needed when using the bpf_mem_cache_free in a later patch because it needs to free to the correct smap's bpf_mem_alloc object. When a selem is being removed, it needs to check if it is the selem that triggers the creation of the local_storage. If it is, the local_storage->smap pointer will be reset to NULL. This NULL reset is done under the local_storage->lock in bpf_selem_unlink_storage_nolock() when a selem is being removed. Also note that the local_storage may not go away even local_storage->smap is NULL because there may be other selem still stored in the local_storage. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20230308065936.1550103-6-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_local_storage.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 5908a954ddc2..613b1805ed9f 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -83,6 +83,7 @@ struct bpf_local_storage_elem { struct bpf_local_storage { struct bpf_local_storage_data __rcu *cache[BPF_LOCAL_STORAGE_CACHE_SIZE]; + struct bpf_local_storage_map __rcu *smap; struct hlist_head list; /* List of bpf_local_storage_elem */ void *owner; /* The object that owns the above "list" of * bpf_local_storage_elem. -- cgit v1.2.3 From a47eabf216f77cb6f22ceb38d46f1bb95968579c Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 7 Mar 2023 22:59:25 -0800 Subject: bpf: Repurpose use_trace_rcu to reuse_now in bpf_local_storage This patch re-purpose the use_trace_rcu to mean if the freed memory can be reused immediately or not. The use_trace_rcu is renamed to reuse_now. Other than the boolean test is reversed, it should be a no-op. The following explains the reason for the rename and how it will be used in a later patch. In a later patch, bpf_mem_cache_alloc/free will be used in the bpf_local_storage. The bpf mem allocator will reuse the freed memory immediately. Some of the free paths in bpf_local_storage does not support memory to be reused immediately. These paths are the "delete" elem cases from the bpf_*_storage_delete() helper and the map_delete_elem() syscall. Note that "delete" elem before the owner's (sk/task/cgrp/inode) lifetime ended is not the common usage for the local storage. The common free path, bpf_local_storage_destroy(), can reuse the memory immediately. This common path means the storage stays with its owner until the owner is destroyed. The above mentioned "delete" elem paths that cannot reuse immediately always has the 'use_trace_rcu == true'. The cases that is safe for immediate reuse always have 'use_trace_rcu == false'. Instead of adding another arg in a later patch, this patch re-purpose this arg to reuse_now and have the test logic reversed. In a later patch, 'reuse_now == true' will free to the bpf_mem_cache_free() where the memory can be reused immediately. 'reuse_now == false' will go through the call_rcu_tasks_trace(). Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20230308065936.1550103-7-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_local_storage.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 613b1805ed9f..18a31add2255 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -143,7 +143,7 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map, void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem); -void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu); +void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now); void bpf_selem_link_map(struct bpf_local_storage_map *smap, struct bpf_local_storage_elem *selem); -- cgit v1.2.3 From c0d63f309186d8492577c67c67984c714b6b72bc Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 7 Mar 2023 22:59:28 -0800 Subject: bpf: Add bpf_selem_free() This patch refactors the selem freeing logic into bpf_selem_free(). It is a preparation work for a later patch using bpf_mem_cache_alloc/free. The other kfree(selem) cases are also changed to bpf_selem_free(..., reuse_now = true). Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20230308065936.1550103-10-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_local_storage.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 18a31add2255..a34f61467a2f 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -152,6 +152,10 @@ struct bpf_local_storage_elem * bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value, bool charge_mem, gfp_t gfp_flags); +void bpf_selem_free(struct bpf_local_storage_elem *selem, + struct bpf_local_storage_map *smap, + bool reuse_now); + int bpf_local_storage_alloc(void *owner, struct bpf_local_storage_map *smap, -- cgit v1.2.3 From 74843b57ec70af7b67b7e6153374834ee18d139f Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Thu, 9 Mar 2023 10:01:08 -0800 Subject: bpf: Change btf_record_find enum parameter to field_mask btf_record_find's 3rd parameter can be multiple enum btf_field_type's masked together. The function is called with BPF_KPTR in two places in verifier.c, so it works with masked values already. Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20230309180111.1618459-4-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e64ff1e89fb2..3a38db315f7f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1925,7 +1925,7 @@ void bpf_prog_free_id(struct bpf_prog *prog); void bpf_map_free_id(struct bpf_map *map); struct btf_field *btf_record_find(const struct btf_record *rec, - u32 offset, enum btf_field_type type); + u32 offset, u32 field_mask); void btf_record_free(struct btf_record *rec); void bpf_map_free_record(struct bpf_map *map); struct btf_record *btf_record_dup(const struct btf_record *rec); -- cgit v1.2.3 From c8e18754091479fac3f5b6c053c6bc4be0b7fb11 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Fri, 10 Mar 2023 15:07:41 -0800 Subject: bpf: Support __kptr to local kptrs If a PTR_TO_BTF_ID type comes from program BTF - not vmlinux or module BTF - it must have been allocated by bpf_obj_new and therefore must be free'd with bpf_obj_drop. Such a PTR_TO_BTF_ID is considered a "local kptr" and is tagged with MEM_ALLOC type tag by bpf_obj_new. This patch adds support for treating __kptr-tagged pointers to "local kptrs" as having an implicit bpf_obj_drop destructor for referenced kptr acquire / release semantics. Consider the following example: struct node_data { long key; long data; struct bpf_rb_node node; }; struct map_value { struct node_data __kptr *node; }; struct { __uint(type, BPF_MAP_TYPE_ARRAY); __type(key, int); __type(value, struct map_value); __uint(max_entries, 1); } some_nodes SEC(".maps"); If struct node_data had a matching definition in kernel BTF, the verifier would expect a destructor for the type to be registered. Since struct node_data does not match any type in kernel BTF, the verifier knows that there is no kfunc that provides a PTR_TO_BTF_ID to this type, and that such a PTR_TO_BTF_ID can only come from bpf_obj_new. So instead of searching for a registered dtor, a bpf_obj_drop dtor can be assumed. This allows the runtime to properly destruct such kptrs in bpf_obj_free_fields, which enables maps to clean up map_vals w/ such kptrs when going away. Implementation notes: * "kernel_btf" variable is renamed to "kptr_btf" in btf_parse_kptr. Before this patch, the variable would only ever point to vmlinux or module BTFs, but now it can point to some program BTF for local kptr type. It's later used to populate the (btf, btf_id) pair in kptr btf field. * It's necessary to btf_get the program BTF when populating btf_field for local kptr. btf_record_free later does a btf_put. * Behavior for non-local referenced kptrs is not modified, as bpf_find_btf_id helper only searches vmlinux and module BTFs for matching BTF type. If such a type is found, btf_field_kptr's btf will pass btf_is_kernel check, and the associated release function is some one-argument dtor. If btf_is_kernel check fails, associated release function is two-arg bpf_obj_drop_impl. Before this patch only btf_field_kptr's w/ kernel or module BTFs were created. Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20230310230743.2320707-2-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 11 ++++++++++- include/linux/btf.h | 2 -- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3a38db315f7f..756b85f0d0d3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -189,10 +189,19 @@ enum btf_field_type { BPF_RB_NODE | BPF_RB_ROOT, }; +typedef void (*btf_dtor_kfunc_t)(void *); +typedef void (*btf_dtor_obj_drop)(void *, const struct btf_record *); + struct btf_field_kptr { struct btf *btf; struct module *module; - btf_dtor_kfunc_t dtor; + union { + /* dtor used if btf_is_kernel(btf), otherwise the type + * is program-allocated and obj_drop is used + */ + btf_dtor_kfunc_t dtor; + btf_dtor_obj_drop obj_drop; + }; u32 btf_id; }; diff --git a/include/linux/btf.h b/include/linux/btf.h index 1bba0827e8c4..d53b10cc55f2 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -121,8 +121,6 @@ struct btf_struct_metas { struct btf_struct_meta types[]; }; -typedef void (*btf_dtor_kfunc_t)(void *); - extern const struct file_operations btf_fops; void btf_get(struct btf *btf); -- cgit v1.2.3 From 22df776a9a866713d9decfb92b633bcfdb571954 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Wed, 15 Feb 2023 17:30:33 -0600 Subject: tasks: Extract rcu_users out of union In commit 3fbd7ee285b2b ("tasks: Add a count of task RCU users"), a count on the number of RCU users was added to struct task_struct. This was done so as to enable the removal of task_rcu_dereference(), and allow tasks to be protected by RCU even after exiting and being removed from the runqueue. In this commit, the 'refcount_t rcu_users' field that keeps track of this refcount was put into a union co-located with 'struct rcu_head rcu', so as to avoid taking up any extra space in task_struct. This was possible to do safely, because the field was only ever decremented by a static set of specific callers, and then never incremented again. While this restriction of there only being a small, static set of users of this field has worked fine, it prevents us from leveraging the field to use RCU to protect tasks in other contexts. During tracing, for example, it would be useful to be able to collect some tasks that performed a certain operation, put them in a map, and then periodically summarize who they are, which cgroup they're in, how much CPU time they've utilized, etc. While this can currently be done with 'usage', it becomes tricky when a task is already in a map, or if a reference should only be taken if a task is valid and will not soon be reaped. Ideally, we could do something like pass a reference to a map value, and then try to acquire a reference to the task in an RCU read region by using refcount_inc_not_zero(). Similarly, in sched_ext, schedulers are using integer pids to remember tasks, and then looking them up with find_task_by_pid_ns(). This is slow, error prone, and adds complexity. It would be more convenient and performant if BPF schedulers could instead store tasks directly in maps, and then leverage RCU to ensure they can be safely accessed with low overhead. Finally, overloading fields like this is error prone. Someone that wants to use 'rcu_users' could easily overlook the fact that once the rcu callback is scheduled, the refcount will go back to being nonzero, thus precluding the use of refcount_inc_not_zero(). Furthermore, as described below, it's possible to extract the fields of the union without changing the size of task_struct. There are several possible ways to enable this: 1. The lightest touch approach is likely the one proposed in this patch, which is to simply extract 'rcu_users' and 'rcu' from the union, so that scheduling the 'rcu' callback doesn't overwrite the 'rcu_users' refcount. If we have a trusted task pointer, this would allow us to use refcnt_inc_not_zero() inside of an RCU region to determine if we can safely acquire a reference to the task and store it in a map. As mentioned below, this can be done without changing the size of task_struct, by moving the location of the union to another location that has padding gaps we can fill in. 2. Removing 'refcount_t rcu_users', and instead having the entire task be freed in an rcu callback. This is likely the most sound overall design, though it changes the behavioral semantics exposed to callers, who currently expect that a task that's successfully looked up in e.g. the pid_list with find_task_by_pid_ns(), can always have a 'usage' reference acquired on them, as it's guaranteed to be > 0 until after the next gp. In order for this approach to work, we'd have to audit all callers. This approach also slightly changes behavior observed by user space by not invoking trace_sched_process_free() until the whole task_struct is actually being freed, rather than just after it's exited. It also may change timings, as memory will be freed in an RCU callback rather than immediately when the final 'usage' refcount drops to 0. This also is arguably a benefit, as it provides more predictable performance to callers who are refcounting tasks. 3. There may be other solutions as well that don't require changing the layout of task_struct. For example, we could possibly do something complex from the BPF side, such as listen for task exit and remove a task from a map when the task is exiting. This would likely require significant custom handling for task_struct in the verifier, so a more generalizable solution is likely warranted. As mentioned above, this patch proposes the lightest-touch approach which allows callers elsewhere in the kernel to use 'rcu_users' to ensure the lifetime of a task, by extracting 'rcu_users' and 'rcu' from the union. There is no size change in task_struct with this patch. Cc: Linus Torvalds Cc: Eric W. Biederman Cc: Peter Zijlstra Cc: Ingo Molnar Signed-off-by: David Vernet Acked-by: Oleg Nesterov Link: https://lore.kernel.org/r/20230215233033.889644-1-void@manifault.com Signed-off-by: Alexei Starovoitov --- include/linux/sched.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 63d242164b1a..b11b4517760f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1318,11 +1318,6 @@ struct task_struct { struct tlbflush_unmap_batch tlb_ubc; - union { - refcount_t rcu_users; - struct rcu_head rcu; - }; - /* Cache last used pipe for splice(): */ struct pipe_inode_info *splice_pipe; @@ -1459,6 +1454,8 @@ struct task_struct { unsigned long saved_state_change; # endif #endif + struct rcu_head rcu; + refcount_t rcu_users; int pagefault_disabled; #ifdef CONFIG_MMU struct task_struct *oom_reaper_list; -- cgit v1.2.3 From 9e36a204bd43553a9cd4bd574612cd9a5df791ea Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Mon, 13 Mar 2023 14:46:41 -0700 Subject: bpf: Disable migration when freeing stashed local kptr using obj drop When a local kptr is stashed in a map and freed when the map goes away, currently an error like the below appears: [ 39.195695] BUG: using smp_processor_id() in preemptible [00000000] code: kworker/u32:15/2875 [ 39.196549] caller is bpf_mem_free+0x56/0xc0 [ 39.196958] CPU: 15 PID: 2875 Comm: kworker/u32:15 Tainted: G O 6.2.0-13016-g22df776a9a86 #4477 [ 39.197897] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014 [ 39.198949] Workqueue: events_unbound bpf_map_free_deferred [ 39.199470] Call Trace: [ 39.199703] [ 39.199911] dump_stack_lvl+0x60/0x70 [ 39.200267] check_preemption_disabled+0xbf/0xe0 [ 39.200704] bpf_mem_free+0x56/0xc0 [ 39.201032] ? bpf_obj_new_impl+0xa0/0xa0 [ 39.201430] bpf_obj_free_fields+0x1cd/0x200 [ 39.201838] array_map_free+0xad/0x220 [ 39.202193] ? finish_task_switch+0xe5/0x3c0 [ 39.202614] bpf_map_free_deferred+0xea/0x210 [ 39.203006] ? lockdep_hardirqs_on_prepare+0xe/0x220 [ 39.203460] process_one_work+0x64f/0xbe0 [ 39.203822] ? pwq_dec_nr_in_flight+0x110/0x110 [ 39.204264] ? do_raw_spin_lock+0x107/0x1c0 [ 39.204662] ? lockdep_hardirqs_on_prepare+0xe/0x220 [ 39.205107] worker_thread+0x74/0x7a0 [ 39.205451] ? process_one_work+0xbe0/0xbe0 [ 39.205818] kthread+0x171/0x1a0 [ 39.206111] ? kthread_complete_and_exit+0x20/0x20 [ 39.206552] ret_from_fork+0x1f/0x30 [ 39.206886] This happens because the call to __bpf_obj_drop_impl I added in the patch adding support for stashing local kptrs doesn't disable migration. Prior to that patch, __bpf_obj_drop_impl logic only ran when called by a BPF progarm, whereas now it can be called from map free path, so it's necessary to explicitly disable migration. Also, refactor a bit to just call __bpf_obj_drop_impl directly instead of bothering w/ dtor union and setting pointer-to-obj_drop. Fixes: c8e187540914 ("bpf: Support __kptr to local kptrs") Reported-by: Alexei Starovoitov Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20230313214641.3731908-1-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 756b85f0d0d3..71cc92a4ba48 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -190,18 +190,14 @@ enum btf_field_type { }; typedef void (*btf_dtor_kfunc_t)(void *); -typedef void (*btf_dtor_obj_drop)(void *, const struct btf_record *); struct btf_field_kptr { struct btf *btf; struct module *module; - union { - /* dtor used if btf_is_kernel(btf), otherwise the type - * is program-allocated and obj_drop is used - */ - btf_dtor_kfunc_t dtor; - btf_dtor_obj_drop obj_drop; - }; + /* dtor used if btf_is_kernel(btf), otherwise the type is + * program-allocated, dtor is NULL, and __bpf_obj_drop_impl is used + */ + btf_dtor_kfunc_t dtor; u32 btf_id; }; -- cgit v1.2.3 From 2c854e5fcd7e243f5a7cf6a6afa0ef83060c903c Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Mon, 13 Mar 2023 22:55:51 +0100 Subject: net: page_pool, skbuff: make skb_mark_for_recycle() always available skb_mark_for_recycle() is guarded with CONFIG_PAGE_POOL, this creates unneeded complication when using it in the generic code. For now, it's only used in the drivers always selecting Page Pool, so this works. Move the guards so that preprocessor will cut out only the operation itself and the function will still be a noop on !PAGE_POOL systems, but available there as well. No functional changes. Reported-by: kernel test robot Link: https://lore.kernel.org/oe-kbuild-all/202303020342.Wi2PRFFH-lkp@intel.com Signed-off-by: Alexander Lobakin Link: https://lore.kernel.org/r/20230313215553.1045175-3-aleksander.lobakin@intel.com Signed-off-by: Alexei Starovoitov --- include/linux/skbuff.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index fe661011644b..3f3a2a82a86b 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -5069,12 +5069,12 @@ static inline u64 skb_get_kcov_handle(struct sk_buff *skb) #endif } -#ifdef CONFIG_PAGE_POOL static inline void skb_mark_for_recycle(struct sk_buff *skb) { +#ifdef CONFIG_PAGE_POOL skb->pp_recycle = 1; -} #endif +} #endif /* __KERNEL__ */ #endif /* _LINUX_SKBUFF_H */ -- cgit v1.2.3 From 31bf1dbccfb0a9861d4846755096b3fff5687f8a Mon Sep 17 00:00:00 2001 From: Viktor Malik Date: Fri, 10 Mar 2023 08:40:59 +0100 Subject: bpf: Fix attaching fentry/fexit/fmod_ret/lsm to modules This resolves two problems with attachment of fentry/fexit/fmod_ret/lsm to functions located in modules: 1. The verifier tries to find the address to attach to in kallsyms. This is always done by searching the entire kallsyms, not respecting the module in which the function is located. Such approach causes an incorrect attachment address to be computed if the function to attach to is shadowed by a function of the same name located earlier in kallsyms. 2. If the address to attach to is located in a module, the module reference is only acquired in register_fentry. If the module is unloaded between the place where the address is found (bpf_check_attach_target in the verifier) and register_fentry, it is possible that another module is loaded to the same address which may lead to potential errors. Since the attachment must contain the BTF of the program to attach to, we extract the module from it and search for the function address in the correct module (resolving problem no. 1). Then, the module reference is taken directly in bpf_check_attach_target and stored in the bpf program (in bpf_prog_aux). The reference is only released when the program is unloaded (resolving problem no. 2). Signed-off-by: Viktor Malik Acked-by: Jiri Olsa Reviewed-by: Luis Chamberlain Link: https://lore.kernel.org/r/3f6a9d8ae850532b5ef864ef16327b0f7a669063.1678432753.git.vmalik@redhat.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 71cc92a4ba48..3ef98fb92987 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1103,6 +1103,7 @@ struct bpf_trampoline { struct bpf_attach_target_info { struct btf_func_model fmodel; long tgt_addr; + struct module *tgt_mod; const char *tgt_name; const struct btf_type *tgt_type; }; @@ -1406,6 +1407,7 @@ struct bpf_prog_aux { * main prog always has linfo_idx == 0 */ u32 linfo_idx; + struct module *mod; u32 num_exentries; struct exception_table_entry *extable; union { -- cgit v1.2.3 From c1fef618d611b31964ab397aa0bf0611da94bade Mon Sep 17 00:00:00 2001 From: Sandipan Patra Date: Mon, 13 Mar 2023 22:42:23 -0700 Subject: net/mlx5: Implement thermal zone Implement thermal zone support for mlx5 based HW. The NIC uses temperature sensor provided by ASIC to report current temperature to thermal core. Signed-off-by: Sandipan Patra Reviewed-by: Gal Pressman Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/r/20230314054234.267365-5-saeed@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/mlx5/driver.h | 3 +++ include/linux/mlx5/mlx5_ifc.h | 26 ++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index f33389b42209..7a898113b6b7 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -134,6 +134,7 @@ enum { MLX5_REG_PCAM = 0x507f, MLX5_REG_NODE_DESC = 0x6001, MLX5_REG_HOST_ENDIANNESS = 0x7004, + MLX5_REG_MTMP = 0x900A, MLX5_REG_MCIA = 0x9014, MLX5_REG_MFRL = 0x9028, MLX5_REG_MLCR = 0x902b, @@ -731,6 +732,7 @@ struct mlx5_fw_tracer; struct mlx5_vxlan; struct mlx5_geneve; struct mlx5_hv_vhca; +struct mlx5_thermal; #define MLX5_LOG_SW_ICM_BLOCK_SIZE(dev) (MLX5_CAP_DEV_MEM(dev, log_sw_icm_alloc_granularity)) #define MLX5_SW_ICM_BLOCK_SIZE(dev) (1 << MLX5_LOG_SW_ICM_BLOCK_SIZE(dev)) @@ -808,6 +810,7 @@ struct mlx5_core_dev { struct mlx5_rsc_dump *rsc_dump; u32 vsc_addr; struct mlx5_hv_vhca *hv_vhca; + struct mlx5_thermal *thermal; }; struct mlx5_db { diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 66d76e97a087..d2c164f0778c 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -10869,6 +10869,31 @@ struct mlx5_ifc_mrtc_reg_bits { u8 time_l[0x20]; }; +struct mlx5_ifc_mtmp_reg_bits { + u8 reserved_at_0[0x14]; + u8 sensor_index[0xc]; + + u8 reserved_at_20[0x10]; + u8 temperature[0x10]; + + u8 mte[0x1]; + u8 mtr[0x1]; + u8 reserved_at_42[0xe]; + u8 max_temperature[0x10]; + + u8 tee[0x2]; + u8 reserved_at_62[0xe]; + u8 temp_threshold_hi[0x10]; + + u8 reserved_at_80[0x10]; + u8 temp_threshold_lo[0x10]; + + u8 reserved_at_a0[0x20]; + + u8 sensor_name_hi[0x20]; + u8 sensor_name_lo[0x20]; +}; + union mlx5_ifc_ports_control_registers_document_bits { struct mlx5_ifc_bufferx_reg_bits bufferx_reg; struct mlx5_ifc_eth_2819_cntrs_grp_data_layout_bits eth_2819_cntrs_grp_data_layout; @@ -10931,6 +10956,7 @@ union mlx5_ifc_ports_control_registers_document_bits { struct mlx5_ifc_mfrl_reg_bits mfrl_reg; struct mlx5_ifc_mtutc_reg_bits mtutc_reg; struct mlx5_ifc_mrtc_reg_bits mrtc_reg; + struct mlx5_ifc_mtmp_reg_bits mtmp_reg; u8 reserved_at_0[0x60e0]; }; -- cgit v1.2.3 From 028522e2844393abc44f7bd7477eb4a455f01579 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Mon, 13 Mar 2023 22:42:29 -0700 Subject: net/mlx5: Move needed PTYS functions to core layer Downstream patches require devlink params to access the PTYS register, move the needed functions from mlx5e to the core layer. Signed-off-by: Gal Pressman Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/r/20230314054234.267365-11-saeed@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/mlx5/port.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h index e96ee1e348cb..98b2e1e149f9 100644 --- a/include/linux/mlx5/port.h +++ b/include/linux/mlx5/port.h @@ -141,6 +141,12 @@ enum mlx5_ptys_width { MLX5_PTYS_WIDTH_12X = 1 << 4, }; +struct mlx5_port_eth_proto { + u32 cap; + u32 admin; + u32 oper; +}; + #define MLX5E_PROT_MASK(link_mode) (1U << link_mode) #define MLX5_GET_ETH_PROTO(reg, out, ext, field) \ (ext ? MLX5_GET(reg, out, ext_##field) : \ @@ -218,4 +224,14 @@ int mlx5_set_trust_state(struct mlx5_core_dev *mdev, u8 trust_state); int mlx5_query_trust_state(struct mlx5_core_dev *mdev, u8 *trust_state); int mlx5_set_dscp2prio(struct mlx5_core_dev *mdev, u8 dscp, u8 prio); int mlx5_query_dscp2prio(struct mlx5_core_dev *mdev, u8 *dscp2prio); + +int mlx5_port_query_eth_proto(struct mlx5_core_dev *dev, u8 port, bool ext, + struct mlx5_port_eth_proto *eproto); +bool mlx5_ptys_ext_supported(struct mlx5_core_dev *mdev); +u32 mlx5_port_ptys2speed(struct mlx5_core_dev *mdev, u32 eth_proto_oper, + bool force_legacy); +u32 mlx5_port_speed2linkmodes(struct mlx5_core_dev *mdev, u32 speed, + bool force_legacy); +int mlx5_port_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed); + #endif /* __MLX5_PORT_H__ */ -- cgit v1.2.3 From 053fdaa841bd1af9fe9c2c30bba81119059aac95 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 14 Mar 2023 15:13:08 -0500 Subject: nfc: mrvl: Move platform_data struct into driver There are no users of nfcmrvl platform_data struct outside of the driver and none will be added, so move it into the driver. Signed-off-by: Rob Herring Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/linux/platform_data/nfcmrvl.h | 48 ----------------------------------- 1 file changed, 48 deletions(-) delete mode 100644 include/linux/platform_data/nfcmrvl.h (limited to 'include/linux') diff --git a/include/linux/platform_data/nfcmrvl.h b/include/linux/platform_data/nfcmrvl.h deleted file mode 100644 index 9e75ac8d19be..000000000000 --- a/include/linux/platform_data/nfcmrvl.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (C) 2015, Marvell International Ltd. - * - * This software file (the "File") is distributed by Marvell International - * Ltd. under the terms of the GNU General Public License Version 2, June 1991 - * (the "License"). You may use, redistribute and/or modify this File in - * accordance with the terms and conditions of the License, a copy of which - * is available on the worldwide web at - * http://www.gnu.org/licenses/old-licenses/gpl-2.0.txt. - * - * THE FILE IS DISTRIBUTED AS-IS, WITHOUT WARRANTY OF ANY KIND, AND THE - * IMPLIED WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE - * ARE EXPRESSLY DISCLAIMED. The License provides additional details about - * this warranty disclaimer. - */ - -#ifndef _NFCMRVL_PTF_H_ -#define _NFCMRVL_PTF_H_ - -struct nfcmrvl_platform_data { - /* - * Generic - */ - - /* GPIO that is wired to RESET_N signal */ - int reset_n_io; - /* Tell if transport is muxed in HCI one */ - unsigned int hci_muxed; - - /* - * UART specific - */ - - /* Tell if UART needs flow control at init */ - unsigned int flow_control; - /* Tell if firmware supports break control for power management */ - unsigned int break_control; - - - /* - * I2C specific - */ - - unsigned int irq; - unsigned int irq_polarity; -}; - -#endif /* _NFCMRVL_PTF_H_ */ -- cgit v1.2.3 From 8c44fa12c8fa09c6c12f0dc25129a6d13ee0a1ea Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 15 Mar 2023 15:11:45 +0200 Subject: net: Add MDB net device operations Add MDB net device operations that will be invoked by rtnetlink code in response to received RTM_{NEW,DEL,GET}MDB messages. Subsequent patches will implement these operations in the bridge and VXLAN drivers. Signed-off-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/netdevice.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ee483071cf59..23b0d7eaaadd 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1307,6 +1307,17 @@ struct netdev_net_notifier { * Used to add FDB entries to dump requests. Implementers should add * entries to skb and update idx with the number of entries. * + * int (*ndo_mdb_add)(struct net_device *dev, struct nlattr *tb[], + * u16 nlmsg_flags, struct netlink_ext_ack *extack); + * Adds an MDB entry to dev. + * int (*ndo_mdb_del)(struct net_device *dev, struct nlattr *tb[], + * struct netlink_ext_ack *extack); + * Deletes the MDB entry from dev. + * int (*ndo_mdb_dump)(struct net_device *dev, struct sk_buff *skb, + * struct netlink_callback *cb); + * Dumps MDB entries from dev. The first argument (marker) in the netlink + * callback is used by core rtnetlink code. + * * int (*ndo_bridge_setlink)(struct net_device *dev, struct nlmsghdr *nlh, * u16 flags, struct netlink_ext_ack *extack) * int (*ndo_bridge_getlink)(struct sk_buff *skb, u32 pid, u32 seq, @@ -1569,6 +1580,16 @@ struct net_device_ops { const unsigned char *addr, u16 vid, u32 portid, u32 seq, struct netlink_ext_ack *extack); + int (*ndo_mdb_add)(struct net_device *dev, + struct nlattr *tb[], + u16 nlmsg_flags, + struct netlink_ext_ack *extack); + int (*ndo_mdb_del)(struct net_device *dev, + struct nlattr *tb[], + struct netlink_ext_ack *extack); + int (*ndo_mdb_dump)(struct net_device *dev, + struct sk_buff *skb, + struct netlink_callback *cb); int (*ndo_bridge_setlink)(struct net_device *dev, struct nlmsghdr *nlh, u16 flags, -- cgit v1.2.3 From 33e972bdf0b0aa208b67164c64eef3c307e4b303 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 16 Mar 2023 15:31:56 +0000 Subject: ipv4: constify ip_mc_sf_allow() socket argument This clarifies ip_mc_sf_allow() intent. Signed-off-by: Eric Dumazet Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/linux/igmp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/igmp.h b/include/linux/igmp.h index b19d3284551f..ebf4349a53af 100644 --- a/include/linux/igmp.h +++ b/include/linux/igmp.h @@ -122,7 +122,7 @@ extern int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf, sockptr_t optval, sockptr_t optlen); extern int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, sockptr_t optval, size_t offset); -extern int ip_mc_sf_allow(struct sock *sk, __be32 local, __be32 rmt, +extern int ip_mc_sf_allow(const struct sock *sk, __be32 local, __be32 rmt, int dif, int sdif); extern void ip_mc_init_dev(struct in_device *); extern void ip_mc_destroy_dev(struct in_device *); -- cgit v1.2.3 From bd5314f8dd2d41330eecb60f0490c3fcfe1fc99d Mon Sep 17 00:00:00 2001 From: Viktor Malik Date: Fri, 17 Mar 2023 10:56:01 +0100 Subject: kallsyms, bpf: Move find_kallsyms_symbol_value out of internal header Moving find_kallsyms_symbol_value from kernel/module/internal.h to include/linux/module.h. The reason is that internal.h is not prepared to be included when CONFIG_MODULES=n. find_kallsyms_symbol_value is used by kernel/bpf/verifier.c and including internal.h from it (without modules) leads into a compilation error: In file included from ../include/linux/container_of.h:5, from ../include/linux/list.h:5, from ../include/linux/timer.h:5, from ../include/linux/workqueue.h:9, from ../include/linux/bpf.h:10, from ../include/linux/bpf-cgroup.h:5, from ../kernel/bpf/verifier.c:7: ../kernel/bpf/../module/internal.h: In function 'mod_find': ../include/linux/container_of.h:20:54: error: invalid use of undefined type 'struct module' 20 | static_assert(__same_type(*(ptr), ((type *)0)->member) || \ | ^~ [...] This patch fixes the above error. Fixes: 31bf1dbccfb0 ("bpf: Fix attaching fentry/fexit/fmod_ret/lsm to modules") Reported-by: kernel test robot Signed-off-by: Viktor Malik Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/oe-kbuild-all/202303161404.OrmfCy09-lkp@intel.com/ Link: https://lore.kernel.org/bpf/20230317095601.386738-1-vmalik@redhat.com --- include/linux/module.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 4435ad9439ab..41cfd3be57e5 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -616,6 +616,8 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, /* Look for this name: can be of form module:name. */ unsigned long module_kallsyms_lookup_name(const char *name); +unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name); + extern void __noreturn __module_put_and_kthread_exit(struct module *mod, long code); #define module_put_and_kthread_exit(code) __module_put_and_kthread_exit(THIS_MODULE, code) @@ -796,6 +798,12 @@ static inline unsigned long module_kallsyms_lookup_name(const char *name) return 0; } +static inline unsigned long find_kallsyms_symbol_value(struct module *mod, + const char *name) +{ + return 0; +} + static inline int register_module_notifier(struct notifier_block *nb) { /* no events will happen anyway, so this can always succeed */ -- cgit v1.2.3 From 6365ba64b4dbe8b59ddaeaa724b281f3787715d5 Mon Sep 17 00:00:00 2001 From: Jeremi Piotrowski Date: Wed, 8 Mar 2023 15:05:31 +0000 Subject: ptp: kvm: Use decrypted memory in confidential guest on x86 KVM_HC_CLOCK_PAIRING currently fails inside SEV-SNP guests because the guest passes an address to static data to the host. In confidential computing the host can't access arbitrary guest memory so handling the hypercall runs into an "rmpfault". To make the hypercall work, the guest needs to explicitly mark the memory as decrypted. Do that in kvm_arch_ptp_init(), but retain the previous behavior for non-confidential guests to save us from having to allocate memory. Add a new arch-specific function (kvm_arch_ptp_exit()) to free the allocation and mark the memory as encrypted again. Signed-off-by: Jeremi Piotrowski Link: https://lore.kernel.org/r/20230308150531.477741-1-jpiotrowski@linux.microsoft.com Signed-off-by: Jakub Kicinski --- include/linux/ptp_kvm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ptp_kvm.h b/include/linux/ptp_kvm.h index c2e28deef33a..746fd67c3480 100644 --- a/include/linux/ptp_kvm.h +++ b/include/linux/ptp_kvm.h @@ -14,6 +14,7 @@ struct timespec64; struct clocksource; int kvm_arch_ptp_init(void); +void kvm_arch_ptp_exit(void); int kvm_arch_ptp_get_clock(struct timespec64 *ts); int kvm_arch_ptp_get_crosststamp(u64 *cycle, struct timespec64 *tspec, struct clocksource **cs); -- cgit v1.2.3 From 36bd28c1cb0dbf48645cfe43159907fb3253b33a Mon Sep 17 00:00:00 2001 From: haozhe chang Date: Thu, 16 Mar 2023 17:58:20 +0800 Subject: wwan: core: Support slicing in port TX flow of WWAN subsystem wwan_port_fops_write inputs the SKB parameter to the TX callback of the WWAN device driver. However, the WWAN device (e.g., t7xx) may have an MTU less than the size of SKB, causing the TX buffer to be sliced and copied once more in the WWAN device driver. This patch implements the slicing in the WWAN subsystem and gives the WWAN devices driver the option to slice(by frag_len) or not. By doing so, the additional memory copy is reduced. Meanwhile, this patch gives WWAN devices driver the option to reserve headroom in fragments for the device-specific metadata. Signed-off-by: haozhe chang Reviewed-by: Loic Poulain Link: https://lore.kernel.org/r/20230316095826.181904-1-haozhe.chang@mediatek.com Signed-off-by: Jakub Kicinski --- include/linux/wwan.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/wwan.h b/include/linux/wwan.h index 24d76500b1cc..01fa15506286 100644 --- a/include/linux/wwan.h +++ b/include/linux/wwan.h @@ -64,11 +64,21 @@ struct wwan_port_ops { poll_table *wait); }; +/** struct wwan_port_caps - The WWAN port capbilities + * @frag_len: WWAN port TX fragments length + * @headroom_len: WWAN port TX fragments reserved headroom length + */ +struct wwan_port_caps { + size_t frag_len; + unsigned int headroom_len; +}; + /** * wwan_create_port - Add a new WWAN port * @parent: Device to use as parent and shared by all WWAN ports * @type: WWAN port type * @ops: WWAN port operations + * @caps: WWAN port capabilities * @drvdata: Pointer to caller driver data * * Allocate and register a new WWAN port. The port will be automatically exposed @@ -86,6 +96,7 @@ struct wwan_port_ops { struct wwan_port *wwan_create_port(struct device *parent, enum wwan_port_type type, const struct wwan_port_ops *ops, + struct wwan_port_caps *caps, void *drvdata); /** -- cgit v1.2.3 From 6ee44c518159c364e5d30ed85d357fe1d8e2c141 Mon Sep 17 00:00:00 2001 From: Gavin Li Date: Thu, 16 Mar 2023 09:07:58 +0200 Subject: net/mlx5e: TC, Add support for VxLAN GBP encap/decap flows offload Add HW offloading support for TC flows with VxLAN GBP encap/decap. Example of encap rule: tc filter add dev eth0 protocol ip ingress flower \ action tunnel_key set id 42 vxlan_opts 512 \ action mirred egress redirect dev vxlan1 Example of decap rule: tc filter add dev vxlan1 protocol ip ingress flower \ enc_key_id 42 enc_dst_port 4789 vxlan_opts 1024 \ action tunnel_key unset action mirred egress redirect dev eth0 Signed-off-by: Gavin Li Reviewed-by: Gavi Teitz Reviewed-by: Roi Dayan Reviewed-by: Maor Dickman Acked-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- include/linux/mlx5/device.h | 6 ++++++ include/linux/mlx5/mlx5_ifc.h | 13 +++++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 71b06ebad402..af4dd536a52c 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1357,6 +1357,12 @@ enum mlx5_qcam_feature_groups { #define MLX5_CAP_ESW_INGRESS_ACL_MAX(mdev, cap) \ MLX5_CAP_ESW_FLOWTABLE_MAX(mdev, flow_table_properties_esw_acl_ingress.cap) +#define MLX5_CAP_ESW_FT_FIELD_SUPPORT_2(mdev, cap) \ + MLX5_CAP_ESW_FLOWTABLE(mdev, ft_field_support_2_esw_fdb.cap) + +#define MLX5_CAP_ESW_FT_FIELD_SUPPORT_2_MAX(mdev, cap) \ + MLX5_CAP_ESW_FLOWTABLE_MAX(mdev, ft_field_support_2_esw_fdb.cap) + #define MLX5_CAP_ESW(mdev, cap) \ MLX5_GET(e_switch_cap, \ mdev->caps.hca[MLX5_CAP_ESWITCH]->cur, cap) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index d2c164f0778c..e47d6c58da35 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -404,10 +404,13 @@ struct mlx5_ifc_flow_table_fields_supported_bits { u8 metadata_reg_c_0[0x1]; }; +/* Table 2170 - Flow Table Fields Supported 2 Format */ struct mlx5_ifc_flow_table_fields_supported_2_bits { u8 reserved_at_0[0xe]; u8 bth_opcode[0x1]; - u8 reserved_at_f[0x11]; + u8 reserved_at_f[0x1]; + u8 tunnel_header_0_1[0x1]; + u8 reserved_at_11[0xf]; u8 reserved_at_20[0x60]; }; @@ -895,7 +898,13 @@ struct mlx5_ifc_flow_table_eswitch_cap_bits { struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_esw_acl_egress; - u8 reserved_at_800[0x1000]; + u8 reserved_at_800[0xC00]; + + struct mlx5_ifc_flow_table_fields_supported_2_bits ft_field_support_2_esw_fdb; + + struct mlx5_ifc_flow_table_fields_supported_2_bits ft_field_bitmask_support_2_esw_fdb; + + u8 reserved_at_1500[0x300]; u8 sw_steering_fdb_action_drop_icm_address_rx[0x40]; -- cgit v1.2.3 From 94c540fbfc80ef95ec398fc77da1920de2946edb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 17 Mar 2023 15:55:30 +0000 Subject: udp: preserve const qualifier in udp_sk() We can change udp_sk() to propagate const qualifier of its argument, thanks to container_of_const() This should avoid some potential errors caused by accidental (const -> not_const) promotion. Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/linux/udp.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/udp.h b/include/linux/udp.h index a2892e151644..43c1fb2d2c21 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -97,10 +97,7 @@ struct udp_sock { #define UDP_MAX_SEGMENTS (1 << 6UL) -static inline struct udp_sock *udp_sk(const struct sock *sk) -{ - return (struct udp_sock *)sk; -} +#define udp_sk(ptr) container_of_const(ptr, struct udp_sock, inet.sk) static inline void udp_set_no_check6_tx(struct sock *sk, bool val) { -- cgit v1.2.3 From 47fcae28b9ec409423ba7f67f93e8345acce8a36 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 17 Mar 2023 15:55:33 +0000 Subject: ipv6: raw: preserve const qualifier in raw6_sk() We can change raw6_sk() to propagate its argument const qualifier, thanks to container_of_const(). Signed-off-by: Eric Dumazet Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/linux/ipv6.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 37dfdcfcdd54..839247a4f48e 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -336,10 +336,7 @@ static inline struct ipv6_pinfo *inet6_sk(const struct sock *__sk) return sk_fullsock(__sk) ? inet_sk(__sk)->pinet6 : NULL; } -static inline struct raw6_sock *raw6_sk(const struct sock *sk) -{ - return (struct raw6_sock *)sk; -} +#define raw6_sk(ptr) container_of_const(ptr, struct raw6_sock, inet.sk) #define ipv6_only_sock(sk) (sk->sk_ipv6only) #define ipv6_sk_rxinfo(sk) ((sk)->sk_family == PF_INET6 && \ -- cgit v1.2.3 From ae6084b739925d84ef4a481d8eaa2187455f2e2f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 17 Mar 2023 15:55:34 +0000 Subject: dccp: preserve const qualifier in dccp_sk() We can change dccp_sk() to propagate its argument const qualifier, thanks to container_of_const(). Signed-off-by: Eric Dumazet Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/linux/dccp.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 07e547c02fd8..325af611909f 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -305,10 +305,8 @@ struct dccp_sock { struct timer_list dccps_xmit_timer; }; -static inline struct dccp_sock *dccp_sk(const struct sock *sk) -{ - return (struct dccp_sock *)sk; -} +#define dccp_sk(ptr) container_of_const(ptr, struct dccp_sock, \ + dccps_inet_connection.icsk_inet.sk) static inline const char *dccp_role(const struct sock *sk) { -- cgit v1.2.3 From e9d9da91548b21e189fcd0259a0f2d26d1afc509 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 17 Mar 2023 15:55:39 +0000 Subject: tcp: preserve const qualifier in tcp_sk() We can change tcp_sk() to propagate its argument const qualifier, thanks to container_of_const(). We have two places where a const sock pointer has to be upgraded to a write one. We have been using const qualifier for lockless listeners to clearly identify points where writes could happen. Add tcp_sk_rw() helper to better document these. tcp_inbound_md5_hash(), __tcp_grow_window(), tcp_reset_check() and tcp_rack_reo_wnd() get an additional const qualififer for their @tp local variables. smc_check_reset_syn_req() also needs a similar change. Signed-off-by: Eric Dumazet Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/linux/tcp.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index ca7f05a130d2..b4c08ac86983 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -472,10 +472,12 @@ enum tsq_flags { TCPF_MTU_REDUCED_DEFERRED = (1UL << TCP_MTU_REDUCED_DEFERRED), }; -static inline struct tcp_sock *tcp_sk(const struct sock *sk) -{ - return (struct tcp_sock *)sk; -} +#define tcp_sk(ptr) container_of_const(ptr, struct tcp_sock, inet_conn.icsk_inet.sk) + +/* Variant of tcp_sk() upgrading a const sock to a read/write tcp socket. + * Used in context of (lockless) tcp listeners. + */ +#define tcp_sk_rw(ptr) container_of(ptr, struct tcp_sock, inet_conn.icsk_inet.sk) struct tcp_timewait_sock { struct inet_timewait_sock tw_sk; -- cgit v1.2.3 From a69e332b4ef9f697a4558bab9c442a02b3659fcb Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 18 Mar 2023 21:32:41 +0100 Subject: net: phy: smsc: export functions for use by meson-gxl PHY driver The Amlogic Meson internal PHY's have the same register layout as certain SMSC PHY's (also for non-c22-standard registers). This seems to be more than just coincidence. Apparently they also need the same workaround for EDPD mode (energy detect power down). Therefore let's export SMSC PHY driver functionality for use by the meson-gxl PHY driver. Signed-off-by: Heiner Kallweit Signed-off-by: Chris Healy Signed-off-by: David S. Miller --- include/linux/smscphy.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/smscphy.h b/include/linux/smscphy.h index 1a136271ba6a..80f37c1dba58 100644 --- a/include/linux/smscphy.h +++ b/include/linux/smscphy.h @@ -28,4 +28,10 @@ #define MII_LAN83C185_MODE_POWERDOWN 0xC0 /* Power Down mode */ #define MII_LAN83C185_MODE_ALL 0xE0 /* All capable mode */ +int smsc_phy_config_intr(struct phy_device *phydev); +irqreturn_t smsc_phy_handle_interrupt(struct phy_device *phydev); +int smsc_phy_config_init(struct phy_device *phydev); +int lan87xx_read_status(struct phy_device *phydev); +int smsc_phy_probe(struct phy_device *phydev); + #endif /* __LINUX_SMSCPHY_H__ */ -- cgit v1.2.3 From 4765a9722e09765866e131ec31f7b9cf4c1f4854 Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Sun, 19 Mar 2023 12:57:50 +0000 Subject: net: pcs: add driver for MediaTek SGMII PCS The SGMII core found in several MediaTek SoCs is identical to what can also be found in MediaTek's MT7531 Ethernet switch IC. As this has not always been clear, both drivers developed different implementations to deal with the PCS. Recently Alexander Couzens pointed out this fact which lead to the development of this shared driver. Add a dedicated driver, mostly by copying the code now found in the Ethernet driver. The now redundant code will be removed by a follow-up commit. Suggested-by: Alexander Couzens Suggested-by: Russell King (Oracle) Signed-off-by: Daniel Golle Tested-by: Frank Wunderlich Reviewed-by: Russell King (Oracle) Signed-off-by: Jakub Kicinski --- include/linux/pcs/pcs-mtk-lynxi.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 include/linux/pcs/pcs-mtk-lynxi.h (limited to 'include/linux') diff --git a/include/linux/pcs/pcs-mtk-lynxi.h b/include/linux/pcs/pcs-mtk-lynxi.h new file mode 100644 index 000000000000..be3b4ab32f4a --- /dev/null +++ b/include/linux/pcs/pcs-mtk-lynxi.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_PCS_MTK_LYNXI_H +#define __LINUX_PCS_MTK_LYNXI_H + +#include +#include + +#define MTK_SGMII_FLAG_PN_SWAP BIT(0) +struct phylink_pcs *mtk_pcs_lynxi_create(struct device *dev, + struct regmap *regmap, + u32 ana_rgc3, u32 flags); +void mtk_pcs_lynxi_destroy(struct phylink_pcs *pcs); +#endif -- cgit v1.2.3 From 04aae213e719ec2bb310158c4025316ace50589b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 20 Mar 2023 18:41:13 -0700 Subject: net: skbuff: rename __pkt_vlan_present_offset to __mono_tc_offset vlan_present is gone since commit 354259fa73e2 ("net: remove skb->vlan_present") rename the offset field to what BPF is currently looking for in this byte - mono_delivery_time and tc_at_ingress. Signed-off-by: Jakub Kicinski Link: https://lore.kernel.org/r/20230321014115.997841-2-kuba@kernel.org Signed-off-by: Martin KaFai Lau --- include/linux/skbuff.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 3f3a2a82a86b..5a63878a4550 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -955,7 +955,7 @@ struct sk_buff { __u8 csum_valid:1; /* private: */ - __u8 __pkt_vlan_present_offset[0]; + __u8 __mono_tc_offset[0]; /* public: */ __u8 remcsum_offload:1; __u8 csum_complete_sw:1; @@ -1078,7 +1078,7 @@ struct sk_buff { #define TC_AT_INGRESS_MASK (1 << 7) #define SKB_MONO_DELIVERY_TIME_MASK (1 << 5) #endif -#define PKT_VLAN_PRESENT_OFFSET offsetof(struct sk_buff, __pkt_vlan_present_offset) +#define SKB_BF_MONO_TC_OFFSET offsetof(struct sk_buff, __mono_tc_offset) #ifdef __KERNEL__ /* -- cgit v1.2.3 From b94e032b7ad6318b36615f0e0cc3b0d61a5531e8 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 20 Mar 2023 18:41:14 -0700 Subject: net: skbuff: reorder bytes 2 and 3 of the bitfield BPF needs to know the offsets of fields it tries to access. Zero-length fields are added to make offsetof() work. This unfortunately partitions the bitfield (fields across the zero-length members can't be coalesced). Reorder bytes 2 and 3, BPF needs to know the offset of fields previously in byte 3 and some fields in byte 2 should really be optional. The two bytes are always in the same cacheline so it should not matter. Signed-off-by: Jakub Kicinski Link: https://lore.kernel.org/r/20230321014115.997841-3-kuba@kernel.org Signed-off-by: Martin KaFai Lau --- include/linux/skbuff.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5a63878a4550..36d31e74db37 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -944,16 +944,6 @@ struct sk_buff { __u8 ip_summed:2; __u8 ooo_okay:1; - __u8 l4_hash:1; - __u8 sw_hash:1; - __u8 wifi_acked_valid:1; - __u8 wifi_acked:1; - __u8 no_fcs:1; - /* Indicates the inner headers are valid in the skbuff. */ - __u8 encapsulation:1; - __u8 encap_hdr_csum:1; - __u8 csum_valid:1; - /* private: */ __u8 __mono_tc_offset[0]; /* public: */ @@ -966,6 +956,16 @@ struct sk_buff { __u8 tc_skip_classify:1; __u8 tc_at_ingress:1; /* See TC_AT_INGRESS_MASK */ #endif + + __u8 l4_hash:1; + __u8 sw_hash:1; + __u8 wifi_acked_valid:1; + __u8 wifi_acked:1; + __u8 no_fcs:1; + /* Indicates the inner headers are valid in the skbuff. */ + __u8 encapsulation:1; + __u8 encap_hdr_csum:1; + __u8 csum_valid:1; #ifdef CONFIG_IPV6_NDISC_NODETYPE __u8 ndisc_nodetype:2; #endif -- cgit v1.2.3 From c0ba861117c3e8deb03855d7dc5a7717958bbb18 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 20 Mar 2023 18:41:15 -0700 Subject: net: skbuff: move the fields BPF cares about directly next to the offset marker To avoid more possible BPF dependencies with moving bitfields around keep the fields BPF cares about right next to the offset marker. Signed-off-by: Jakub Kicinski Link: https://lore.kernel.org/r/20230321014115.997841-4-kuba@kernel.org Signed-off-by: Martin KaFai Lau --- include/linux/skbuff.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 36d31e74db37..6aeb0e7b9511 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -947,15 +947,15 @@ struct sk_buff { /* private: */ __u8 __mono_tc_offset[0]; /* public: */ - __u8 remcsum_offload:1; - __u8 csum_complete_sw:1; - __u8 csum_level:2; - __u8 dst_pending_confirm:1; __u8 mono_delivery_time:1; /* See SKB_MONO_DELIVERY_TIME_MASK */ #ifdef CONFIG_NET_CLS_ACT - __u8 tc_skip_classify:1; __u8 tc_at_ingress:1; /* See TC_AT_INGRESS_MASK */ + __u8 tc_skip_classify:1; #endif + __u8 remcsum_offload:1; + __u8 csum_complete_sw:1; + __u8 csum_level:2; + __u8 dst_pending_confirm:1; __u8 l4_hash:1; __u8 sw_hash:1; @@ -1072,11 +1072,11 @@ struct sk_buff { * around, you also must adapt these constants. */ #ifdef __BIG_ENDIAN_BITFIELD -#define TC_AT_INGRESS_MASK (1 << 0) -#define SKB_MONO_DELIVERY_TIME_MASK (1 << 2) +#define SKB_MONO_DELIVERY_TIME_MASK (1 << 7) +#define TC_AT_INGRESS_MASK (1 << 6) #else -#define TC_AT_INGRESS_MASK (1 << 7) -#define SKB_MONO_DELIVERY_TIME_MASK (1 << 5) +#define SKB_MONO_DELIVERY_TIME_MASK (1 << 0) +#define TC_AT_INGRESS_MASK (1 << 1) #endif #define SKB_BF_MONO_TC_OFFSET offsetof(struct sk_buff, __mono_tc_offset) -- cgit v1.2.3 From fe602c87df1b6927562f4ee61edd851bb9578a49 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 21 Mar 2023 04:01:15 +0000 Subject: net: remove rcu_dereference_bh_rtnl() This helper is no longer used in the tree. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/rtnetlink.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 92ad75549e9c..f0c87baaf6c0 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -61,16 +61,6 @@ static inline bool lockdep_rtnl_is_held(void) #define rcu_dereference_rtnl(p) \ rcu_dereference_check(p, lockdep_rtnl_is_held()) -/** - * rcu_dereference_bh_rtnl - rcu_dereference_bh with debug checking - * @p: The pointer to read, prior to dereference - * - * Do an rcu_dereference_bh(p), but check caller either holds rcu_read_lock_bh() - * or RTNL. Note : Please prefer rtnl_dereference() or rcu_dereference_bh() - */ -#define rcu_dereference_bh_rtnl(p) \ - rcu_dereference_bh_check(p, lockdep_rtnl_is_held()) - /** * rtnl_dereference - fetch RCU pointer when updates are prevented by RTNL * @p: The pointer to read, prior to dereferencing -- cgit v1.2.3 From d7ba4cc900bf1eea2d8c807c6b1fc6bd61f41237 Mon Sep 17 00:00:00 2001 From: JP Kobryn Date: Wed, 22 Mar 2023 12:47:54 -0700 Subject: bpf: return long from bpf_map_ops funcs This patch changes the return types of bpf_map_ops functions to long, where previously int was returned. Using long allows for bpf programs to maintain the sign bit in the absence of sign extension during situations where inlined bpf helper funcs make calls to the bpf_map_ops funcs and a negative error is returned. The definitions of the helper funcs are generated from comments in the bpf uapi header at `include/uapi/linux/bpf.h`. The return type of these helpers was previously changed from int to long in commit bdb7b79b4ce8. For any case where one of the map helpers call the bpf_map_ops funcs that are still returning 32-bit int, a compiler might not include sign extension instructions to properly convert the 32-bit negative value a 64-bit negative value. For example: bpf assembly excerpt of an inlined helper calling a kernel function and checking for a specific error: ; err = bpf_map_update_elem(&mymap, &key, &val, BPF_NOEXIST); ... 46: call 0xffffffffe103291c ; htab_map_update_elem ; if (err && err != -EEXIST) { 4b: cmp $0xffffffffffffffef,%rax ; cmp -EEXIST,%rax kernel function assembly excerpt of return value from `htab_map_update_elem` returning 32-bit int: movl $0xffffffef, %r9d ... movl %r9d, %eax ...results in the comparison: cmp $0xffffffffffffffef, $0x00000000ffffffef Fixes: bdb7b79b4ce8 ("bpf: Switch most helper return values from 32-bit int to 64-bit long") Tested-by: Eduard Zingerman Signed-off-by: JP Kobryn Link: https://lore.kernel.org/r/20230322194754.185781-3-inwardvessel@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 14 +++++++------- include/linux/filter.h | 6 +++--- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3ef98fb92987..ec0df059f562 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -96,11 +96,11 @@ struct bpf_map_ops { /* funcs callable from userspace and from eBPF programs */ void *(*map_lookup_elem)(struct bpf_map *map, void *key); - int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags); - int (*map_delete_elem)(struct bpf_map *map, void *key); - int (*map_push_elem)(struct bpf_map *map, void *value, u64 flags); - int (*map_pop_elem)(struct bpf_map *map, void *value); - int (*map_peek_elem)(struct bpf_map *map, void *value); + long (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags); + long (*map_delete_elem)(struct bpf_map *map, void *key); + long (*map_push_elem)(struct bpf_map *map, void *value, u64 flags); + long (*map_pop_elem)(struct bpf_map *map, void *value); + long (*map_peek_elem)(struct bpf_map *map, void *value); void *(*map_lookup_percpu_elem)(struct bpf_map *map, void *key, u32 cpu); /* funcs called by prog_array and perf_event_array map */ @@ -139,7 +139,7 @@ struct bpf_map_ops { struct bpf_local_storage __rcu ** (*map_owner_storage_ptr)(void *owner); /* Misc helpers.*/ - int (*map_redirect)(struct bpf_map *map, u64 key, u64 flags); + long (*map_redirect)(struct bpf_map *map, u64 key, u64 flags); /* map_meta_equal must be implemented for maps that can be * used as an inner map. It is a runtime check to ensure @@ -157,7 +157,7 @@ struct bpf_map_ops { int (*map_set_for_each_callback_args)(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee); - int (*map_for_each_callback)(struct bpf_map *map, + long (*map_for_each_callback)(struct bpf_map *map, bpf_callback_t callback_fn, void *callback_ctx, u64 flags); diff --git a/include/linux/filter.h b/include/linux/filter.h index efa5d4a1677e..23c08c31bea9 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1504,9 +1504,9 @@ static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol, } #endif /* IS_ENABLED(CONFIG_IPV6) */ -static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u64 index, - u64 flags, const u64 flag_mask, - void *lookup_elem(struct bpf_map *map, u32 key)) +static __always_inline long __bpf_xdp_redirect_map(struct bpf_map *map, u64 index, + u64 flags, const u64 flag_mask, + void *lookup_elem(struct bpf_map *map, u32 key)) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); const u64 action_mask = XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX; -- cgit v1.2.3 From 1cc6571f562774f1d928dc8b3cff50829b86e970 Mon Sep 17 00:00:00 2001 From: Nick Child Date: Tue, 21 Mar 2023 10:07:25 -0500 Subject: netdev: Enforce index cap in netdev_get_tx_queue When requesting a TX queue at a given index, warn on out-of-bounds referencing if the index is greater than the allocated number of queues. Specifically, since this function is used heavily in the networking stack use DEBUG_NET_WARN_ON_ONCE to avoid executing a new branch on every packet. Signed-off-by: Nick Child Link: https://lore.kernel.org/r/20230321150725.127229-2-nnac123@linux.ibm.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7621c512765f..674ee5daa7b1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2484,6 +2484,7 @@ static inline struct netdev_queue *netdev_get_tx_queue(const struct net_device *dev, unsigned int index) { + DEBUG_NET_WARN_ON_ONCE(index >= dev->num_tx_queues); return &dev->_tx[index]; } -- cgit v1.2.3 From 4ee9b0dcf09f426fbad7ed132d73ea2ba379d8ee Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 21 Mar 2023 15:58:54 +0000 Subject: net: phylink: remove an_enabled The Autoneg bit in the advertising bitmap and state->an_enabled are always identical. state->an_enabled is now no longer used by any drivers, so lets kill this duplication. Signed-off-by: Russell King (Oracle) Reviewed-by: Simon Horman Signed-off-by: Jakub Kicinski --- include/linux/phylink.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index c492c26202b5..9ff56b050584 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -93,7 +93,6 @@ static inline bool phylink_autoneg_inband(unsigned int mode) * the medium link mode (@speed and @duplex) and the speed/duplex of the phy * interface mode (@interface) are different. * @link: true if the link is up. - * @an_enabled: true if autonegotiation is enabled/desired. * @an_complete: true if autonegotiation has completed. */ struct phylink_link_state { @@ -105,7 +104,6 @@ struct phylink_link_state { int pause; int rate_matching; unsigned int link:1; - unsigned int an_enabled:1; unsigned int an_complete:1; }; -- cgit v1.2.3 From b671c2067a04c0668df174ff5dfdb573d1f9b074 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Wed, 22 Mar 2023 20:23:58 -0700 Subject: bpf: Retire the struct_ops map kvalue->refcnt. We have replaced kvalue-refcnt with synchronize_rcu() to wait for an RCU grace period. Maintenance of kvalue->refcnt was a complicated task, as we had to simultaneously keep track of two reference counts: one for the reference count of bpf_map. When the kvalue->refcnt reaches zero, we also have to reduce the reference count on bpf_map - yet these steps are not performed in an atomic manner and require us to be vigilant when managing them. By eliminating kvalue->refcnt, we can make our maintenance more straightforward as the refcount of bpf_map is now solely managed! To prevent the trampoline image of a struct_ops from being released while it is still in use, we wait for an RCU grace period. The setsockopt(TCP_CONGESTION, "...") command allows you to change your socket's congestion control algorithm and can result in releasing the old struct_ops implementation. It is fine. However, this function is exposed through bpf_setsockopt(), it may be accessed by BPF programs as well. To ensure that the trampoline image belonging to struct_op can be safely called while its method is in use, the trampoline safeguarde the BPF program with rcu_read_lock(). Doing so prevents any destruction of the associated images before returning from a trampoline and requires us to wait for an RCU grace period. Signed-off-by: Kui-Feng Lee Link: https://lore.kernel.org/r/20230323032405.3735486-2-kuifeng@meta.com Signed-off-by: Martin KaFai Lau --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ec0df059f562..f04098468d7a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1945,6 +1945,7 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd); struct bpf_map *__bpf_map_get(struct fd f); void bpf_map_inc(struct bpf_map *map); void bpf_map_inc_with_uref(struct bpf_map *map); +struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref); struct bpf_map * __must_check bpf_map_inc_not_zero(struct bpf_map *map); void bpf_map_put_with_uref(struct bpf_map *map); void bpf_map_put(struct bpf_map *map); -- cgit v1.2.3 From 68b04864ca425d1894c96b8141d4fba1181f11cb Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Wed, 22 Mar 2023 20:24:00 -0700 Subject: bpf: Create links for BPF struct_ops maps. Make bpf_link support struct_ops. Previously, struct_ops were always used alone without any associated links. Upon updating its value, a struct_ops would be activated automatically. Yet other BPF program types required to make a bpf_link with their instances before they could become active. Now, however, you can create an inactive struct_ops, and create a link to activate it later. With bpf_links, struct_ops has a behavior similar to other BPF program types. You can pin/unpin them from their links and the struct_ops will be deactivated when its link is removed while previously need someone to delete the value for it to be deactivated. bpf_links are responsible for registering their associated struct_ops. You can only use a struct_ops that has the BPF_F_LINK flag set to create a bpf_link, while a structs without this flag behaves in the same manner as before and is registered upon updating its value. The BPF_LINK_TYPE_STRUCT_OPS serves a dual purpose. Not only is it used to craft the links for BPF struct_ops programs, but also to create links for BPF struct_ops them-self. Since the links of BPF struct_ops programs are only used to create trampolines internally, they are never seen in other contexts. Thus, they can be reused for struct_ops themself. To maintain a reference to the map supporting this link, we add bpf_struct_ops_link as an additional type. The pointer of the map is RCU and won't be necessary until later in the patchset. Signed-off-by: Kui-Feng Lee Link: https://lore.kernel.org/r/20230323032405.3735486-4-kuifeng@meta.com Signed-off-by: Martin KaFai Lau --- include/linux/bpf.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f04098468d7a..8552279efe46 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1518,6 +1518,7 @@ struct bpf_struct_ops { void *kdata, const void *udata); int (*reg)(void *kdata); void (*unreg)(void *kdata); + int (*validate)(void *kdata); const struct btf_type *type; const struct btf_type *value_type; const char *name; @@ -1552,6 +1553,7 @@ static inline void bpf_module_put(const void *data, struct module *owner) else module_put(owner); } +int bpf_struct_ops_link_create(union bpf_attr *attr); #ifdef CONFIG_NET /* Define it here to avoid the use of forward declaration */ @@ -1592,6 +1594,11 @@ static inline int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, { return -EINVAL; } +static inline int bpf_struct_ops_link_create(union bpf_attr *attr) +{ + return -EOPNOTSUPP; +} + #endif #if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_LSM) -- cgit v1.2.3 From aef56f2e918bf8fc8de25f0b36e8c2aba44116ec Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Wed, 22 Mar 2023 20:24:02 -0700 Subject: bpf: Update the struct_ops of a bpf_link. By improving the BPF_LINK_UPDATE command of bpf(), it should allow you to conveniently switch between different struct_ops on a single bpf_link. This would enable smoother transitions from one struct_ops to another. The struct_ops maps passing along with BPF_LINK_UPDATE should have the BPF_F_LINK flag. Signed-off-by: Kui-Feng Lee Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20230323032405.3735486-6-kuifeng@meta.com Signed-off-by: Martin KaFai Lau --- include/linux/bpf.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8552279efe46..2d8f3f639e68 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1476,6 +1476,8 @@ struct bpf_link_ops { void (*show_fdinfo)(const struct bpf_link *link, struct seq_file *seq); int (*fill_link_info)(const struct bpf_link *link, struct bpf_link_info *info); + int (*update_map)(struct bpf_link *link, struct bpf_map *new_map, + struct bpf_map *old_map); }; struct bpf_tramp_link { @@ -1518,6 +1520,7 @@ struct bpf_struct_ops { void *kdata, const void *udata); int (*reg)(void *kdata); void (*unreg)(void *kdata); + int (*update)(void *kdata, void *old_kdata); int (*validate)(void *kdata); const struct btf_type *type; const struct btf_type *value_type; -- cgit v1.2.3 From 3eb8eea2a453463f5606ce3e46cf225f88671440 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 21 Mar 2023 22:38:48 -0700 Subject: docs: networking: document NAPI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add basic documentation about NAPI. We can stop linking to the ancient doc on the LF wiki. Link: https://lore.kernel.org/all/20230315223044.471002-1-kuba@kernel.org/ Reviewed-by: Bagas Sanjaya Reviewed-by: Toke Høiland-Jørgensen Acked-by: Pavel Pisa # for ctucanfd-driver.rst Reviewed-by: Tony Nguyen Reviewed-by: Florian Fainelli Reviewed-by: Stephen Hemminger Reviewed-by: Randy Dunlap Link: https://lore.kernel.org/r/20230322053848.198452-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 674ee5daa7b1..18a5be6ddd0f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -509,15 +509,18 @@ static inline bool napi_reschedule(struct napi_struct *napi) return false; } -bool napi_complete_done(struct napi_struct *n, int work_done); /** - * napi_complete - NAPI processing complete - * @n: NAPI context + * napi_complete_done - NAPI processing complete + * @n: NAPI context + * @work_done: number of packets processed * - * Mark NAPI processing as complete. - * Consider using napi_complete_done() instead. + * Mark NAPI processing as complete. Should only be called if poll budget + * has not been completely consumed. + * Prefer over napi_complete(). * Return false if device should avoid rearming interrupts. */ +bool napi_complete_done(struct napi_struct *n, int work_done); + static inline bool napi_complete(struct napi_struct *n) { return napi_complete_done(n, 0); -- cgit v1.2.3 From 9821d8d4628e630ab56f47a8e6b878a2576e069b Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Tue, 14 Feb 2023 09:29:46 +0200 Subject: lib: cpu_rmap: Use allocator for rmap entries Use a proper allocator for rmap entries using a naive for loop. The allocator relies on whether an entry is NULL to be considered free. Remove the used field of rmap which is not needed. Also, avoid crashing the kernel if an entry is not available. Cc: Thomas Gleixner Signed-off-by: Eli Cohen Signed-off-by: Saeed Mahameed Reviewed-by: Jacob Keller --- include/linux/cpu_rmap.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpu_rmap.h b/include/linux/cpu_rmap.h index be8aea04d023..0ec745e6cd36 100644 --- a/include/linux/cpu_rmap.h +++ b/include/linux/cpu_rmap.h @@ -16,14 +16,13 @@ * struct cpu_rmap - CPU affinity reverse-map * @refcount: kref for object * @size: Number of objects to be reverse-mapped - * @used: Number of objects added * @obj: Pointer to array of object pointers * @near: For each CPU, the index and distance to the nearest object, * based on affinity masks */ struct cpu_rmap { struct kref refcount; - u16 size, used; + u16 size; void **obj; struct { u16 index; -- cgit v1.2.3 From 71f0a2478605c100358a9f9e174849fa643bf8a7 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Tue, 14 Feb 2023 11:05:46 +0200 Subject: lib: cpu_rmap: Add irq_cpu_rmap_remove to complement irq_cpu_rmap_add Add a function to complement irq_cpu_rmap_add(). It removes the irq from the reverse mapping by setting the notifier to NULL. The function calls irq_set_affinity_notifier() with NULL at the notify argument which then cancel any pending notifier work and decrement reference on the notifier. When ref count reaches zero, the glue pointer is kfree and the rmap entry is set to NULL serving both to avoid second attempt to release it and also making the rmap entry available for subsequent mapping. It should be noted the drivers usually creates the reverse mapping at initialization time and remove it at unload time so we do not expect failures in allocating rmap due to kref holding the glue entry. Cc: Thomas Gleixner Signed-off-by: Eli Cohen Signed-off-by: Saeed Mahameed Reviewed-by: Jacob Keller --- include/linux/cpu_rmap.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cpu_rmap.h b/include/linux/cpu_rmap.h index 0ec745e6cd36..cae324d10965 100644 --- a/include/linux/cpu_rmap.h +++ b/include/linux/cpu_rmap.h @@ -60,6 +60,7 @@ static inline struct cpu_rmap *alloc_irq_cpu_rmap(unsigned int size) } extern void free_irq_cpu_rmap(struct cpu_rmap *rmap); +int irq_cpu_rmap_remove(struct cpu_rmap *rmap, int irq); extern int irq_cpu_rmap_add(struct cpu_rmap *rmap, int irq); #endif /* __LINUX_CPU_RMAP_H */ -- cgit v1.2.3 From fb0a6a268dcd6fe144c99d60a1166e34c6991d5f Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 5 Jan 2023 11:31:46 +0200 Subject: net/mlx5: Provide external API for allocating vectors Provide external API to be used by other drivers relying on mlx5_core, for allocating MSIX vectors. An example for such a driver would be mlx5_vdpa. Signed-off-by: Eli Cohen Reviewed-by: Shay Drory Signed-off-by: Saeed Mahameed Reviewed-by: Jacob Keller --- include/linux/mlx5/driver.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index f33389b42209..df0f82110249 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1308,4 +1308,10 @@ enum { MLX5_OCTWORD = 16, }; +struct msi_map mlx5_msix_alloc(struct mlx5_core_dev *dev, + irqreturn_t (*handler)(int, void *), + const struct irq_affinity_desc *affdesc, + const char *name); +void mlx5_msix_free(struct mlx5_core_dev *dev, struct msi_map map); + #endif /* MLX5_DRIVER_H */ -- cgit v1.2.3 From e65a5c6edbc6ca4853e6076bd81db1a410592a09 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 22 Mar 2023 14:52:42 -0700 Subject: bpf: Add a few bpf mem allocator functions This patch adds a few bpf mem allocator functions which will be used in the bpf_local_storage in a later patch. bpf_mem_cache_alloc_flags(..., gfp_t flags) is added. When the flags == GFP_KERNEL, it will fallback to __alloc(..., GFP_KERNEL). bpf_local_storage knows its running context is sleepable (GFP_KERNEL) and provides a better guarantee on memory allocation. bpf_local_storage has some uncommon cases that its selem cannot be reused immediately. It handles its own rcu_head and goes through a rcu_trace gp and then free it. bpf_mem_cache_raw_free() is added for direct free purpose without leaking the LLIST_NODE_SZ internal knowledge. During free time, the 'struct bpf_mem_alloc *ma' is no longer available. However, the caller should know if it is percpu memory or not and it can call different raw_free functions. bpf_local_storage does not support percpu value, so only the non-percpu 'bpf_mem_cache_raw_free()' is added in this patch. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20230322215246.1675516-2-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_mem_alloc.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h index a7104af61ab4..3929be5743f4 100644 --- a/include/linux/bpf_mem_alloc.h +++ b/include/linux/bpf_mem_alloc.h @@ -31,5 +31,7 @@ void bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr); /* kmem_cache_alloc/free equivalent: */ void *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma); void bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr); +void bpf_mem_cache_raw_free(void *ptr); +void *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags); #endif /* _BPF_MEM_ALLOC_H */ -- cgit v1.2.3 From 08a7ce384e33e53e0732c500a8af67a73f8fceca Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 22 Mar 2023 14:52:43 -0700 Subject: bpf: Use bpf_mem_cache_alloc/free in bpf_local_storage_elem This patch uses bpf_mem_alloc for the task and cgroup local storage that the bpf prog can easily get a hold of the storage owner's PTR_TO_BTF_ID. eg. bpf_get_current_task_btf() can be used in some of the kmalloc code path which will cause deadlock/recursion. bpf_mem_cache_alloc is deadlock free and will solve a legit use case in [1]. For sk storage, its batch creation benchmark shows a few percent regression when the sk create/destroy batch size is larger than 32. The sk creation/destruction happens much more often and depends on external traffic. Considering it is hypothetical to be able to cause deadlock with sk storage, it can cross the bridge to use bpf_mem_alloc till a legit (ie. useful) use case comes up. For inode storage, bpf_local_storage_destroy() is called before waiting for a rcu gp and its memory cannot be reused immediately. inode stays with kmalloc/kfree after the rcu [or tasks_trace] gp. A 'bool bpf_ma' argument is added to bpf_local_storage_map_alloc(). Only task and cgroup storage have 'bpf_ma == true' which means to use bpf_mem_cache_alloc/free(). This patch only changes selem to use bpf_mem_alloc for task and cgroup. The next patch will change the local_storage to use bpf_mem_alloc also for task and cgroup. Here is some more details on the changes: * memory allocation: After bpf_mem_cache_alloc(), the SDATA(selem)->data is zero-ed because bpf_mem_cache_alloc() could return a reused selem. It is to keep the existing bpf_map_kzalloc() behavior. Only SDATA(selem)->data is zero-ed. SDATA(selem)->data is the visible part to the bpf prog. No need to use zero_map_value() to do the zeroing because bpf_selem_free(..., reuse_now = true) ensures no bpf prog is using the selem before returning the selem through bpf_mem_cache_free(). For the internal fields of selem, they will be initialized when linking to the new smap and the new local_storage. When 'bpf_ma == false', nothing changes in this patch. It will stay with the bpf_map_kzalloc(). * memory free: The bpf_selem_free() and bpf_selem_free_rcu() are modified to handle the bpf_ma == true case. For the common selem free path where its owner is also being destroyed, the mem is freed in bpf_local_storage_destroy(), the owner (task and cgroup) has gone through a rcu gp. The memory can be reused immediately, so bpf_local_storage_destroy() will call bpf_selem_free(..., reuse_now = true) which will do bpf_mem_cache_free() for immediate reuse consideration. An exception is the delete elem code path. The delete elem code path is called from the helper bpf_*_storage_delete() and the syscall bpf_map_delete_elem(). This path is an unusual case for local storage because the common use case is to have the local storage staying with its owner life time so that the bpf prog and the user space does not have to monitor the owner's destruction. For the delete elem path, the selem cannot be reused immediately because there could be bpf prog using it. It will call bpf_selem_free(..., reuse_now = false) and it will wait for a rcu tasks trace gp before freeing the elem. The rcu callback is changed to do bpf_mem_cache_raw_free() instead of kfree(). When 'bpf_ma == false', it should be the same as before. __bpf_selem_free() is added to do the kfree_rcu and call_tasks_trace_rcu(). A few words on the 'reuse_now == true'. When 'reuse_now == true', it is still racing with bpf_local_storage_map_free which is under rcu protection, so it still needs to wait for a rcu gp instead of kfree(). Otherwise, the selem may be reused by slab for a totally different struct while the bpf_local_storage_map_free() is still using it (as a rcu reader). For the inode case, there may be other rcu readers also. In short, when bpf_ma == false and reuse_now == true => vanilla rcu. [1]: https://lore.kernel.org/bpf/20221118190109.1512674-1-namhyung@kernel.org/ Cc: Namhyung Kim Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20230322215246.1675516-3-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_local_storage.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index a34f61467a2f..30efbcab2798 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #define BPF_LOCAL_STORAGE_CACHE_SIZE 16 @@ -55,6 +56,8 @@ struct bpf_local_storage_map { u32 bucket_log; u16 elem_size; u16 cache_idx; + struct bpf_mem_alloc selem_ma; + bool bpf_ma; }; struct bpf_local_storage_data { @@ -122,7 +125,8 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr); struct bpf_map * bpf_local_storage_map_alloc(union bpf_attr *attr, - struct bpf_local_storage_cache *cache); + struct bpf_local_storage_cache *cache, + bool bpf_ma); struct bpf_local_storage_data * bpf_local_storage_lookup(struct bpf_local_storage *local_storage, -- cgit v1.2.3 From 6ae9d5e99e1dd26babdd9502759fa25a3fd348ad Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 22 Mar 2023 14:52:44 -0700 Subject: bpf: Use bpf_mem_cache_alloc/free for bpf_local_storage This patch uses bpf_mem_cache_alloc/free for allocating and freeing bpf_local_storage for task and cgroup storage. The changes are similar to the previous patch. A few things that worth to mention for bpf_local_storage: The local_storage is freed when the last selem is deleted. Before deleting a selem from local_storage, it needs to retrieve the local_storage->smap because the bpf_selem_unlink_storage_nolock() may have set it to NULL. Note that local_storage->smap may have already been NULL when the selem created this local_storage has been removed. In this case, call_rcu will be used to free the local_storage. Also, the bpf_ma (true or false) value is needed before calling bpf_local_storage_free(). The bpf_ma can either be obtained from the local_storage->smap (if available) or any of its selem's smap. A new helper check_storage_bpf_ma() is added to obtain bpf_ma for a deleting bpf_local_storage. When bpf_local_storage_alloc getting a reused memory, all fields are either in the correct values or will be initialized. 'cache[]' must already be all NULLs. 'list' must be empty. Others will be initialized. Cc: Namhyung Kim Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20230322215246.1675516-4-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_local_storage.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 30efbcab2798..173ec7f43ed1 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -57,6 +57,7 @@ struct bpf_local_storage_map { u16 elem_size; u16 cache_idx; struct bpf_mem_alloc selem_ma; + struct bpf_mem_alloc storage_ma; bool bpf_ma; }; -- cgit v1.2.3 From 3948b05950fdd64002a5f182c65ba5cf2d53cf71 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 23 Mar 2023 09:28:42 -0700 Subject: net: introduce a config option to tweak MAX_SKB_FRAGS Currently, MAX_SKB_FRAGS value is 17. For standard tcp sendmsg() traffic, no big deal because tcp_sendmsg() attempts order-3 allocations, stuffing 32768 bytes per frag. But with zero copy, we use order-0 pages. For BIG TCP to show its full potential, we add a config option to be able to fit up to 45 segments per skb. This is also needed for BIG TCP rx zerocopy, as zerocopy currently does not support skbs with frag list. We have used MAX_SKB_FRAGS=45 value for years at Google before we deployed 4K MTU, with no adverse effect, other than a recent issue in mlx4, fixed in commit 26782aad00cc ("net/mlx4: MLX4_TX_BOUNCE_BUFFER_SIZE depends on MAX_SKB_FRAGS") Back then, goal was to be able to receive full size (64KB) GRO packets without the frag_list overhead. Note that /proc/sys/net/core/max_skb_frags can also be used to limit the number of fragments TCP can use in tx packets. By default we keep the old/legacy value of 17 until we get more coverage for the updated values. Sizes of struct skb_shared_info on 64bit arches MAX_SKB_FRAGS | sizeof(struct skb_shared_info): ============================================== 17 320 21 320+64 = 384 25 320+128 = 448 29 320+192 = 512 33 320+256 = 576 37 320+320 = 640 41 320+384 = 704 45 320+448 = 768 This inflation might cause problems for drivers assuming they could pack both the incoming packet (for MTU=1500) and skb_shared_info in half a page, using build_skb(). v3: fix build error when CONFIG_NET=n v2: fix two build errors assuming MAX_SKB_FRAGS was "unsigned long" Signed-off-by: Eric Dumazet Reviewed-by: Nikolay Aleksandrov Reviewed-by: Jason Xing Link: https://lore.kernel.org/r/20230323162842.1935061-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index fe661011644b..82511b2f61ea 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -345,18 +345,12 @@ struct sk_buff_head { struct sk_buff; -/* To allow 64K frame to be packed as single skb without frag_list we - * require 64K/PAGE_SIZE pages plus 1 additional page to allow for - * buffers which do not start on a page boundary. - * - * Since GRO uses frags we allocate at least 16 regardless of page - * size. - */ -#if (65536/PAGE_SIZE + 1) < 16 -#define MAX_SKB_FRAGS 16UL -#else -#define MAX_SKB_FRAGS (65536/PAGE_SIZE + 1) +#ifndef CONFIG_MAX_SKB_FRAGS +# define CONFIG_MAX_SKB_FRAGS 17 #endif + +#define MAX_SKB_FRAGS CONFIG_MAX_SKB_FRAGS + extern int sysctl_max_skb_frags; /* Set skb_shinfo(skb)->gso_size to this in case you want skb_segment to -- cgit v1.2.3 From 3e4d5ba9a3f85f21f1ebdee5a5901bb43389abc5 Mon Sep 17 00:00:00 2001 From: Shay Agroskin Date: Thu, 23 Mar 2023 18:36:04 +0200 Subject: netlink: Add a macro to set policy message with format string Similar to NL_SET_ERR_MSG_FMT, add a macro which sets netlink policy error message with a format string. Signed-off-by: Shay Agroskin Signed-off-by: Jakub Kicinski --- include/linux/netlink.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 3e8743252167..19c0791ed9d5 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -161,9 +161,31 @@ struct netlink_ext_ack { } \ } while (0) +#define NL_SET_ERR_MSG_ATTR_POL_FMT(extack, attr, pol, fmt, args...) do { \ + struct netlink_ext_ack *__extack = (extack); \ + \ + if (!__extack) \ + break; \ + \ + if (snprintf(__extack->_msg_buf, NETLINK_MAX_FMTMSG_LEN, \ + "%s" fmt "%s", "", ##args, "") >= \ + NETLINK_MAX_FMTMSG_LEN) \ + net_warn_ratelimited("%s" fmt "%s", "truncated extack: ", \ + ##args, "\n"); \ + \ + do_trace_netlink_extack(__extack->_msg_buf); \ + \ + __extack->_msg = __extack->_msg_buf; \ + __extack->bad_attr = (attr); \ + __extack->policy = (pol); \ +} while (0) + #define NL_SET_ERR_MSG_ATTR(extack, attr, msg) \ NL_SET_ERR_MSG_ATTR_POL(extack, attr, NULL, msg) +#define NL_SET_ERR_MSG_ATTR_FMT(extack, attr, msg, args...) \ + NL_SET_ERR_MSG_ATTR_POL_FMT(extack, attr, NULL, msg, ##args) + #define NL_SET_ERR_ATTR_MISS(extack, nest, type) do { \ struct netlink_ext_ack *__extack = (extack); \ \ -- cgit v1.2.3 From 233eb4e786b57ea686b51c13a04cc2839fd682fc Mon Sep 17 00:00:00 2001 From: Shay Agroskin Date: Thu, 23 Mar 2023 18:36:05 +0200 Subject: ethtool: Add support for configuring tx_push_buf_len This attribute, which is part of ethtool's ring param configuration allows the user to specify the maximum number of the packet's payload that can be written directly to the device. Example usage: # ethtool -G [interface] tx-push-buf-len [number of bytes] Co-developed-by: Jakub Kicinski Signed-off-by: Shay Agroskin Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 2792185dda22..798d35890118 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -75,6 +75,8 @@ enum { * @tx_push: The flag of tx push mode * @rx_push: The flag of rx push mode * @cqe_size: Size of TX/RX completion queue event + * @tx_push_buf_len: Size of TX push buffer + * @tx_push_buf_max_len: Maximum allowed size of TX push buffer */ struct kernel_ethtool_ringparam { u32 rx_buf_len; @@ -82,6 +84,8 @@ struct kernel_ethtool_ringparam { u8 tx_push; u8 rx_push; u32 cqe_size; + u32 tx_push_buf_len; + u32 tx_push_buf_max_len; }; /** @@ -90,12 +94,14 @@ struct kernel_ethtool_ringparam { * @ETHTOOL_RING_USE_CQE_SIZE: capture for setting cqe_size * @ETHTOOL_RING_USE_TX_PUSH: capture for setting tx_push * @ETHTOOL_RING_USE_RX_PUSH: capture for setting rx_push + * @ETHTOOL_RING_USE_TX_PUSH_BUF_LEN: capture for setting tx_push_buf_len */ enum ethtool_supported_ring_param { - ETHTOOL_RING_USE_RX_BUF_LEN = BIT(0), - ETHTOOL_RING_USE_CQE_SIZE = BIT(1), - ETHTOOL_RING_USE_TX_PUSH = BIT(2), - ETHTOOL_RING_USE_RX_PUSH = BIT(3), + ETHTOOL_RING_USE_RX_BUF_LEN = BIT(0), + ETHTOOL_RING_USE_CQE_SIZE = BIT(1), + ETHTOOL_RING_USE_TX_PUSH = BIT(2), + ETHTOOL_RING_USE_RX_PUSH = BIT(3), + ETHTOOL_RING_USE_TX_PUSH_BUF_LEN = BIT(4), }; #define __ETH_RSS_HASH_BIT(bit) ((u32)1 << (bit)) -- cgit v1.2.3 From e5ab9eff46b04c5a04778e40d7092fed3fda52ca Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 23 Mar 2023 21:55:30 +0100 Subject: atomics: Provide atomic_add_negative() variants atomic_add_negative() does not provide the relaxed/acquire/release variants. Provide them in preparation for a new scalable reference count algorithm. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20230323102800.101763813@linutronix.de --- include/linux/atomic/atomic-arch-fallback.h | 208 ++++++++++++++++++++++++++-- include/linux/atomic/atomic-instrumented.h | 68 ++++++++- include/linux/atomic/atomic-long.h | 38 ++++- 3 files changed, 303 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/atomic/atomic-arch-fallback.h b/include/linux/atomic/atomic-arch-fallback.h index 77bc5522e61c..4226379a232d 100644 --- a/include/linux/atomic/atomic-arch-fallback.h +++ b/include/linux/atomic/atomic-arch-fallback.h @@ -1208,15 +1208,21 @@ arch_atomic_inc_and_test(atomic_t *v) #define arch_atomic_inc_and_test arch_atomic_inc_and_test #endif +#ifndef arch_atomic_add_negative_relaxed +#ifdef arch_atomic_add_negative +#define arch_atomic_add_negative_acquire arch_atomic_add_negative +#define arch_atomic_add_negative_release arch_atomic_add_negative +#define arch_atomic_add_negative_relaxed arch_atomic_add_negative +#endif /* arch_atomic_add_negative */ + #ifndef arch_atomic_add_negative /** - * arch_atomic_add_negative - add and test if negative + * arch_atomic_add_negative - Add and test if negative * @i: integer value to add * @v: pointer of type atomic_t * - * Atomically adds @i to @v and returns true - * if the result is negative, or false when - * result is greater than or equal to zero. + * Atomically adds @i to @v and returns true if the result is negative, + * or false when the result is greater than or equal to zero. */ static __always_inline bool arch_atomic_add_negative(int i, atomic_t *v) @@ -1226,6 +1232,95 @@ arch_atomic_add_negative(int i, atomic_t *v) #define arch_atomic_add_negative arch_atomic_add_negative #endif +#ifndef arch_atomic_add_negative_acquire +/** + * arch_atomic_add_negative_acquire - Add and test if negative + * @i: integer value to add + * @v: pointer of type atomic_t + * + * Atomically adds @i to @v and returns true if the result is negative, + * or false when the result is greater than or equal to zero. + */ +static __always_inline bool +arch_atomic_add_negative_acquire(int i, atomic_t *v) +{ + return arch_atomic_add_return_acquire(i, v) < 0; +} +#define arch_atomic_add_negative_acquire arch_atomic_add_negative_acquire +#endif + +#ifndef arch_atomic_add_negative_release +/** + * arch_atomic_add_negative_release - Add and test if negative + * @i: integer value to add + * @v: pointer of type atomic_t + * + * Atomically adds @i to @v and returns true if the result is negative, + * or false when the result is greater than or equal to zero. + */ +static __always_inline bool +arch_atomic_add_negative_release(int i, atomic_t *v) +{ + return arch_atomic_add_return_release(i, v) < 0; +} +#define arch_atomic_add_negative_release arch_atomic_add_negative_release +#endif + +#ifndef arch_atomic_add_negative_relaxed +/** + * arch_atomic_add_negative_relaxed - Add and test if negative + * @i: integer value to add + * @v: pointer of type atomic_t + * + * Atomically adds @i to @v and returns true if the result is negative, + * or false when the result is greater than or equal to zero. + */ +static __always_inline bool +arch_atomic_add_negative_relaxed(int i, atomic_t *v) +{ + return arch_atomic_add_return_relaxed(i, v) < 0; +} +#define arch_atomic_add_negative_relaxed arch_atomic_add_negative_relaxed +#endif + +#else /* arch_atomic_add_negative_relaxed */ + +#ifndef arch_atomic_add_negative_acquire +static __always_inline bool +arch_atomic_add_negative_acquire(int i, atomic_t *v) +{ + bool ret = arch_atomic_add_negative_relaxed(i, v); + __atomic_acquire_fence(); + return ret; +} +#define arch_atomic_add_negative_acquire arch_atomic_add_negative_acquire +#endif + +#ifndef arch_atomic_add_negative_release +static __always_inline bool +arch_atomic_add_negative_release(int i, atomic_t *v) +{ + __atomic_release_fence(); + return arch_atomic_add_negative_relaxed(i, v); +} +#define arch_atomic_add_negative_release arch_atomic_add_negative_release +#endif + +#ifndef arch_atomic_add_negative +static __always_inline bool +arch_atomic_add_negative(int i, atomic_t *v) +{ + bool ret; + __atomic_pre_full_fence(); + ret = arch_atomic_add_negative_relaxed(i, v); + __atomic_post_full_fence(); + return ret; +} +#define arch_atomic_add_negative arch_atomic_add_negative +#endif + +#endif /* arch_atomic_add_negative_relaxed */ + #ifndef arch_atomic_fetch_add_unless /** * arch_atomic_fetch_add_unless - add unless the number is already a given value @@ -2329,15 +2424,21 @@ arch_atomic64_inc_and_test(atomic64_t *v) #define arch_atomic64_inc_and_test arch_atomic64_inc_and_test #endif +#ifndef arch_atomic64_add_negative_relaxed +#ifdef arch_atomic64_add_negative +#define arch_atomic64_add_negative_acquire arch_atomic64_add_negative +#define arch_atomic64_add_negative_release arch_atomic64_add_negative +#define arch_atomic64_add_negative_relaxed arch_atomic64_add_negative +#endif /* arch_atomic64_add_negative */ + #ifndef arch_atomic64_add_negative /** - * arch_atomic64_add_negative - add and test if negative + * arch_atomic64_add_negative - Add and test if negative * @i: integer value to add * @v: pointer of type atomic64_t * - * Atomically adds @i to @v and returns true - * if the result is negative, or false when - * result is greater than or equal to zero. + * Atomically adds @i to @v and returns true if the result is negative, + * or false when the result is greater than or equal to zero. */ static __always_inline bool arch_atomic64_add_negative(s64 i, atomic64_t *v) @@ -2347,6 +2448,95 @@ arch_atomic64_add_negative(s64 i, atomic64_t *v) #define arch_atomic64_add_negative arch_atomic64_add_negative #endif +#ifndef arch_atomic64_add_negative_acquire +/** + * arch_atomic64_add_negative_acquire - Add and test if negative + * @i: integer value to add + * @v: pointer of type atomic64_t + * + * Atomically adds @i to @v and returns true if the result is negative, + * or false when the result is greater than or equal to zero. + */ +static __always_inline bool +arch_atomic64_add_negative_acquire(s64 i, atomic64_t *v) +{ + return arch_atomic64_add_return_acquire(i, v) < 0; +} +#define arch_atomic64_add_negative_acquire arch_atomic64_add_negative_acquire +#endif + +#ifndef arch_atomic64_add_negative_release +/** + * arch_atomic64_add_negative_release - Add and test if negative + * @i: integer value to add + * @v: pointer of type atomic64_t + * + * Atomically adds @i to @v and returns true if the result is negative, + * or false when the result is greater than or equal to zero. + */ +static __always_inline bool +arch_atomic64_add_negative_release(s64 i, atomic64_t *v) +{ + return arch_atomic64_add_return_release(i, v) < 0; +} +#define arch_atomic64_add_negative_release arch_atomic64_add_negative_release +#endif + +#ifndef arch_atomic64_add_negative_relaxed +/** + * arch_atomic64_add_negative_relaxed - Add and test if negative + * @i: integer value to add + * @v: pointer of type atomic64_t + * + * Atomically adds @i to @v and returns true if the result is negative, + * or false when the result is greater than or equal to zero. + */ +static __always_inline bool +arch_atomic64_add_negative_relaxed(s64 i, atomic64_t *v) +{ + return arch_atomic64_add_return_relaxed(i, v) < 0; +} +#define arch_atomic64_add_negative_relaxed arch_atomic64_add_negative_relaxed +#endif + +#else /* arch_atomic64_add_negative_relaxed */ + +#ifndef arch_atomic64_add_negative_acquire +static __always_inline bool +arch_atomic64_add_negative_acquire(s64 i, atomic64_t *v) +{ + bool ret = arch_atomic64_add_negative_relaxed(i, v); + __atomic_acquire_fence(); + return ret; +} +#define arch_atomic64_add_negative_acquire arch_atomic64_add_negative_acquire +#endif + +#ifndef arch_atomic64_add_negative_release +static __always_inline bool +arch_atomic64_add_negative_release(s64 i, atomic64_t *v) +{ + __atomic_release_fence(); + return arch_atomic64_add_negative_relaxed(i, v); +} +#define arch_atomic64_add_negative_release arch_atomic64_add_negative_release +#endif + +#ifndef arch_atomic64_add_negative +static __always_inline bool +arch_atomic64_add_negative(s64 i, atomic64_t *v) +{ + bool ret; + __atomic_pre_full_fence(); + ret = arch_atomic64_add_negative_relaxed(i, v); + __atomic_post_full_fence(); + return ret; +} +#define arch_atomic64_add_negative arch_atomic64_add_negative +#endif + +#endif /* arch_atomic64_add_negative_relaxed */ + #ifndef arch_atomic64_fetch_add_unless /** * arch_atomic64_fetch_add_unless - add unless the number is already a given value @@ -2456,4 +2646,4 @@ arch_atomic64_dec_if_positive(atomic64_t *v) #endif #endif /* _LINUX_ATOMIC_FALLBACK_H */ -// b5e87bdd5ede61470c29f7a7e4de781af3770f09 +// 00071fffa021cec66f6290d706d69c91df87bade diff --git a/include/linux/atomic/atomic-instrumented.h b/include/linux/atomic/atomic-instrumented.h index 7a139ec030b0..0496816738ca 100644 --- a/include/linux/atomic/atomic-instrumented.h +++ b/include/linux/atomic/atomic-instrumented.h @@ -592,6 +592,28 @@ atomic_add_negative(int i, atomic_t *v) return arch_atomic_add_negative(i, v); } +static __always_inline bool +atomic_add_negative_acquire(int i, atomic_t *v) +{ + instrument_atomic_read_write(v, sizeof(*v)); + return arch_atomic_add_negative_acquire(i, v); +} + +static __always_inline bool +atomic_add_negative_release(int i, atomic_t *v) +{ + kcsan_release(); + instrument_atomic_read_write(v, sizeof(*v)); + return arch_atomic_add_negative_release(i, v); +} + +static __always_inline bool +atomic_add_negative_relaxed(int i, atomic_t *v) +{ + instrument_atomic_read_write(v, sizeof(*v)); + return arch_atomic_add_negative_relaxed(i, v); +} + static __always_inline int atomic_fetch_add_unless(atomic_t *v, int a, int u) { @@ -1211,6 +1233,28 @@ atomic64_add_negative(s64 i, atomic64_t *v) return arch_atomic64_add_negative(i, v); } +static __always_inline bool +atomic64_add_negative_acquire(s64 i, atomic64_t *v) +{ + instrument_atomic_read_write(v, sizeof(*v)); + return arch_atomic64_add_negative_acquire(i, v); +} + +static __always_inline bool +atomic64_add_negative_release(s64 i, atomic64_t *v) +{ + kcsan_release(); + instrument_atomic_read_write(v, sizeof(*v)); + return arch_atomic64_add_negative_release(i, v); +} + +static __always_inline bool +atomic64_add_negative_relaxed(s64 i, atomic64_t *v) +{ + instrument_atomic_read_write(v, sizeof(*v)); + return arch_atomic64_add_negative_relaxed(i, v); +} + static __always_inline s64 atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u) { @@ -1830,6 +1874,28 @@ atomic_long_add_negative(long i, atomic_long_t *v) return arch_atomic_long_add_negative(i, v); } +static __always_inline bool +atomic_long_add_negative_acquire(long i, atomic_long_t *v) +{ + instrument_atomic_read_write(v, sizeof(*v)); + return arch_atomic_long_add_negative_acquire(i, v); +} + +static __always_inline bool +atomic_long_add_negative_release(long i, atomic_long_t *v) +{ + kcsan_release(); + instrument_atomic_read_write(v, sizeof(*v)); + return arch_atomic_long_add_negative_release(i, v); +} + +static __always_inline bool +atomic_long_add_negative_relaxed(long i, atomic_long_t *v) +{ + instrument_atomic_read_write(v, sizeof(*v)); + return arch_atomic_long_add_negative_relaxed(i, v); +} + static __always_inline long atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u) { @@ -2083,4 +2149,4 @@ atomic_long_dec_if_positive(atomic_long_t *v) }) #endif /* _LINUX_ATOMIC_INSTRUMENTED_H */ -// 764f741eb77a7ad565dc8d99ce2837d5542e8aee +// 1b485de9cbaa4900de59e14ee2084357eaeb1c3a diff --git a/include/linux/atomic/atomic-long.h b/include/linux/atomic/atomic-long.h index 800b8c35992d..2fc51ba66beb 100644 --- a/include/linux/atomic/atomic-long.h +++ b/include/linux/atomic/atomic-long.h @@ -479,6 +479,24 @@ arch_atomic_long_add_negative(long i, atomic_long_t *v) return arch_atomic64_add_negative(i, v); } +static __always_inline bool +arch_atomic_long_add_negative_acquire(long i, atomic_long_t *v) +{ + return arch_atomic64_add_negative_acquire(i, v); +} + +static __always_inline bool +arch_atomic_long_add_negative_release(long i, atomic_long_t *v) +{ + return arch_atomic64_add_negative_release(i, v); +} + +static __always_inline bool +arch_atomic_long_add_negative_relaxed(long i, atomic_long_t *v) +{ + return arch_atomic64_add_negative_relaxed(i, v); +} + static __always_inline long arch_atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u) { @@ -973,6 +991,24 @@ arch_atomic_long_add_negative(long i, atomic_long_t *v) return arch_atomic_add_negative(i, v); } +static __always_inline bool +arch_atomic_long_add_negative_acquire(long i, atomic_long_t *v) +{ + return arch_atomic_add_negative_acquire(i, v); +} + +static __always_inline bool +arch_atomic_long_add_negative_release(long i, atomic_long_t *v) +{ + return arch_atomic_add_negative_release(i, v); +} + +static __always_inline bool +arch_atomic_long_add_negative_relaxed(long i, atomic_long_t *v) +{ + return arch_atomic_add_negative_relaxed(i, v); +} + static __always_inline long arch_atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u) { @@ -1011,4 +1047,4 @@ arch_atomic_long_dec_if_positive(atomic_long_t *v) #endif /* CONFIG_64BIT */ #endif /* _LINUX_ATOMIC_LONG_H */ -// e8f0e08ff072b74d180eabe2ad001282b38c2c88 +// a194c07d7d2f4b0e178d3c118c919775d5d65f50 -- cgit v1.2.3 From ee1ee6db07795d9637bc5e8993a8ddcf886541ef Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 23 Mar 2023 21:55:31 +0100 Subject: atomics: Provide rcuref - scalable reference counting atomic_t based reference counting, including refcount_t, uses atomic_inc_not_zero() for acquiring a reference. atomic_inc_not_zero() is implemented with a atomic_try_cmpxchg() loop. High contention of the reference count leads to retry loops and scales badly. There is nothing to improve on this implementation as the semantics have to be preserved. Provide rcuref as a scalable alternative solution which is suitable for RCU managed objects. Similar to refcount_t it comes with overflow and underflow detection and mitigation. rcuref treats the underlying atomic_t as an unsigned integer and partitions this space into zones: 0x00000000 - 0x7FFFFFFF valid zone (1 .. (INT_MAX + 1) references) 0x80000000 - 0xBFFFFFFF saturation zone 0xC0000000 - 0xFFFFFFFE dead zone 0xFFFFFFFF no reference rcuref_get() unconditionally increments the reference count with atomic_add_negative_relaxed(). rcuref_put() unconditionally decrements the reference count with atomic_add_negative_release(). This unconditional increment avoids the inc_not_zero() problem, but requires a more complex implementation on the put() side when the count drops from 0 to -1. When this transition is detected then it is attempted to mark the reference count dead, by setting it to the midpoint of the dead zone with a single atomic_cmpxchg_release() operation. This operation can fail due to a concurrent rcuref_get() elevating the reference count from -1 to 0 again. If the unconditional increment in rcuref_get() hits a reference count which is marked dead (or saturated) it will detect it after the fact and bring back the reference count to the midpoint of the respective zone. The zones provide enough tolerance which makes it practically impossible to escape from a zone. The racy implementation of rcuref_put() requires to protect rcuref_put() against a grace period ending in order to prevent a subtle use after free. As RCU is the only mechanism which allows to protect against that, it is not possible to fully replace the atomic_inc_not_zero() based implementation of refcount_t with this scheme. The final drop is slightly more expensive than the atomic_dec_return() counterpart, but that's not the case which this is optimized for. The optimization is on the high frequeunt get()/put() pairs and their scalability. The performance of an uncontended rcuref_get()/put() pair where the put() is not dropping the last reference is still on par with the plain atomic operations, while at the same time providing overflow and underflow detection and mitigation. The performance of rcuref compared to plain atomic_inc_not_zero() and atomic_dec_return() based reference counting under contention: - Micro benchmark: All CPUs running a increment/decrement loop on an elevated reference count, which means the 0 to -1 transition never happens. The performance gain depends on microarchitecture and the number of CPUs and has been observed in the range of 1.3X to 4.7X - Conversion of dst_entry::__refcnt to rcuref and testing with the localhost memtier/memcached benchmark. That benchmark shows the reference count contention prominently. The performance gain depends on microarchitecture and the number of CPUs and has been observed in the range of 1.1X to 2.6X over the previous fix for the false sharing issue vs. struct dst_entry::__refcnt. When memtier is run over a real 1Gb network connection, there is a small gain on top of the false sharing fix. The two changes combined result in a 2%-5% total gain for that networked test. Reported-by: Wangyang Guo Reported-by: Arjan Van De Ven Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20230323102800.158429195@linutronix.de --- include/linux/rcuref.h | 155 +++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/types.h | 6 ++ 2 files changed, 161 insertions(+) create mode 100644 include/linux/rcuref.h (limited to 'include/linux') diff --git a/include/linux/rcuref.h b/include/linux/rcuref.h new file mode 100644 index 000000000000..2c8bfd0f1b6b --- /dev/null +++ b/include/linux/rcuref.h @@ -0,0 +1,155 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _LINUX_RCUREF_H +#define _LINUX_RCUREF_H + +#include +#include +#include +#include +#include +#include + +#define RCUREF_ONEREF 0x00000000U +#define RCUREF_MAXREF 0x7FFFFFFFU +#define RCUREF_SATURATED 0xA0000000U +#define RCUREF_RELEASED 0xC0000000U +#define RCUREF_DEAD 0xE0000000U +#define RCUREF_NOREF 0xFFFFFFFFU + +/** + * rcuref_init - Initialize a rcuref reference count with the given reference count + * @ref: Pointer to the reference count + * @cnt: The initial reference count typically '1' + */ +static inline void rcuref_init(rcuref_t *ref, unsigned int cnt) +{ + atomic_set(&ref->refcnt, cnt - 1); +} + +/** + * rcuref_read - Read the number of held reference counts of a rcuref + * @ref: Pointer to the reference count + * + * Return: The number of held references (0 ... N) + */ +static inline unsigned int rcuref_read(rcuref_t *ref) +{ + unsigned int c = atomic_read(&ref->refcnt); + + /* Return 0 if within the DEAD zone. */ + return c >= RCUREF_RELEASED ? 0 : c + 1; +} + +extern __must_check bool rcuref_get_slowpath(rcuref_t *ref); + +/** + * rcuref_get - Acquire one reference on a rcuref reference count + * @ref: Pointer to the reference count + * + * Similar to atomic_inc_not_zero() but saturates at RCUREF_MAXREF. + * + * Provides no memory ordering, it is assumed the caller has guaranteed the + * object memory to be stable (RCU, etc.). It does provide a control dependency + * and thereby orders future stores. See documentation in lib/rcuref.c + * + * Return: + * False if the attempt to acquire a reference failed. This happens + * when the last reference has been put already + * + * True if a reference was successfully acquired + */ +static inline __must_check bool rcuref_get(rcuref_t *ref) +{ + /* + * Unconditionally increase the reference count. The saturation and + * dead zones provide enough tolerance for this. + */ + if (likely(!atomic_add_negative_relaxed(1, &ref->refcnt))) + return true; + + /* Handle the cases inside the saturation and dead zones */ + return rcuref_get_slowpath(ref); +} + +extern __must_check bool rcuref_put_slowpath(rcuref_t *ref); + +/* + * Internal helper. Do not invoke directly. + */ +static __always_inline __must_check bool __rcuref_put(rcuref_t *ref) +{ + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && preemptible(), + "suspicious rcuref_put_rcusafe() usage"); + /* + * Unconditionally decrease the reference count. The saturation and + * dead zones provide enough tolerance for this. + */ + if (likely(!atomic_add_negative_release(-1, &ref->refcnt))) + return false; + + /* + * Handle the last reference drop and cases inside the saturation + * and dead zones. + */ + return rcuref_put_slowpath(ref); +} + +/** + * rcuref_put_rcusafe -- Release one reference for a rcuref reference count RCU safe + * @ref: Pointer to the reference count + * + * Provides release memory ordering, such that prior loads and stores are done + * before, and provides an acquire ordering on success such that free() + * must come after. + * + * Can be invoked from contexts, which guarantee that no grace period can + * happen which would free the object concurrently if the decrement drops + * the last reference and the slowpath races against a concurrent get() and + * put() pair. rcu_read_lock()'ed and atomic contexts qualify. + * + * Return: + * True if this was the last reference with no future references + * possible. This signals the caller that it can safely release the + * object which is protected by the reference counter. + * + * False if there are still active references or the put() raced + * with a concurrent get()/put() pair. Caller is not allowed to + * release the protected object. + */ +static inline __must_check bool rcuref_put_rcusafe(rcuref_t *ref) +{ + return __rcuref_put(ref); +} + +/** + * rcuref_put -- Release one reference for a rcuref reference count + * @ref: Pointer to the reference count + * + * Can be invoked from any context. + * + * Provides release memory ordering, such that prior loads and stores are done + * before, and provides an acquire ordering on success such that free() + * must come after. + * + * Return: + * + * True if this was the last reference with no future references + * possible. This signals the caller that it can safely schedule the + * object, which is protected by the reference counter, for + * deconstruction. + * + * False if there are still active references or the put() raced + * with a concurrent get()/put() pair. Caller is not allowed to + * deconstruct the protected object. + */ +static inline __must_check bool rcuref_put(rcuref_t *ref) +{ + bool released; + + preempt_disable(); + released = __rcuref_put(ref); + preempt_enable(); + return released; +} + +#endif diff --git a/include/linux/types.h b/include/linux/types.h index ea8cf60a8a79..688fb943556a 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -175,6 +175,12 @@ typedef struct { } atomic64_t; #endif +typedef struct { + atomic_t refcnt; +} rcuref_t; + +#define RCUREF_INIT(i) { .refcnt = ATOMIC_INIT(i - 1) } + struct list_head { struct list_head *next, *prev; }; -- cgit v1.2.3 From 634f1a7110b439c65fd8a809171c1d2d28bcea6f Mon Sep 17 00:00:00 2001 From: Bobby Eshleman Date: Mon, 27 Mar 2023 19:11:51 +0000 Subject: vsock: support sockmap This patch adds sockmap support for vsock sockets. It is intended to be usable by all transports, but only the virtio and loopback transports are implemented. SOCK_STREAM, SOCK_DGRAM, and SOCK_SEQPACKET are all supported. Signed-off-by: Bobby Eshleman Acked-by: Michael S. Tsirkin Reviewed-by: Stefano Garzarella Signed-off-by: David S. Miller --- include/linux/virtio_vsock.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h index 3f9c16611306..c58453699ee9 100644 --- a/include/linux/virtio_vsock.h +++ b/include/linux/virtio_vsock.h @@ -245,4 +245,5 @@ u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 wanted); void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit); void virtio_transport_deliver_tap_pkt(struct sk_buff *skb); int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list); +int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t read_actor); #endif /* _LINUX_VIRTIO_VSOCK_H */ -- cgit v1.2.3 From c59647c0dc679008886756a888368da1c6d4ccd3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 Mar 2023 23:50:19 +0000 Subject: net: add softnet_data.in_net_rx_action We want to make two optimizations in napi_schedule_rps() and ____napi_schedule() which require to know if these helpers are called from net_rx_action(), instead of being called from other contexts. sd.in_net_rx_action is only read/written by the owning cpu. Signed-off-by: Eric Dumazet Reviewed-by: Jason Xing Tested-by: Jason Xing Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 18a5be6ddd0f..c8c634091a65 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3188,6 +3188,7 @@ struct softnet_data { #ifdef CONFIG_RPS struct softnet_data *rps_ipi_list; #endif + bool in_net_rx_action; #ifdef CONFIG_NET_FLOW_LIMIT struct sd_flow_limit __rcu *flow_limit; #endif -- cgit v1.2.3 From dd2d6604407da5b1b260faee409cd601fe914ce9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 30 Mar 2023 21:47:31 -0700 Subject: net: minor reshuffle of napi_struct napi_id is read by GRO and drivers to mark skbs, and it currently sits at the end of the structure, in a mostly unused cache line. Move it up into a hole, and separate the clearly control path fields from the important ones. Before: struct napi_struct { struct list_head poll_list; /* 0 16 */ long unsigned int state; /* 16 8 */ int weight; /* 24 4 */ int defer_hard_irqs_count; /* 28 4 */ long unsigned int gro_bitmask; /* 32 8 */ int (*poll)(struct napi_struct *, int); /* 40 8 */ int poll_owner; /* 48 4 */ /* XXX 4 bytes hole, try to pack */ struct net_device * dev; /* 56 8 */ /* --- cacheline 1 boundary (64 bytes) --- */ struct gro_list gro_hash[8]; /* 64 192 */ /* --- cacheline 4 boundary (256 bytes) --- */ struct sk_buff * skb; /* 256 8 */ struct list_head rx_list; /* 264 16 */ int rx_count; /* 280 4 */ /* XXX 4 bytes hole, try to pack */ struct hrtimer timer; /* 288 64 */ /* XXX last struct has 4 bytes of padding */ /* --- cacheline 5 boundary (320 bytes) was 32 bytes ago --- */ struct list_head dev_list; /* 352 16 */ struct hlist_node napi_hash_node; /* 368 16 */ /* --- cacheline 6 boundary (384 bytes) --- */ unsigned int napi_id; /* 384 4 */ /* XXX 4 bytes hole, try to pack */ struct task_struct * thread; /* 392 8 */ /* size: 400, cachelines: 7, members: 17 */ /* sum members: 388, holes: 3, sum holes: 12 */ /* paddings: 1, sum paddings: 4 */ /* last cacheline: 16 bytes */ }; After: struct napi_struct { struct list_head poll_list; /* 0 16 */ long unsigned int state; /* 16 8 */ int weight; /* 24 4 */ int defer_hard_irqs_count; /* 28 4 */ long unsigned int gro_bitmask; /* 32 8 */ int (*poll)(struct napi_struct *, int); /* 40 8 */ int poll_owner; /* 48 4 */ /* XXX 4 bytes hole, try to pack */ struct net_device * dev; /* 56 8 */ /* --- cacheline 1 boundary (64 bytes) --- */ struct gro_list gro_hash[8]; /* 64 192 */ /* --- cacheline 4 boundary (256 bytes) --- */ struct sk_buff * skb; /* 256 8 */ struct list_head rx_list; /* 264 16 */ int rx_count; /* 280 4 */ unsigned int napi_id; /* 284 4 */ struct hrtimer timer; /* 288 64 */ /* XXX last struct has 4 bytes of padding */ /* --- cacheline 5 boundary (320 bytes) was 32 bytes ago --- */ struct task_struct * thread; /* 352 8 */ struct list_head dev_list; /* 360 16 */ struct hlist_node napi_hash_node; /* 376 16 */ /* size: 392, cachelines: 7, members: 17 */ /* sum members: 388, holes: 1, sum holes: 4 */ /* paddings: 1, sum paddings: 4 */ /* forced alignments: 1 */ /* last cacheline: 8 bytes */ } __attribute__((__aligned__(8))); Signed-off-by: Jakub Kicinski Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c8c634091a65..62e093a6d6d1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -367,11 +367,12 @@ struct napi_struct { struct sk_buff *skb; struct list_head rx_list; /* Pending GRO_NORMAL skbs */ int rx_count; /* length of rx_list */ + unsigned int napi_id; struct hrtimer timer; + struct task_struct *thread; + /* control-path-only fields follow */ struct list_head dev_list; struct hlist_node napi_hash_node; - unsigned int napi_id; - struct task_struct *thread; }; enum { -- cgit v1.2.3 From c4bffeaa8d50b7279c5a76597efa4b06e709df63 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 2 Apr 2023 15:37:53 +0300 Subject: net: add struct kernel_hwtstamp_config and make net_hwtstamp_validate() use it Jakub Kicinski suggested that we may want to add new UAPI for controlling hardware timestamping through netlink in the future, and in that case, we will be limited to the struct hwtstamp_config that is currently passed in fixed binary format through the SIOCGHWTSTAMP and SIOCSHWTSTAMP ioctls. It would be good if new kernel code already started operating on an extensible kernel variant of that structure, similar in concept to struct kernel_ethtool_coalesce vs struct ethtool_coalesce. Since struct hwtstamp_config is in include/uapi/linux/net_tstamp.h, here we introduce include/linux/net_tstamp.h which shadows that other header, but also includes it, so that existing includers of this header work as before. In addition to that, we add the definition for the kernel-only structure, and a helper which translates all fields by manual copying. I am doing a manual copy in order to not force the alignment (or type) of the fields of struct kernel_hwtstamp_config to be the same as of struct hwtstamp_config, even though now, they are the same. Link: https://lore.kernel.org/netdev/20230330223519.36ce7d23@kernel.org/ Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/net_tstamp.h | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 include/linux/net_tstamp.h (limited to 'include/linux') diff --git a/include/linux/net_tstamp.h b/include/linux/net_tstamp.h new file mode 100644 index 000000000000..fd67f3cc0c4b --- /dev/null +++ b/include/linux/net_tstamp.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _LINUX_NET_TIMESTAMPING_H_ +#define _LINUX_NET_TIMESTAMPING_H_ + +#include + +/** + * struct kernel_hwtstamp_config - Kernel copy of struct hwtstamp_config + * + * @flags: see struct hwtstamp_config + * @tx_type: see struct hwtstamp_config + * @rx_filter: see struct hwtstamp_config + * + * Prefer using this structure for in-kernel processing of hardware + * timestamping configuration, over the inextensible struct hwtstamp_config + * exposed to the %SIOCGHWTSTAMP and %SIOCSHWTSTAMP ioctl UAPI. + */ +struct kernel_hwtstamp_config { + int flags; + int tx_type; + int rx_filter; +}; + +static inline void hwtstamp_config_to_kernel(struct kernel_hwtstamp_config *kernel_cfg, + const struct hwtstamp_config *cfg) +{ + kernel_cfg->flags = cfg->flags; + kernel_cfg->tx_type = cfg->tx_type; + kernel_cfg->rx_filter = cfg->rx_filter; +} + +#endif /* _LINUX_NET_TIMESTAMPING_H_ */ -- cgit v1.2.3 From 88c0a6b503b7f4fffb68a8d49c3987870c5b1d6b Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 2 Apr 2023 15:37:55 +0300 Subject: net: create a netdev notifier for DSA to reject PTP on DSA master The fact that PTP 2-step TX timestamping is broken on DSA switches if the master also timestamps the same packets is documented by commit f685e609a301 ("net: dsa: Deny PTP on master if switch supports it"). We attempt to help the users avoid shooting themselves in the foot by making DSA reject the timestamping ioctls on an interface that is a DSA master, and the switch tree beneath it contains switches which are aware of PTP. The only problem is that there isn't an established way of intercepting ndo_eth_ioctl calls, so DSA creates avoidable burden upon the network stack by creating a struct dsa_netdevice_ops with overlaid function pointers that are manually checked from the relevant call sites. There used to be 2 such dsa_netdevice_ops, but now, ndo_eth_ioctl is the only one left. There is an ongoing effort to migrate driver-visible hardware timestamping control from the ndo_eth_ioctl() based API to a new ndo_hwtstamp_set() model, but DSA actively prevents that migration, since dsa_master_ioctl() is currently coded to manually call the master's legacy ndo_eth_ioctl(), and so, whenever a network device driver would be converted to the new API, DSA's restrictions would be circumvented, because any device could be used as a DSA master. The established way for unrelated modules to react on a net device event is via netdevice notifiers. So we create a new notifier which gets called whenever there is an attempt to change hardware timestamping settings on a device. Finally, there is another reason why a netdev notifier will be a good idea, besides strictly DSA, and this has to do with PHY timestamping. With ndo_eth_ioctl(), all MAC drivers must manually call phy_has_hwtstamp() before deciding whether to act upon SIOCSHWTSTAMP, otherwise they must pass this ioctl to the PHY driver via phy_mii_ioctl(). With the new ndo_hwtstamp_set() API, it will be desirable to simply not make any calls into the MAC device driver when timestamping should be performed at the PHY level. But there exist drivers, such as the lan966x switch, which need to install packet traps for PTP regardless of whether they are the layer that provides the hardware timestamps, or the PHY is. That would be impossible to support with the new API. The proposal there, too, is to introduce a netdev notifier which acts as a better cue for switching drivers to add or remove PTP packet traps, than ndo_hwtstamp_set(). The one introduced here "almost" works there as well, except for the fact that packet traps should only be installed if the PHY driver succeeded to enable hardware timestamping, whereas here, we need to deny hardware timestamping on the DSA master before it actually gets enabled. This is why this notifier is called "PRE_", and the notifier that would get used for PHY timestamping and packet traps would be called NETDEV_CHANGE_HWTSTAMP. This isn't a new concept, for example NETDEV_CHANGEUPPER and NETDEV_PRECHANGEUPPER do the same thing. In expectation of future netlink UAPI, we also pass a non-NULL extack pointer to the netdev notifier, and we make DSA populate it with an informative reason for the rejection. To avoid making it go to waste, we make the ioctl-based dev_set_hwtstamp() create a fake extack and print the message to the kernel log. Link: https://lore.kernel.org/netdev/20230401191215.tvveoi3lkawgg6g4@skbuf/ Link: https://lore.kernel.org/netdev/20230310164451.ls7bbs6pdzs4m6pw@skbuf/ Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/netdevice.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 62e093a6d6d1..a740be3bb911 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2878,6 +2878,7 @@ enum netdev_cmd { NETDEV_OFFLOAD_XSTATS_REPORT_USED, NETDEV_OFFLOAD_XSTATS_REPORT_DELTA, NETDEV_XDP_FEAT_CHANGE, + NETDEV_PRE_CHANGE_HWTSTAMP, }; const char *netdev_cmd_to_name(enum netdev_cmd cmd); @@ -2928,6 +2929,11 @@ struct netdev_notifier_pre_changeaddr_info { const unsigned char *dev_addr; }; +struct netdev_notifier_hwtstamp_info { + struct netdev_notifier_info info; /* must be first */ + struct kernel_hwtstamp_config *config; +}; + enum netdev_offload_xstats_type { NETDEV_OFFLOAD_XSTATS_TYPE_L3 = 1, }; @@ -2984,7 +2990,8 @@ netdev_notifier_info_to_extack(const struct netdev_notifier_info *info) } int call_netdevice_notifiers(unsigned long val, struct net_device *dev); - +int call_netdevice_notifiers_info(unsigned long val, + struct netdev_notifier_info *info); extern rwlock_t dev_base_lock; /* Device list lock */ -- cgit v1.2.3 From 657de1cf258dbe2489906c81bd91e4af536de255 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 2 Apr 2023 17:16:56 +0200 Subject: net: phy: smsc: add support for edpd tunable This adds support for the EDPD PHY tunable. Per default EDPD is disabled in interrupt mode, the tunable can be used to override this, e.g. if the link partner doesn't use EDPD. The interval to check for energy can be chosen between 1000ms and 2000ms. Note that this value consists of the 1000ms phylib interval for state machine runs plus the time to wait for energy being detected. v2: - consider that phylib core holds phydev->lock when calling the phy tunable hooks Signed-off-by: Heiner Kallweit Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/linux/smscphy.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/smscphy.h b/include/linux/smscphy.h index 80f37c1dba58..e1c88627755a 100644 --- a/include/linux/smscphy.h +++ b/include/linux/smscphy.h @@ -32,6 +32,10 @@ int smsc_phy_config_intr(struct phy_device *phydev); irqreturn_t smsc_phy_handle_interrupt(struct phy_device *phydev); int smsc_phy_config_init(struct phy_device *phydev); int lan87xx_read_status(struct phy_device *phydev); +int smsc_phy_get_tunable(struct phy_device *phydev, + struct ethtool_tunable *tuna, void *data); +int smsc_phy_set_tunable(struct phy_device *phydev, + struct ethtool_tunable *tuna, const void *data); int smsc_phy_probe(struct phy_device *phydev); #endif /* __LINUX_SMSCPHY_H__ */ -- cgit v1.2.3 From b7e852a9ec96635168c04204fb7cf1f7390b9a8c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 3 Apr 2023 21:50:23 -0700 Subject: bpf: Remove unused arguments from btf_struct_access(). Remove unused arguments from btf_struct_access() callback. Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: David Vernet Link: https://lore.kernel.org/bpf/20230404045029.82870-3-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 3 +-- include/linux/filter.h | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2d8f3f639e68..4f689dda748f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -893,8 +893,7 @@ struct bpf_verifier_ops { struct bpf_prog *prog, u32 *target_size); int (*btf_struct_access)(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, - int off, int size, enum bpf_access_type atype, - u32 *next_btf_id, enum bpf_type_flag *flag); + int off, int size); }; struct bpf_prog_offload_ops { diff --git a/include/linux/filter.h b/include/linux/filter.h index 23c08c31bea9..5364b0c52c1d 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -571,8 +571,7 @@ DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key); extern struct mutex nf_conn_btf_access_lock; extern int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, - int off, int size, enum bpf_access_type atype, - u32 *next_btf_id, enum bpf_type_flag *flag); + int off, int size); typedef unsigned int (*bpf_dispatcher_fn)(const void *ctx, const struct bpf_insn *insnsi, -- cgit v1.2.3 From 63260df1396578226ac3134cf7f764690002e70e Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 3 Apr 2023 21:50:24 -0700 Subject: bpf: Refactor btf_nested_type_is_trusted(). btf_nested_type_is_trusted() tries to find a struct member at corresponding offset. It works for flat structures and falls apart in more complex structs with nested structs. The offset->member search is already performed by btf_struct_walk() including nested structs. Reuse this work and pass {field name, field btf id} into btf_nested_type_is_trusted() instead of offset to make BTF_TYPE_SAFE*() logic more robust. Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: David Vernet Link: https://lore.kernel.org/bpf/20230404045029.82870-4-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4f689dda748f..002a811b6b90 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2263,7 +2263,7 @@ static inline bool bpf_tracing_btf_ctx_access(int off, int size, int btf_struct_access(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size, enum bpf_access_type atype, - u32 *next_btf_id, enum bpf_type_flag *flag); + u32 *next_btf_id, enum bpf_type_flag *flag, const char **field_name); bool btf_struct_ids_match(struct bpf_verifier_log *log, const struct btf *btf, u32 id, int off, const struct btf *need_btf, u32 need_type_id, @@ -2302,7 +2302,7 @@ struct bpf_core_ctx { bool btf_nested_type_is_trusted(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, - int off, const char *suffix); + const char *field_name, u32 btf_id, const char *suffix); bool btf_type_ids_nocast_alias(struct bpf_verifier_log *log, const struct btf *reg_btf, u32 reg_id, @@ -2517,7 +2517,8 @@ static inline struct bpf_prog *bpf_prog_by_id(u32 id) static inline int btf_struct_access(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size, enum bpf_access_type atype, - u32 *next_btf_id, enum bpf_type_flag *flag) + u32 *next_btf_id, enum bpf_type_flag *flag, + const char **field_name) { return -EACCES; } -- cgit v1.2.3 From 10739ea3132861b1344264b110ea48d951a0e3b0 Mon Sep 17 00:00:00 2001 From: Shenwei Wang Date: Mon, 3 Apr 2023 17:23:01 -0500 Subject: net: stmmac: add support for platform specific reset This patch adds support for platform-specific reset logic in the stmmac driver. Some SoCs require a different reset mechanism than the standard dwmac IP reset. To support these platforms, a new function pointer 'fix_soc_reset' is added to the plat_stmmacenet_data structure. The stmmac_reset in hwif.h is modified to call the 'fix_soc_reset' function if it exists. This enables the driver to use the platform-specific reset logic when necessary. Signed-off-by: Shenwei Wang Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230403222302.328262-1-shenwei.wang@nxp.com Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index a2414c187483..dafa001e9e7a 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -223,6 +223,7 @@ struct plat_stmmacenet_data { struct stmmac_rxq_cfg rx_queues_cfg[MTL_MAX_RX_QUEUES]; struct stmmac_txq_cfg tx_queues_cfg[MTL_MAX_TX_QUEUES]; void (*fix_mac_speed)(void *priv, unsigned int speed); + int (*fix_soc_reset)(void *priv, void __iomem *ioaddr); int (*serdes_powerup)(struct net_device *ndev, void *priv); void (*serdes_powerdown)(struct net_device *ndev, void *priv); void (*speed_mode_2500)(struct net_device *ndev, void *priv); -- cgit v1.2.3 From 34bf93472f8fb60b4189aa2872471017e739cf0a Mon Sep 17 00:00:00 2001 From: Viktor Malik Date: Thu, 30 Mar 2023 12:20:01 +0200 Subject: kallsyms: move module-related functions under correct configs Functions for searching module kallsyms should have non-empty definitions only if CONFIG_MODULES=y and CONFIG_KALLSYMS=y. Until now, only CONFIG_MODULES check was used for many of these, which may have caused complilation errors on some configs. This patch moves all relevant functions under the correct configs. Fixes: bd5314f8dd2d ("kallsyms, bpf: Move find_kallsyms_symbol_value out of internal header") Signed-off-by: Viktor Malik Reported-by: kernel test robot Link: https://lore.kernel.org/oe-kbuild-all/202303181535.RFDCnz3E-lkp@intel.com/ Link: https://lore.kernel.org/r/20230330102001.2183693-1-vmalik@redhat.com Signed-off-by: Alexei Starovoitov --- include/linux/module.h | 135 +++++++++++++++++++++++++++---------------------- 1 file changed, 74 insertions(+), 61 deletions(-) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index 41cfd3be57e5..886d24877c7c 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -608,16 +608,6 @@ static inline bool within_module(unsigned long addr, const struct module *mod) /* Search for module by name: must be in a RCU-sched critical section. */ struct module *find_module(const char *name); -/* Returns 0 and fills in value, defined and namebuf, or -ERANGE if - symnum out of range. */ -int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, - char *name, char *module_name, int *exported); - -/* Look for this name: can be of form module:name. */ -unsigned long module_kallsyms_lookup_name(const char *name); - -unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name); - extern void __noreturn __module_put_and_kthread_exit(struct module *mod, long code); #define module_put_and_kthread_exit(code) __module_put_and_kthread_exit(THIS_MODULE, code) @@ -664,17 +654,6 @@ static inline void __module_get(struct module *module) /* Dereference module function descriptor */ void *dereference_module_function_descriptor(struct module *mod, void *ptr); -/* For kallsyms to ask for address resolution. namebuf should be at - * least KSYM_NAME_LEN long: a pointer to namebuf is returned if - * found, otherwise NULL. */ -const char *module_address_lookup(unsigned long addr, - unsigned long *symbolsize, - unsigned long *offset, - char **modname, const unsigned char **modbuildid, - char *namebuf); -int lookup_module_symbol_name(unsigned long addr, char *symname); -int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, unsigned long *offset, char *modname, char *name); - int register_module_notifier(struct notifier_block *nb); int unregister_module_notifier(struct notifier_block *nb); @@ -765,45 +744,6 @@ static inline void module_put(struct module *module) #define module_name(mod) "kernel" -/* For kallsyms to ask for address resolution. NULL means not found. */ -static inline const char *module_address_lookup(unsigned long addr, - unsigned long *symbolsize, - unsigned long *offset, - char **modname, - const unsigned char **modbuildid, - char *namebuf) -{ - return NULL; -} - -static inline int lookup_module_symbol_name(unsigned long addr, char *symname) -{ - return -ERANGE; -} - -static inline int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, unsigned long *offset, char *modname, char *name) -{ - return -ERANGE; -} - -static inline int module_get_kallsym(unsigned int symnum, unsigned long *value, - char *type, char *name, - char *module_name, int *exported) -{ - return -ERANGE; -} - -static inline unsigned long module_kallsyms_lookup_name(const char *name) -{ - return 0; -} - -static inline unsigned long find_kallsyms_symbol_value(struct module *mod, - const char *name) -{ - return 0; -} - static inline int register_module_notifier(struct notifier_block *nb) { /* no events will happen anyway, so this can always succeed */ @@ -899,7 +839,36 @@ int module_kallsyms_on_each_symbol(const char *modname, int (*fn)(void *, const char *, struct module *, unsigned long), void *data); -#else + +/* For kallsyms to ask for address resolution. namebuf should be at + * least KSYM_NAME_LEN long: a pointer to namebuf is returned if + * found, otherwise NULL. + */ +const char *module_address_lookup(unsigned long addr, + unsigned long *symbolsize, + unsigned long *offset, + char **modname, const unsigned char **modbuildid, + char *namebuf); +int lookup_module_symbol_name(unsigned long addr, char *symname); +int lookup_module_symbol_attrs(unsigned long addr, + unsigned long *size, + unsigned long *offset, + char *modname, + char *name); + +/* Returns 0 and fills in value, defined and namebuf, or -ERANGE if + * symnum out of range. + */ +int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, + char *name, char *module_name, int *exported); + +/* Look for this name: can be of form module:name. */ +unsigned long module_kallsyms_lookup_name(const char *name); + +unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name); + +#else /* CONFIG_MODULES && CONFIG_KALLSYMS */ + static inline int module_kallsyms_on_each_symbol(const char *modname, int (*fn)(void *, const char *, struct module *, unsigned long), @@ -907,6 +876,50 @@ static inline int module_kallsyms_on_each_symbol(const char *modname, { return -EOPNOTSUPP; } + +/* For kallsyms to ask for address resolution. NULL means not found. */ +static inline const char *module_address_lookup(unsigned long addr, + unsigned long *symbolsize, + unsigned long *offset, + char **modname, + const unsigned char **modbuildid, + char *namebuf) +{ + return NULL; +} + +static inline int lookup_module_symbol_name(unsigned long addr, char *symname) +{ + return -ERANGE; +} + +static inline int lookup_module_symbol_attrs(unsigned long addr, + unsigned long *size, + unsigned long *offset, + char *modname, + char *name) +{ + return -ERANGE; +} + +static inline int module_get_kallsym(unsigned int symnum, unsigned long *value, + char *type, char *name, + char *module_name, int *exported) +{ + return -ERANGE; +} + +static inline unsigned long module_kallsyms_lookup_name(const char *name) +{ + return 0; +} + +static inline unsigned long find_kallsyms_symbol_value(struct module *mod, + const char *name) +{ + return 0; +} + #endif /* CONFIG_MODULES && CONFIG_KALLSYMS */ #endif /* _LINUX_MODULE_H */ -- cgit v1.2.3 From 05f3ab7780b3c0cfe26a8134606bdf641c4f4bb2 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 5 Apr 2023 17:10:25 +0200 Subject: net: ethernet: mtk_eth_soc: add code for offloading flows from wlan devices WED version 2 (on MT7986 and later) can offload flows originating from wireless devices. In order to make that work, ndo_setup_tc needs to be implemented on the netdevs. This adds the required code to offload flows coming in from WED, while keeping track of the incoming wed index used for selecting the correct PPE device. Signed-off-by: Felix Fietkau Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/linux/soc/mediatek/mtk_wed.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h index fd0b0605cf90..b2b28180dff7 100644 --- a/include/linux/soc/mediatek/mtk_wed.h +++ b/include/linux/soc/mediatek/mtk_wed.h @@ -6,6 +6,7 @@ #include #include #include +#include #define MTK_WED_TX_QUEUES 2 #define MTK_WED_RX_QUEUES 2 @@ -179,6 +180,8 @@ struct mtk_wed_ops { u32 (*irq_get)(struct mtk_wed_device *dev, u32 mask); void (*irq_set_mask)(struct mtk_wed_device *dev, u32 mask); + int (*setup_tc)(struct mtk_wed_device *wed, struct net_device *dev, + enum tc_setup_type type, void *type_data); }; extern const struct mtk_wed_ops __rcu *mtk_soc_wed_ops; @@ -237,6 +240,8 @@ mtk_wed_get_rx_capa(struct mtk_wed_device *dev) (_dev)->ops->msg_update(_dev, _id, _msg, _len) #define mtk_wed_device_stop(_dev) (_dev)->ops->stop(_dev) #define mtk_wed_device_dma_reset(_dev) (_dev)->ops->reset_dma(_dev) +#define mtk_wed_device_setup_tc(_dev, _netdev, _type, _type_data) \ + (_dev)->ops->setup_tc(_dev, _netdev, _type, _type_data) #else static inline bool mtk_wed_device_active(struct mtk_wed_device *dev) { @@ -255,6 +260,7 @@ static inline bool mtk_wed_device_active(struct mtk_wed_device *dev) #define mtk_wed_device_update_msg(_dev, _id, _msg, _len) -ENODEV #define mtk_wed_device_stop(_dev) do {} while (0) #define mtk_wed_device_dma_reset(_dev) do {} while (0) +#define mtk_wed_device_setup_tc(_dev, _netdev, _type, _type_data) -EOPNOTSUPP #endif #endif -- cgit v1.2.3 From 5a17818682cf43ad0fdd6035945f3b7a8c9dc5e9 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 6 Apr 2023 14:42:46 +0300 Subject: net: dsa: replace NETDEV_PRE_CHANGE_HWTSTAMP notifier with a stub There was a sort of rush surrounding commit 88c0a6b503b7 ("net: create a netdev notifier for DSA to reject PTP on DSA master"), due to a desire to convert DSA's attempt to deny TX timestamping on a DSA master to something that doesn't block the kernel-wide API conversion from ndo_eth_ioctl() to ndo_hwtstamp_set(). What was required was a mechanism that did not depend on ndo_eth_ioctl(), and what was provided was a mechanism that did not depend on ndo_eth_ioctl(), while at the same time introducing something that wasn't absolutely necessary - a new netdev notifier. There have been objections from Jakub Kicinski that using notifiers in general when they are not absolutely necessary creates complications to the control flow and difficulties to maintainers who look at the code. So there is a desire to not use notifiers. In addition to that, the notifier chain gets called even if there is no DSA in the system and no one is interested in applying any restriction. Take the model of udp_tunnel_nic_ops and introduce a stub mechanism, through which net/core/dev_ioctl.c can call into DSA even when CONFIG_NET_DSA=m. Compared to the code that existed prior to the notifier conversion, aka what was added in commits: - 4cfab3566710 ("net: dsa: Add wrappers for overloaded ndo_ops") - 3369afba1e46 ("net: Call into DSA netdevice_ops wrappers") this is different because we are not overloading any struct net_device_ops of the DSA master anymore, but rather, we are exposing a rather specific functionality which is orthogonal to which API is used to enable it - ndo_eth_ioctl() or ndo_hwtstamp_set(). Also, what is similar is that both approaches use function pointers to get from built-in code to DSA. There is no point in replicating the function pointers towards __dsa_master_hwtstamp_validate() once for every CPU port (dev->dsa_ptr). Instead, it is sufficient to introduce a singleton struct dsa_stubs, built into the kernel, which contains a single function pointer to __dsa_master_hwtstamp_validate(). I find this approach preferable to what we had originally, because dev->dsa_ptr->netdev_ops->ndo_do_ioctl() used to require going through struct dsa_port (dev->dsa_ptr), and so, this was incompatible with any attempts to add any data encapsulation and hide DSA data structures from the outside world. Link: https://lore.kernel.org/netdev/20230403083019.120b72fd@kernel.org/ Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/netdevice.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a740be3bb911..1c25b39681b3 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2878,7 +2878,6 @@ enum netdev_cmd { NETDEV_OFFLOAD_XSTATS_REPORT_USED, NETDEV_OFFLOAD_XSTATS_REPORT_DELTA, NETDEV_XDP_FEAT_CHANGE, - NETDEV_PRE_CHANGE_HWTSTAMP, }; const char *netdev_cmd_to_name(enum netdev_cmd cmd); @@ -2929,11 +2928,6 @@ struct netdev_notifier_pre_changeaddr_info { const unsigned char *dev_addr; }; -struct netdev_notifier_hwtstamp_info { - struct netdev_notifier_info info; /* must be first */ - struct kernel_hwtstamp_config *config; -}; - enum netdev_offload_xstats_type { NETDEV_OFFLOAD_XSTATS_TYPE_L3 = 1, }; -- cgit v1.2.3 From c91c46de6bbc1147ae5dfe046b87f5f3d6593215 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 6 Apr 2023 18:25:33 -0700 Subject: net: provide macros for commonly copied lockless queue stop/wake code A lot of drivers follow the same scheme to stop / start queues without introducing locks between xmit and NAPI tx completions. I'm guessing they all copy'n'paste each other's code. The original code dates back all the way to e1000 and Linux 2.6.19. Smaller drivers shy away from the scheme and introduce a lock which may cause deadlocks in netpoll. Provide macros which encapsulate the necessary logic. The macros do not prevent false wake ups, the extra barrier required to close that race is not worth it. See discussion in: https://lore.kernel.org/all/c39312a2-4537-14b4-270c-9fe1fbb91e89@gmail.com/ Acked-by: Herbert Xu Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1c25b39681b3..7bec9a2be8ef 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3335,6 +3335,7 @@ static inline void netif_tx_wake_all_queues(struct net_device *dev) static __always_inline void netif_tx_stop_queue(struct netdev_queue *dev_queue) { + /* Must be an atomic op see netif_txq_try_stop() */ set_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state); } -- cgit v1.2.3 From 301f227fc860624d37ba5dae9da57dcf371268db Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 6 Apr 2023 18:25:36 -0700 Subject: net: piggy back on the memory barrier in bql when waking queues Drivers call netdev_tx_completed_queue() right before netif_txq_maybe_wake(). If BQL is enabled netdev_tx_completed_queue() should issue a memory barrier, so we can depend on that separating the stop check from the consumer index update, instead of adding another barrier in netif_txq_maybe_wake(). This matters more than the barriers on the xmit path, because the wake condition is almost always true. So we issue the consumer side barrier often. Wrap netdev_tx_completed_queue() in a local helper to issue the barrier even if BQL is disabled. Keep the same semantics as netdev_tx_completed_queue() (barrier only if bytes != 0) to make it clear that the barrier is conditional. Plus since macro gets pkt/byte counts as arguments now - we can skip waking if there were no packets completed. Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7bec9a2be8ef..fe355592dfde 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3532,7 +3532,7 @@ static inline void netdev_tx_completed_queue(struct netdev_queue *dev_queue, * netdev_tx_sent_queue will miss the update and cause the queue to * be stopped forever */ - smp_mb(); + smp_mb(); /* NOTE: netdev_txq_completed_mb() assumes this exists */ if (unlikely(dql_avail(&dev_queue->dql) < 0)) return; -- cgit v1.2.3 From 4294a0a7ab6282c3d92f03de84e762dda993c93d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 6 Apr 2023 16:41:47 -0700 Subject: bpf: Split off basic BPF verifier log into separate file kernel/bpf/verifier.c file is large and growing larger all the time. So it's good to start splitting off more or less self-contained parts into separate files to keep source code size (somewhat) somewhat under control. This patch is a one step in this direction, moving some of BPF verifier log routines into a separate kernel/bpf/log.c. Right now it's most low-level and isolated routines to append data to log, reset log to previous position, etc. Eventually we could probably move verifier state printing logic here as well, but this patch doesn't attempt to do that yet. Subsequent patches will add more logic to verifier log management, so having basics in a separate file will make sure verifier.c doesn't grow more with new changes. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/20230406234205.323208-2-andrii@kernel.org --- include/linux/bpf_verifier.h | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 81d525d057c7..83dff25545ee 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -498,11 +498,6 @@ struct bpf_verifier_log { u32 len_total; }; -static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log) -{ - return log->len_used >= log->len_total - 1; -} - #define BPF_LOG_LEVEL1 1 #define BPF_LOG_LEVEL2 2 #define BPF_LOG_STATS 4 @@ -512,6 +507,11 @@ static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log) #define BPF_LOG_MIN_ALIGNMENT 8U #define BPF_LOG_ALIGNMENT 40U +static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log) +{ + return log->len_used >= log->len_total - 1; +} + static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) { return log && @@ -519,13 +519,6 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) log->level == BPF_LOG_KERNEL); } -static inline bool -bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log) -{ - return log->len_total >= 128 && log->len_total <= UINT_MAX >> 2 && - log->level && log->ubuf && !(log->level & ~BPF_LOG_MASK); -} - #define BPF_MAX_SUBPROGS 256 struct bpf_subprog_info { @@ -608,12 +601,14 @@ struct bpf_verifier_env { char type_str_buf[TYPE_STR_BUF_LEN]; }; +bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log); __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, va_list args); __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, const char *fmt, ...); __printf(2, 3) void bpf_log(struct bpf_verifier_log *log, const char *fmt, ...); +void bpf_vlog_reset(struct bpf_verifier_log *log, u32 new_pos); static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env) { -- cgit v1.2.3 From 1216640938035e63bdbd32438e91c9bcc1fd8ee1 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 6 Apr 2023 16:41:49 -0700 Subject: bpf: Switch BPF verifier log to be a rotating log by default Currently, if user-supplied log buffer to collect BPF verifier log turns out to be too small to contain full log, bpf() syscall returns -ENOSPC, fails BPF program verification/load, and preserves first N-1 bytes of the verifier log (where N is the size of user-supplied buffer). This is problematic in a bunch of common scenarios, especially when working with real-world BPF programs that tend to be pretty complex as far as verification goes and require big log buffers. Typically, it's when debugging tricky cases at log level 2 (verbose). Also, when BPF program is successfully validated, log level 2 is the only way to actually see verifier state progression and all the important details. Even with log level 1, it's possible to get -ENOSPC even if the final verifier log fits in log buffer, if there is a code path that's deep enough to fill up entire log, even if normally it would be reset later on (there is a logic to chop off successfully validated portions of BPF verifier log). In short, it's not always possible to pre-size log buffer. Also, what's worse, in practice, the end of the log most often is way more important than the beginning, but verifier stops emitting log as soon as initial log buffer is filled up. This patch switches BPF verifier log behavior to effectively behave as rotating log. That is, if user-supplied log buffer turns out to be too short, verifier will keep overwriting previously written log, effectively treating user's log buffer as a ring buffer. -ENOSPC is still going to be returned at the end, to notify user that log contents was truncated, but the important last N bytes of the log would be returned, which might be all that user really needs. This consistent -ENOSPC behavior, regardless of rotating or fixed log behavior, allows to prevent backwards compatibility breakage. The only user-visible change is which portion of verifier log user ends up seeing *if buffer is too small*. Given contents of verifier log itself is not an ABI, there is no breakage due to this behavior change. Specialized tools that rely on specific contents of verifier log in -ENOSPC scenario are expected to be easily adapted to accommodate old and new behaviors. Importantly, though, to preserve good user experience and not require every user-space application to adopt to this new behavior, before exiting to user-space verifier will rotate log (in place) to make it start at the very beginning of user buffer as a continuous zero-terminated string. The contents will be a chopped off N-1 last bytes of full verifier log, of course. Given beginning of log is sometimes important as well, we add BPF_LOG_FIXED (which equals 8) flag to force old behavior, which allows tools like veristat to request first part of verifier log, if necessary. BPF_LOG_FIXED flag is also a simple and straightforward way to check if BPF verifier supports rotating behavior. On the implementation side, conceptually, it's all simple. We maintain 64-bit logical start and end positions. If we need to truncate the log, start position will be adjusted accordingly to lag end position by N bytes. We then use those logical positions to calculate their matching actual positions in user buffer and handle wrap around the end of the buffer properly. Finally, right before returning from bpf_check(), we rotate user log buffer contents in-place as necessary, to make log contents contiguous. See comments in relevant functions for details. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Reviewed-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/20230406234205.323208-4-andrii@kernel.org --- include/linux/bpf_verifier.h | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 83dff25545ee..4c926227f612 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -491,25 +491,42 @@ struct bpf_insn_aux_data { #define BPF_VERIFIER_TMP_LOG_SIZE 1024 struct bpf_verifier_log { - u32 level; - char kbuf[BPF_VERIFIER_TMP_LOG_SIZE]; + /* Logical start and end positions of a "log window" of the verifier log. + * start_pos == 0 means we haven't truncated anything. + * Once truncation starts to happen, start_pos + len_total == end_pos, + * except during log reset situations, in which (end_pos - start_pos) + * might get smaller than len_total (see bpf_vlog_reset()). + * Generally, (end_pos - start_pos) gives number of useful data in + * user log buffer. + */ + u64 start_pos; + u64 end_pos; char __user *ubuf; - u32 len_used; + u32 level; u32 len_total; + char kbuf[BPF_VERIFIER_TMP_LOG_SIZE]; }; #define BPF_LOG_LEVEL1 1 #define BPF_LOG_LEVEL2 2 #define BPF_LOG_STATS 4 +#define BPF_LOG_FIXED 8 #define BPF_LOG_LEVEL (BPF_LOG_LEVEL1 | BPF_LOG_LEVEL2) -#define BPF_LOG_MASK (BPF_LOG_LEVEL | BPF_LOG_STATS) +#define BPF_LOG_MASK (BPF_LOG_LEVEL | BPF_LOG_STATS | BPF_LOG_FIXED) #define BPF_LOG_KERNEL (BPF_LOG_MASK + 1) /* kernel internal flag */ #define BPF_LOG_MIN_ALIGNMENT 8U #define BPF_LOG_ALIGNMENT 40U +static inline u32 bpf_log_used(const struct bpf_verifier_log *log) +{ + return log->end_pos - log->start_pos; +} + static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log) { - return log->len_used >= log->len_total - 1; + if (log->level & BPF_LOG_FIXED) + return bpf_log_used(log) >= log->len_total - 1; + return false; } static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) @@ -596,7 +613,7 @@ struct bpf_verifier_env { u32 scratched_regs; /* Same as scratched_regs but for stack slots */ u64 scratched_stack_slots; - u32 prev_log_len, prev_insn_print_len; + u64 prev_log_pos, prev_insn_print_pos; /* buffer used in reg_type_str() to generate reg_type string */ char type_str_buf[TYPE_STR_BUF_LEN]; }; @@ -608,7 +625,9 @@ __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, const char *fmt, ...); __printf(2, 3) void bpf_log(struct bpf_verifier_log *log, const char *fmt, ...); -void bpf_vlog_reset(struct bpf_verifier_log *log, u32 new_pos); +void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos); +void bpf_vlog_finalize(struct bpf_verifier_log *log); +bool bpf_vlog_truncated(const struct bpf_verifier_log *log); static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env) { -- cgit v1.2.3 From fa1c7d5cc404ac3b6e6b4ab6d00b07c76bd819be Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 6 Apr 2023 16:41:57 -0700 Subject: bpf: Keep track of total log content size in both fixed and rolling modes Change how we do accounting in BPF_LOG_FIXED mode and adopt log->end_pos as *logical* log position. This means that we can go beyond physical log buffer size now and be able to tell what log buffer size should be to fit entire log contents without -ENOSPC. To do this for BPF_LOG_FIXED mode, we need to remove a short-circuiting logic of not vsnprintf()'ing further log content once we filled up user-provided buffer, which is done by bpf_verifier_log_needed() checks. We modify these checks to always keep going if log->level is non-zero (i.e., log is requested), even if log->ubuf was NULL'ed out due to copying data to user-space, or if entire log buffer is physically full. We adopt bpf_verifier_vlog() routine to work correctly with log->ubuf == NULL condition, performing log formatting into temporary kernel buffer, doing all the necessary accounting, but just avoiding copying data out if buffer is full or NULL'ed out. With these changes, it's now possible to do this sort of determination of log contents size in both BPF_LOG_FIXED and default rolling log mode. We need to keep in mind bpf_vlog_reset(), though, which shrinks log contents after successful verification of a particular code path. This log reset means that log->end_pos isn't always increasing, so to return back to users what should be the log buffer size to fit all log content without causing -ENOSPC even in the presence of log resetting, we need to keep maximum over "lifetime" of logging. We do this accounting in bpf_vlog_update_len_max() helper. A related and subtle aspect is that with this logical log->end_pos even in BPF_LOG_FIXED mode we could temporary "overflow" buffer, but then reset it back with bpf_vlog_reset() to a position inside user-supplied log_buf. In such situation we still want to properly maintain terminating zero. We will eventually return -ENOSPC even if final log buffer is small (we detect this through log->len_max check). This behavior is simpler to reason about and is consistent with current behavior of verifier log. Handling of this required a small addition to bpf_vlog_reset() logic to avoid doing put_user() beyond physical log buffer dimensions. Another issue to keep in mind is that we limit log buffer size to 32-bit value and keep such log length as u32, but theoretically verifier could produce huge log stretching beyond 4GB. Instead of keeping (and later returning) 64-bit log length, we cap it at UINT_MAX. Current UAPI makes it impossible to specify log buffer size bigger than 4GB anyways, so we don't really loose anything here and keep everything consistently 32-bit in UAPI. This property will be utilized in next patch. Doing the same determination of maximum log buffer for rolling mode is trivial, as log->end_pos and log->start_pos are already logical positions, so there is nothing new there. These changes do incidentally fix one small issue with previous logging logic. Previously, if use provided log buffer of size N, and actual log output was exactly N-1 bytes + terminating \0, kernel logic coun't distinguish this condition from log truncation scenario which would end up with truncated log contents of N-1 bytes + terminating \0 as well. But now with log->end_pos being logical position that could go beyond actual log buffer size, we can distinguish these two conditions, which we do in this patch. This plays nicely with returning log_size_actual (implemented in UAPI in the next patch), as we can now guarantee that if user takes such log_size_actual and provides log buffer of that exact size, they will not get -ENOSPC in return. All in all, all these changes do conceptually unify fixed and rolling log modes much better, and allow a nice feature requested by users: knowing what should be the size of the buffer to avoid -ENOSPC. We'll plumb this through the UAPI and the code in the next patch. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/20230406234205.323208-12-andrii@kernel.org --- include/linux/bpf_verifier.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 4c926227f612..98d2eb382dbb 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -504,6 +504,7 @@ struct bpf_verifier_log { char __user *ubuf; u32 level; u32 len_total; + u32 len_max; char kbuf[BPF_VERIFIER_TMP_LOG_SIZE]; }; @@ -517,23 +518,16 @@ struct bpf_verifier_log { #define BPF_LOG_MIN_ALIGNMENT 8U #define BPF_LOG_ALIGNMENT 40U -static inline u32 bpf_log_used(const struct bpf_verifier_log *log) -{ - return log->end_pos - log->start_pos; -} - static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log) { if (log->level & BPF_LOG_FIXED) - return bpf_log_used(log) >= log->len_total - 1; + return log->end_pos >= log->len_total; return false; } static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) { - return log && - ((log->level && log->ubuf && !bpf_verifier_log_full(log)) || - log->level == BPF_LOG_KERNEL); + return log && log->level; } #define BPF_MAX_SUBPROGS 256 -- cgit v1.2.3 From 47a71c1f9af0a334c9dfa97633c41de4feda4287 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 6 Apr 2023 16:41:58 -0700 Subject: bpf: Add log_true_size output field to return necessary log buffer size Add output-only log_true_size and btf_log_true_size field to BPF_PROG_LOAD and BPF_BTF_LOAD commands, respectively. It will return the size of log buffer necessary to fit in all the log contents at specified log_level. This is very useful for BPF loader libraries like libbpf to be able to size log buffer correctly, but could be used by users directly, if necessary, as well. This patch plumbs all this through the code, taking into account actual bpf_attr size provided by user to determine if these new fields are expected by users. And if they are, set them from kernel on return. We refactory btf_parse() function to accommodate this, moving attr and uattr handling inside it. The rest is very straightforward code, which is split from the logging accounting changes in the previous patch to make it simpler to review logic vs UAPI changes. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/20230406234205.323208-13-andrii@kernel.org --- include/linux/bpf.h | 2 +- include/linux/btf.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 002a811b6b90..2c6095bd7d69 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2175,7 +2175,7 @@ int bpf_check_uarg_tail_zero(bpfptr_t uaddr, size_t expected_size, size_t actual_size); /* verify correctness of eBPF program */ -int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr); +int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size); #ifndef CONFIG_BPF_JIT_ALWAYS_ON void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); diff --git a/include/linux/btf.h b/include/linux/btf.h index d53b10cc55f2..495250162422 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -125,7 +125,7 @@ extern const struct file_operations btf_fops; void btf_get(struct btf *btf); void btf_put(struct btf *btf); -int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr); +int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_sz); struct btf *btf_get_by_fd(int fd); int btf_get_info_by_fd(const struct btf *btf, const union bpf_attr *attr, -- cgit v1.2.3 From bdcab4144f5da97cc0fa7e1dd63b8475e10c8f0a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 6 Apr 2023 16:41:59 -0700 Subject: bpf: Simplify internal verifier log interface Simplify internal verifier log API down to bpf_vlog_init() and bpf_vlog_finalize(). The former handles input arguments validation in one place and makes it easier to change it. The latter subsumes -ENOSPC (truncation) and -EFAULT handling and simplifies both caller's code (bpf_check() and btf_parse()). For btf_parse(), this patch also makes sure that verifier log finalization happens even if there is some error condition during BTF verification process prior to normal finalization step. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/20230406234205.323208-14-andrii@kernel.org --- include/linux/bpf_verifier.h | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 98d2eb382dbb..f03852b89d28 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -518,13 +518,6 @@ struct bpf_verifier_log { #define BPF_LOG_MIN_ALIGNMENT 8U #define BPF_LOG_ALIGNMENT 40U -static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log) -{ - if (log->level & BPF_LOG_FIXED) - return log->end_pos >= log->len_total; - return false; -} - static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) { return log && log->level; @@ -612,16 +605,16 @@ struct bpf_verifier_env { char type_str_buf[TYPE_STR_BUF_LEN]; }; -bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log); __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, va_list args); __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, const char *fmt, ...); __printf(2, 3) void bpf_log(struct bpf_verifier_log *log, const char *fmt, ...); +int bpf_vlog_init(struct bpf_verifier_log *log, u32 log_level, + char __user *log_buf, u32 log_size); void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos); -void bpf_vlog_finalize(struct bpf_verifier_log *log); -bool bpf_vlog_truncated(const struct bpf_verifier_log *log); +int bpf_vlog_finalize(struct bpf_verifier_log *log, u32 *log_size_actual); static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env) { -- cgit v1.2.3 From e5688f6fb9e3d0254a8fb1c714c38e407f0baa64 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Wed, 1 Mar 2023 10:33:55 +0100 Subject: net/mlx5: Add mlx5_ifc definitions for bridge multicast support Add the required hardware definitions to mlx5_ifc: fdb_uplink_hairpin, fdb_multi_path_any_table_limit_regc, fdb_multi_path_any_table. Signed-off-by: Vlad Buslov Reviewed-by: Maor Dickman Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index e47d6c58da35..02c628f4fe26 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -880,7 +880,12 @@ enum { struct mlx5_ifc_flow_table_eswitch_cap_bits { u8 fdb_to_vport_reg_c_id[0x8]; - u8 reserved_at_8[0xd]; + u8 reserved_at_8[0x5]; + u8 fdb_uplink_hairpin[0x1]; + u8 fdb_multi_path_any_table_limit_regc[0x1]; + u8 reserved_at_f[0x3]; + u8 fdb_multi_path_any_table[0x1]; + u8 reserved_at_13[0x2]; u8 fdb_modify_header_fwd_to_table[0x1]; u8 fdb_ipv4_ttl_modify[0x1]; u8 flow_source[0x1]; -- cgit v1.2.3 From 9df839a711aee437390b16ee39cf0b5c1620be6a Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Thu, 23 Apr 2020 08:27:59 -0500 Subject: net/mlx5: Create a new profile for SFs Create a new profile for SFs in order to disable the command cache. Each function command cache consumes ~500KB of memory, when using a large number of SFs this savings is notable on memory constarined systems. Use a new profile to provide for future differences between SFs and PFs. The mr_cache not used for non-PF functions, so it is excluded from the new profile. Signed-off-by: Parav Pandit Reviewed-by: Bodong Wang Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index f243bd10a5e1..135a3c8d8237 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -751,6 +751,7 @@ enum { struct mlx5_profile { u64 mask; u8 log_max_qp; + u8 num_cmd_caches; struct { int size; int limit; -- cgit v1.2.3 From 9fa7f1de3dda0bc74c964b5127b3f22bd8ef8fd2 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Tue, 30 Aug 2022 01:20:30 +0300 Subject: net/mlx5: Add mlx5_ifc bits for modify header argument Add enum value for modify-header argument object and mlx5_bits for the related capabilities. Signed-off-by: Muhammad Sammar Signed-off-by: Yevgeny Kliteynik Reviewed-by: Alex Vesker Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 02c628f4fe26..6c84bf6eec85 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -78,12 +78,15 @@ enum { enum { MLX5_OBJ_TYPE_SW_ICM = 0x0008, + MLX5_OBJ_TYPE_HEADER_MODIFY_ARGUMENT = 0x23, }; enum { MLX5_GENERAL_OBJ_TYPES_CAP_SW_ICM = (1ULL << MLX5_OBJ_TYPE_SW_ICM), MLX5_GENERAL_OBJ_TYPES_CAP_GENEVE_TLV_OPT = (1ULL << 11), MLX5_GENERAL_OBJ_TYPES_CAP_VIRTIO_NET_Q = (1ULL << 13), + MLX5_GENERAL_OBJ_TYPES_CAP_HEADER_MODIFY_ARGUMENT = + (1ULL << MLX5_OBJ_TYPE_HEADER_MODIFY_ARGUMENT), MLX5_GENERAL_OBJ_TYPES_CAP_MACSEC_OFFLOAD = (1ULL << 39), }; @@ -321,6 +324,10 @@ enum { MLX5_FT_NIC_TX_RDMA_2_NIC_TX = BIT(1), }; +enum { + MLX5_CMD_OP_MOD_UPDATE_HEADER_MODIFY_ARGUMENT = 0x1, +}; + struct mlx5_ifc_flow_table_fields_supported_bits { u8 outer_dmac[0x1]; u8 outer_smac[0x1]; @@ -1927,7 +1934,14 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_750[0x4]; u8 max_dynamic_vf_msix_table_size[0xc]; - u8 reserved_at_760[0x20]; + u8 reserved_at_760[0x3]; + u8 log_max_num_header_modify_argument[0x5]; + u8 reserved_at_768[0x4]; + u8 log_header_modify_argument_granularity[0x4]; + u8 reserved_at_770[0x3]; + u8 log_header_modify_argument_max_alloc[0x5]; + u8 reserved_at_778[0x8]; + u8 vhca_tunnel_commands[0x40]; u8 match_definer_format_supported[0x40]; }; @@ -6361,6 +6375,18 @@ struct mlx5_ifc_general_obj_out_cmd_hdr_bits { u8 reserved_at_60[0x20]; }; +struct mlx5_ifc_modify_header_arg_bits { + u8 reserved_at_0[0x80]; + + u8 reserved_at_80[0x8]; + u8 access_pd[0x18]; +}; + +struct mlx5_ifc_create_modify_header_arg_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr; + struct mlx5_ifc_modify_header_arg_bits arg; +}; + struct mlx5_ifc_create_match_definer_in_bits { struct mlx5_ifc_general_obj_in_cmd_hdr_bits general_obj_in_cmd_hdr; -- cgit v1.2.3 From 977c4a3e7c89d5b48fdc548773fb9ed3e6c50cf8 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Tue, 30 Aug 2022 01:20:40 +0300 Subject: net/mlx5: Add new WQE for updating flow table Add new WQE type: FLOW_TBL_ACCESS, which will be used for writing modify header arguments. This type has specific control segment and special data segment. Signed-off-by: Yevgeny Kliteynik Reviewed-by: Alex Vesker Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 2 ++ include/linux/mlx5/qp.h | 10 ++++++++++ 2 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index af4dd536a52c..e4aa147ab390 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -442,6 +442,8 @@ enum { MLX5_OPCODE_UMR = 0x25, + MLX5_OPCODE_FLOW_TBL_ACCESS = 0x2c, + MLX5_OPCODE_ACCESS_ASO = 0x2d, }; diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index df55fbb65717..bd53cf4be7bd 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -499,6 +499,16 @@ struct mlx5_stride_block_ctrl_seg { __be16 num_entries; }; +struct mlx5_wqe_flow_update_ctrl_seg { + __be32 flow_idx_update; + __be32 dest_handle; + u8 reserved0[40]; +}; + +struct mlx5_wqe_header_modify_argument_update_seg { + u8 argument_list[64]; +}; + struct mlx5_core_qp { struct mlx5_core_rsc_common common; /* must be first */ void (*event) (struct mlx5_core_qp *, int); -- cgit v1.2.3 From 7d6d2dd326a8a8d32091e9748f3428dd3be68367 Mon Sep 17 00:00:00 2001 From: Martin Blumenstingl Date: Wed, 5 Apr 2023 22:07:26 +0200 Subject: mmc: sdio: add Realtek SDIO vendor ID and various wifi device IDs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the SDIO vendor ID for Realtek and some device IDs extracted from their GPL vendor driver. This will be useful in the future when the rtw88 driver gains support for these chips. Acked-by: Ulf Hansson Reviewed-by: Ping-Ke Shih Reviewed-by: Pali Rohár Signed-off-by: Martin Blumenstingl Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20230405200729.632435-7-martin.blumenstingl@googlemail.com --- include/linux/mmc/sdio_ids.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h index 0e4ef9c5127a..66f503ed2448 100644 --- a/include/linux/mmc/sdio_ids.h +++ b/include/linux/mmc/sdio_ids.h @@ -112,6 +112,15 @@ #define SDIO_VENDOR_ID_MICROCHIP_WILC 0x0296 #define SDIO_DEVICE_ID_MICROCHIP_WILC1000 0x5347 +#define SDIO_VENDOR_ID_REALTEK 0x024c +#define SDIO_DEVICE_ID_REALTEK_RTW8723BS 0xb723 +#define SDIO_DEVICE_ID_REALTEK_RTW8821BS 0xb821 +#define SDIO_DEVICE_ID_REALTEK_RTW8822BS 0xb822 +#define SDIO_DEVICE_ID_REALTEK_RTW8821CS 0xc821 +#define SDIO_DEVICE_ID_REALTEK_RTW8822CS 0xc822 +#define SDIO_DEVICE_ID_REALTEK_RTW8723DS 0xd723 +#define SDIO_DEVICE_ID_REALTEK_RTW8821DS 0xd821 + #define SDIO_VENDOR_ID_SIANO 0x039a #define SDIO_DEVICE_ID_SIANO_NOVA_B0 0x0201 #define SDIO_DEVICE_ID_SIANO_NICE 0x0202 -- cgit v1.2.3 From 33719b57f52e5b761234373f98f55f4e036d61c9 Mon Sep 17 00:00:00 2001 From: Andrew Halaney Date: Tue, 11 Apr 2023 15:04:06 -0500 Subject: net: stmmac: dwmac4: Allow platforms to specify some DMA/MTL offsets Some platforms have dwmac4 implementations that have a different address space layout than the default, resulting in the need to define their own DMA/MTL offsets. Extend the functions to allow a platform driver to indicate what its addresses are, overriding the defaults. Signed-off-by: Andrew Halaney Reviewed-by: Jesse Brandeburg Tested-by: Brian Masney Signed-off-by: Paolo Abeni --- include/linux/stmmac.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index dafa001e9e7a..225751a8fd8e 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -186,6 +186,24 @@ struct stmmac_safety_feature_cfg { u32 tmouten; }; +/* Addresses that may be customized by a platform */ +struct dwmac4_addrs { + u32 dma_chan; + u32 dma_chan_offset; + u32 mtl_chan; + u32 mtl_chan_offset; + u32 mtl_ets_ctrl; + u32 mtl_ets_ctrl_offset; + u32 mtl_txq_weight; + u32 mtl_txq_weight_offset; + u32 mtl_send_slp_cred; + u32 mtl_send_slp_cred_offset; + u32 mtl_high_cred; + u32 mtl_high_cred_offset; + u32 mtl_low_cred; + u32 mtl_low_cred_offset; +}; + struct plat_stmmacenet_data { int bus_id; int phy_addr; @@ -274,5 +292,6 @@ struct plat_stmmacenet_data { bool use_phy_wol; bool sph_disable; bool serdes_up_after_phy_linkup; + const struct dwmac4_addrs *dwmac4_addrs; }; #endif -- cgit v1.2.3 From 4fdeb847130229dc94befa241461669c7359776b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 31 Mar 2023 16:59:07 +0200 Subject: wifi: ieee80211: clean up public action codes WLAN_PUBLIC_ACTION_FTM_RESPONSE is duplicated with WLAN_PUB_ACTION_FTM, but that might better be called WLAN_PUB_ACTION_FTM_RESPONSE; clean up here. Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 2463bdd2a382..0583b2b0ce1f 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -9,7 +9,7 @@ * Copyright (c) 2006, Michael Wu * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH * Copyright (c) 2016 - 2017 Intel Deutschland GmbH - * Copyright (c) 2018 - 2022 Intel Corporation + * Copyright (c) 2018 - 2023 Intel Corporation */ #ifndef LINUX_IEEE80211_H @@ -3557,11 +3557,6 @@ enum ieee80211_unprotected_wnm_actioncode { WLAN_UNPROTECTED_WNM_ACTION_TIMING_MEASUREMENT_RESPONSE = 1, }; -/* Public action codes */ -enum ieee80211_public_actioncode { - WLAN_PUBLIC_ACTION_FTM_RESPONSE = 33, -}; - /* Security key length */ enum ieee80211_key_len { WLAN_KEY_LEN_WEP40 = 5, @@ -3653,7 +3648,7 @@ enum ieee80211_pub_actioncode { WLAN_PUB_ACTION_NETWORK_CHANNEL_CONTROL = 30, WLAN_PUB_ACTION_WHITE_SPACE_MAP_ANN = 31, WLAN_PUB_ACTION_FTM_REQUEST = 32, - WLAN_PUB_ACTION_FTM = 33, + WLAN_PUB_ACTION_FTM_RESPONSE = 33, WLAN_PUB_ACTION_FILS_DISCOVERY = 34, }; @@ -4383,7 +4378,7 @@ static inline bool ieee80211_is_ftm(struct sk_buff *skb) return false; if (mgmt->u.action.u.ftm.action_code == - WLAN_PUBLIC_ACTION_FTM_RESPONSE && + WLAN_PUB_ACTION_FTM_RESPONSE && skb->len >= offsetofend(typeof(*mgmt), u.action.u.ftm)) return true; -- cgit v1.2.3 From 2c9abe653bc5134eeab411c46dde008d8a1c37b0 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 31 Mar 2023 16:59:08 +0200 Subject: wifi: ieee80211: correctly mark FTM frames non-bufferable The checks of whether or not a frame is bufferable were not taking into account that some action frames aren't, such as FTM. Check this, which requires some changes to the function ieee80211_is_bufferable_mmpdu() since we need the whole skb for the checks now. Reviewed-by: Ilan Peer Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 52 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 38 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 0583b2b0ce1f..c4cf296e7eaf 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -782,20 +782,6 @@ static inline bool ieee80211_is_any_nullfunc(__le16 fc) return (ieee80211_is_nullfunc(fc) || ieee80211_is_qos_nullfunc(fc)); } -/** - * ieee80211_is_bufferable_mmpdu - check if frame is bufferable MMPDU - * @fc: frame control field in little-endian byteorder - */ -static inline bool ieee80211_is_bufferable_mmpdu(__le16 fc) -{ - /* IEEE 802.11-2012, definition of "bufferable management frame"; - * note that this ignores the IBSS special case. */ - return ieee80211_is_mgmt(fc) && - (ieee80211_is_action(fc) || - ieee80211_is_disassoc(fc) || - ieee80211_is_deauth(fc)); -} - /** * ieee80211_is_first_frag - check if IEEE80211_SCTL_FRAG is not set * @seq_ctrl: frame sequence control bytes in little-endian byteorder @@ -4132,6 +4118,44 @@ static inline u8 *ieee80211_get_DA(struct ieee80211_hdr *hdr) return hdr->addr1; } +/** + * ieee80211_is_bufferable_mmpdu - check if frame is bufferable MMPDU + * @skb: the skb to check, starting with the 802.11 header + */ +static inline bool ieee80211_is_bufferable_mmpdu(struct sk_buff *skb) +{ + struct ieee80211_mgmt *mgmt = (void *)skb->data; + __le16 fc = mgmt->frame_control; + + /* + * IEEE 802.11 REVme D2.0 definition of bufferable MMPDU; + * note that this ignores the IBSS special case. + */ + if (!ieee80211_is_mgmt(fc)) + return false; + + if (ieee80211_is_disassoc(fc) || ieee80211_is_deauth(fc)) + return true; + + if (!ieee80211_is_action(fc)) + return false; + + if (skb->len < offsetofend(typeof(*mgmt), u.action.u.ftm.action_code)) + return true; + + /* action frame - additionally check for non-bufferable FTM */ + + if (mgmt->u.action.category != WLAN_CATEGORY_PUBLIC && + mgmt->u.action.category != WLAN_CATEGORY_PROTECTED_DUAL_OF_ACTION) + return true; + + if (mgmt->u.action.u.ftm.action_code == WLAN_PUB_ACTION_FTM_REQUEST || + mgmt->u.action.u.ftm.action_code == WLAN_PUB_ACTION_FTM_RESPONSE) + return false; + + return true; +} + /** * _ieee80211_is_robust_mgmt_frame - check if frame is a robust management frame * @hdr: the frame (buffer must include at least the first octet of payload) -- cgit v1.2.3 From 1cf3bfc60f9836f44da951f58b6ae24680484b35 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Thu, 13 Apr 2023 01:06:32 +0200 Subject: bpf: Support 64-bit pointers to kfuncs test_ksyms_module fails to emit a kfunc call targeting a module on s390x, because the verifier stores the difference between kfunc address and __bpf_call_base in bpf_insn.imm, which is s32, and modules are roughly (1 << 42) bytes away from the kernel on s390x. Fix by keeping BTF id in bpf_insn.imm for BPF_PSEUDO_KFUNC_CALLs, and storing the absolute address in bpf_kfunc_desc. Introduce bpf_jit_supports_far_kfunc_call() in order to limit this new behavior to the s390x JIT. Otherwise other JITs need to be modified, which is not desired. Introduce bpf_get_kfunc_addr() instead of exposing both find_kfunc_desc() and struct bpf_kfunc_desc. In addition to sorting kfuncs by imm, also sort them by offset, in order to handle conflicting imms from different modules. Do this on all architectures in order to simplify code. Factor out resolving specialized kfuncs (XPD and dynptr) from fixup_kfunc_call(). This was required in the first place, because fixup_kfunc_call() uses find_kfunc_desc(), which returns a const pointer, so it's not possible to modify kfunc addr without stripping const, which is not nice. It also removes repetition of code like: if (bpf_jit_supports_far_kfunc_call()) desc->addr = func; else insn->imm = BPF_CALL_IMM(func); and separates kfunc_desc_tab fixups from kfunc_call fixups. Suggested-by: Jiri Olsa Signed-off-by: Ilya Leoshkevich Acked-by: Jiri Olsa Link: https://lore.kernel.org/r/20230412230632.885985-1-iii@linux.ibm.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 10 ++++++++++ include/linux/filter.h | 1 + 2 files changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2c6095bd7d69..88845aadc47d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2295,6 +2295,9 @@ bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog); const struct btf_func_model * bpf_jit_find_kfunc_model(const struct bpf_prog *prog, const struct bpf_insn *insn); +int bpf_get_kfunc_addr(const struct bpf_prog *prog, u32 func_id, + u16 btf_fd_idx, u8 **func_addr); + struct bpf_core_ctx { struct bpf_verifier_log *log; const struct btf *btf; @@ -2545,6 +2548,13 @@ bpf_jit_find_kfunc_model(const struct bpf_prog *prog, return NULL; } +static inline int +bpf_get_kfunc_addr(const struct bpf_prog *prog, u32 func_id, + u16 btf_fd_idx, u8 **func_addr) +{ + return -ENOTSUPP; +} + static inline bool unprivileged_ebpf_enabled(void) { return false; diff --git a/include/linux/filter.h b/include/linux/filter.h index 5364b0c52c1d..bbce89937fde 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -920,6 +920,7 @@ void bpf_jit_compile(struct bpf_prog *prog); bool bpf_jit_needs_zext(void); bool bpf_jit_supports_subprog_tailcalls(void); bool bpf_jit_supports_kfunc_call(void); +bool bpf_jit_supports_far_kfunc_call(void); bool bpf_helper_changes_pkt_data(void *func); static inline bool bpf_dump_raw_ok(const struct cred *cred) -- cgit v1.2.3 From d54151aa0f4b5c89561705a00d8a5ebb4230028c Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 11 Apr 2023 21:01:49 +0300 Subject: net: ethtool: create and export ethtool_dev_mm_supported() Create a wrapper over __ethtool_dev_mm_supported() which also calls ethnl_ops_begin() and ethnl_ops_complete(). It can be used by other code layers, such as tc, to make sure that preemptible TCs are supported (this is true if an underlying MAC Merge layer exists). Signed-off-by: Vladimir Oltean Reviewed-by: Ferenc Fejes Reviewed-by: Simon Horman Signed-off-by: Jakub Kicinski --- include/linux/ethtool_netlink.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ethtool_netlink.h b/include/linux/ethtool_netlink.h index 17003b385756..fae0dfb9a9c8 100644 --- a/include/linux/ethtool_netlink.h +++ b/include/linux/ethtool_netlink.h @@ -39,6 +39,7 @@ void ethtool_aggregate_pause_stats(struct net_device *dev, struct ethtool_pause_stats *pause_stats); void ethtool_aggregate_rmon_stats(struct net_device *dev, struct ethtool_rmon_stats *rmon_stats); +bool ethtool_dev_mm_supported(struct net_device *dev); #else static inline int ethnl_cable_test_alloc(struct phy_device *phydev, u8 cmd) @@ -112,5 +113,10 @@ ethtool_aggregate_rmon_stats(struct net_device *dev, { } +static inline bool ethtool_dev_mm_supported(struct net_device *dev) +{ + return false; +} + #endif /* IS_ENABLED(CONFIG_ETHTOOL_NETLINK) */ #endif /* _LINUX_ETHTOOL_NETLINK_H_ */ -- cgit v1.2.3 From cc4cffc3c142d57df48c07851862444e1d33bdaa Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Fri, 7 Apr 2023 22:37:52 +0200 Subject: wifi: brcmfmac: add Cypress 43439 SDIO ids Add SDIO ids for use with the muRata 1YN (Cypress CYW43439). The odd thing about this is that the previous 1YN populated on M.2 card for evaluation purposes had BRCM SDIO vendor ID, while the chip populated on real hardware has a Cypress one. The device ID also differs between the two devices. But they are both 43439 otherwise, so add the IDs for both. On-device 1YN (43439), the new one, chip label reads "1YN": ``` /sys/.../mmc_host/mmc2/mmc2:0001 # cat vendor device 0x04b4 0xbd3d ``` EA M.2 evaluation board 1YN (43439), the old one, chip label reads "1YN ES1.4": ``` /sys/.../mmc_host/mmc0/mmc0:0001/# cat vendor device 0x02d0 0xa9a6 ``` Reviewed-by: Hans de Goede Cc: stable@vger.kernel.org Signed-off-by: Marek Vasut Reviewed-by: Simon Horman Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20230407203752.128539-1-marex@denx.de --- include/linux/mmc/sdio_ids.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h index 66f503ed2448..c653accdc7fd 100644 --- a/include/linux/mmc/sdio_ids.h +++ b/include/linux/mmc/sdio_ids.h @@ -74,10 +74,13 @@ #define SDIO_DEVICE_ID_BROADCOM_43362 0xa962 #define SDIO_DEVICE_ID_BROADCOM_43364 0xa9a4 #define SDIO_DEVICE_ID_BROADCOM_43430 0xa9a6 -#define SDIO_DEVICE_ID_BROADCOM_CYPRESS_43439 0xa9af +#define SDIO_DEVICE_ID_BROADCOM_43439 0xa9af #define SDIO_DEVICE_ID_BROADCOM_43455 0xa9bf #define SDIO_DEVICE_ID_BROADCOM_CYPRESS_43752 0xaae8 +#define SDIO_VENDOR_ID_CYPRESS 0x04b4 +#define SDIO_DEVICE_ID_BROADCOM_CYPRESS_43439 0xbd3d + #define SDIO_VENDOR_ID_MARVELL 0x02df #define SDIO_DEVICE_ID_MARVELL_LIBERTAS 0x9103 #define SDIO_DEVICE_ID_MARVELL_8688_WLAN 0x9104 -- cgit v1.2.3 From 8c48eea3adf3119e0a3fc57bd31f6966f26ee784 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 12 Apr 2023 21:26:04 -0700 Subject: page_pool: allow caching from safely localized NAPI Recent patches to mlx5 mentioned a regression when moving from driver local page pool to only using the generic page pool code. Page pool has two recycling paths (1) direct one, which runs in safe NAPI context (basically consumer context, so producing can be lockless); and (2) via a ptr_ring, which takes a spin lock because the freeing can happen from any CPU; producer and consumer may run concurrently. Since the page pool code was added, Eric introduced a revised version of deferred skb freeing. TCP skbs are now usually returned to the CPU which allocated them, and freed in softirq context. This places the freeing (producing of pages back to the pool) enticingly close to the allocation (consumer). If we can prove that we're freeing in the same softirq context in which the consumer NAPI will run - lockless use of the cache is perfectly fine, no need for the lock. Let drivers link the page pool to a NAPI instance. If the NAPI instance is scheduled on the same CPU on which we're freeing - place the pages in the direct cache. With that and patched bnxt (XDP enabled to engage the page pool, sigh, bnxt really needs page pool work :() I see a 2.6% perf boost with a TCP stream test (app on a different physical core than softirq). The CPU use of relevant functions decreases as expected: page_pool_refill_alloc_cache 1.17% -> 0% _raw_spin_lock 2.41% -> 0.98% Only consider lockless path to be safe when NAPI is scheduled - in practice this should cover majority if not all of steady state workloads. It's usually the NAPI kicking in that causes the skb flush. The main case we'll miss out on is when application runs on the same CPU as NAPI. In that case we don't use the deferred skb free path. Reviewed-by: Tariq Toukan Acked-by: Jesper Dangaard Brouer Tested-by: Dragos Tatulea Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 3 +++ include/linux/skbuff.h | 20 +++++++++++++------- 2 files changed, 16 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 96d27d558b0c..203c0df2046c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -360,8 +360,11 @@ struct napi_struct { unsigned long gro_bitmask; int (*poll)(struct napi_struct *, int); #ifdef CONFIG_NETPOLL + /* CPU actively polling if netpoll is configured */ int poll_owner; #endif + /* CPU on which NAPI has been scheduled for processing */ + int list_owner; struct net_device *dev; struct gro_list gro_hash[GRO_HASH_BUCKETS]; struct sk_buff *skb; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 494a23a976b0..a823ec3aa326 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3386,6 +3386,18 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f) __skb_frag_ref(&skb_shinfo(skb)->frags[f]); } +static inline void +napi_frag_unref(skb_frag_t *frag, bool recycle, bool napi_safe) +{ + struct page *page = skb_frag_page(frag); + +#ifdef CONFIG_PAGE_POOL + if (recycle && page_pool_return_skb_page(page, napi_safe)) + return; +#endif + put_page(page); +} + /** * __skb_frag_unref - release a reference on a paged fragment. * @frag: the paged fragment @@ -3396,13 +3408,7 @@ static inline void skb_frag_ref(struct sk_buff *skb, int f) */ static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle) { - struct page *page = skb_frag_page(frag); - -#ifdef CONFIG_PAGE_POOL - if (recycle && page_pool_return_skb_page(page)) - return; -#endif - put_page(page); + napi_frag_unref(frag, recycle, false); } /** -- cgit v1.2.3 From cd2a8079014aced27da9b2e669784f31680f1351 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Sat, 15 Apr 2023 13:18:03 -0700 Subject: bpf: Remove btf_field_offs, use btf_record's fields instead The btf_field_offs struct contains (offset, size) for btf_record fields, sorted by offset. btf_field_offs is always used in conjunction with btf_record, which has btf_field 'fields' array with (offset, type), the latter of which btf_field_offs' size is derived from via btf_field_type_size. This patch adds a size field to struct btf_field and sorts btf_record's fields by offset, making it possible to get rid of btf_field_offs. Less data duplication and less code complexity results. Since btf_field_offs' lifetime closely followed the btf_record used to populate it, most complexity wins are from removal of initialization code like: if (btf_record_successfully_initialized) { foffs = btf_parse_field_offs(rec); if (IS_ERR_OR_NULL(foffs)) // free the btf_record and return err } Other changes in this patch are pretty mechanical: * foffs->field_off[i] -> rec->fields[i].offset * foffs->field_sz[i] -> rec->fields[i].size * Sort rec->fields in btf_parse_fields before returning * It's possible that this is necessary independently of other changes in this patch. btf_record_find in syscall.c expects btf_record's fields to be sorted by offset, yet there's no explicit sorting of them before this patch, record's fields are populated in the order they're read from BTF struct definition. BTF docs don't say anything about the sortedness of struct fields. * All functions taking struct btf_field_offs * input now instead take struct btf_record *. All callsites of these functions already have access to the correct btf_record. Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20230415201811.343116-2-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 44 +++++++++++++++++++------------------------- include/linux/btf.h | 2 -- 2 files changed, 19 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 88845aadc47d..7888ed497432 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -210,6 +210,7 @@ struct btf_field_graph_root { struct btf_field { u32 offset; + u32 size; enum btf_field_type type; union { struct btf_field_kptr kptr; @@ -225,12 +226,6 @@ struct btf_record { struct btf_field fields[]; }; -struct btf_field_offs { - u32 cnt; - u32 field_off[BTF_FIELDS_MAX]; - u8 field_sz[BTF_FIELDS_MAX]; -}; - struct bpf_map { /* The first two cachelines with read-mostly members of which some * are also accessed in fast-path (e.g. ops, max_entries). @@ -257,7 +252,6 @@ struct bpf_map { struct obj_cgroup *objcg; #endif char name[BPF_OBJ_NAME_LEN]; - struct btf_field_offs *field_offs; /* The 3rd and 4th cacheline with misc members to avoid false sharing * particularly with refcounting. */ @@ -360,14 +354,14 @@ static inline bool btf_record_has_field(const struct btf_record *rec, enum btf_f return rec->field_mask & type; } -static inline void bpf_obj_init(const struct btf_field_offs *foffs, void *obj) +static inline void bpf_obj_init(const struct btf_record *rec, void *obj) { int i; - if (!foffs) + if (IS_ERR_OR_NULL(rec)) return; - for (i = 0; i < foffs->cnt; i++) - memset(obj + foffs->field_off[i], 0, foffs->field_sz[i]); + for (i = 0; i < rec->cnt; i++) + memset(obj + rec->fields[i].offset, 0, rec->fields[i].size); } /* 'dst' must be a temporary buffer and should not point to memory that is being @@ -379,7 +373,7 @@ static inline void bpf_obj_init(const struct btf_field_offs *foffs, void *obj) */ static inline void check_and_init_map_value(struct bpf_map *map, void *dst) { - bpf_obj_init(map->field_offs, dst); + bpf_obj_init(map->record, dst); } /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and @@ -399,14 +393,14 @@ static inline void bpf_long_memcpy(void *dst, const void *src, u32 size) } /* copy everything but bpf_spin_lock, bpf_timer, and kptrs. There could be one of each. */ -static inline void bpf_obj_memcpy(struct btf_field_offs *foffs, +static inline void bpf_obj_memcpy(struct btf_record *rec, void *dst, void *src, u32 size, bool long_memcpy) { u32 curr_off = 0; int i; - if (likely(!foffs)) { + if (IS_ERR_OR_NULL(rec)) { if (long_memcpy) bpf_long_memcpy(dst, src, round_up(size, 8)); else @@ -414,49 +408,49 @@ static inline void bpf_obj_memcpy(struct btf_field_offs *foffs, return; } - for (i = 0; i < foffs->cnt; i++) { - u32 next_off = foffs->field_off[i]; + for (i = 0; i < rec->cnt; i++) { + u32 next_off = rec->fields[i].offset; u32 sz = next_off - curr_off; memcpy(dst + curr_off, src + curr_off, sz); - curr_off += foffs->field_sz[i] + sz; + curr_off += rec->fields[i].size + sz; } memcpy(dst + curr_off, src + curr_off, size - curr_off); } static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) { - bpf_obj_memcpy(map->field_offs, dst, src, map->value_size, false); + bpf_obj_memcpy(map->record, dst, src, map->value_size, false); } static inline void copy_map_value_long(struct bpf_map *map, void *dst, void *src) { - bpf_obj_memcpy(map->field_offs, dst, src, map->value_size, true); + bpf_obj_memcpy(map->record, dst, src, map->value_size, true); } -static inline void bpf_obj_memzero(struct btf_field_offs *foffs, void *dst, u32 size) +static inline void bpf_obj_memzero(struct btf_record *rec, void *dst, u32 size) { u32 curr_off = 0; int i; - if (likely(!foffs)) { + if (IS_ERR_OR_NULL(rec)) { memset(dst, 0, size); return; } - for (i = 0; i < foffs->cnt; i++) { - u32 next_off = foffs->field_off[i]; + for (i = 0; i < rec->cnt; i++) { + u32 next_off = rec->fields[i].offset; u32 sz = next_off - curr_off; memset(dst + curr_off, 0, sz); - curr_off += foffs->field_sz[i] + sz; + curr_off += rec->fields[i].size + sz; } memset(dst + curr_off, 0, size - curr_off); } static inline void zero_map_value(struct bpf_map *map, void *dst) { - bpf_obj_memzero(map->field_offs, dst, map->value_size); + bpf_obj_memzero(map->record, dst, map->value_size); } void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, diff --git a/include/linux/btf.h b/include/linux/btf.h index 495250162422..813227bff58a 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -113,7 +113,6 @@ struct btf_id_dtor_kfunc { struct btf_struct_meta { u32 btf_id; struct btf_record *record; - struct btf_field_offs *field_offs; }; struct btf_struct_metas { @@ -207,7 +206,6 @@ int btf_find_timer(const struct btf *btf, const struct btf_type *t); struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t, u32 field_mask, u32 value_size); int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec); -struct btf_field_offs *btf_parse_field_offs(struct btf_record *rec); bool btf_type_is_void(const struct btf_type *t); s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind); const struct btf_type *btf_type_skip_modifiers(const struct btf *btf, -- cgit v1.2.3 From d54730b50bae1f3119bd686d551d66f0fcc387ca Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Sat, 15 Apr 2023 13:18:04 -0700 Subject: bpf: Introduce opaque bpf_refcount struct and add btf_record plumbing A 'struct bpf_refcount' is added to the set of opaque uapi/bpf.h types meant for use in BPF programs. Similarly to other opaque types like bpf_spin_lock and bpf_rbtree_node, the verifier needs to know where in user-defined struct types a bpf_refcount can be located, so necessary btf_record plumbing is added to enable this. bpf_refcount is sized to hold a refcount_t. Similarly to bpf_spin_lock, the offset of a bpf_refcount is cached in btf_record as refcount_off in addition to being in the field array. Caching refcount_off makes sense for this field because further patches in the series will modify functions that take local kptrs (e.g. bpf_obj_drop) to change their behavior if the type they're operating on is refcounted. So enabling fast "is this type refcounted?" checks is desirable. No such verifier behavior changes are introduced in this patch, just logic to recognize 'struct bpf_refcount' in btf_record. Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20230415201811.343116-3-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7888ed497432..be44d765b7a4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -187,6 +187,7 @@ enum btf_field_type { BPF_RB_NODE = (1 << 7), BPF_GRAPH_NODE_OR_ROOT = BPF_LIST_NODE | BPF_LIST_HEAD | BPF_RB_NODE | BPF_RB_ROOT, + BPF_REFCOUNT = (1 << 8), }; typedef void (*btf_dtor_kfunc_t)(void *); @@ -223,6 +224,7 @@ struct btf_record { u32 field_mask; int spin_lock_off; int timer_off; + int refcount_off; struct btf_field fields[]; }; @@ -293,6 +295,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type) return "bpf_rb_root"; case BPF_RB_NODE: return "bpf_rb_node"; + case BPF_REFCOUNT: + return "bpf_refcount"; default: WARN_ON_ONCE(1); return "unknown"; @@ -317,6 +321,8 @@ static inline u32 btf_field_type_size(enum btf_field_type type) return sizeof(struct bpf_rb_root); case BPF_RB_NODE: return sizeof(struct bpf_rb_node); + case BPF_REFCOUNT: + return sizeof(struct bpf_refcount); default: WARN_ON_ONCE(1); return 0; @@ -341,6 +347,8 @@ static inline u32 btf_field_type_align(enum btf_field_type type) return __alignof__(struct bpf_rb_root); case BPF_RB_NODE: return __alignof__(struct bpf_rb_node); + case BPF_REFCOUNT: + return __alignof__(struct bpf_refcount); default: WARN_ON_ONCE(1); return 0; -- cgit v1.2.3 From 1512217c47f0e8ea076dd0e67262e5a668a78f01 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Sat, 15 Apr 2023 13:18:05 -0700 Subject: bpf: Support refcounted local kptrs in existing semantics A local kptr is considered 'refcounted' when it is of a type that has a bpf_refcount field. When such a kptr is created, its refcount should be initialized to 1; when destroyed, the object should be free'd only if a refcount decr results in 0 refcount. Existing logic always frees the underlying memory when destroying a local kptr, and 0-initializes all btf_record fields. This patch adds checks for "is local kptr refcounted?" and new logic for that case in the appropriate places. This patch focuses on changing existing semantics and thus conspicuously does _not_ provide a way for BPF programs in increment refcount. That follows later in the series. __bpf_obj_drop_impl is modified to do the right thing when it sees a refcounted type. Container types for graph nodes (list, tree, stashed in map) are migrated to use __bpf_obj_drop_impl as a destructor for their nodes instead of each having custom destruction code in their _free paths. Now that "drop" isn't a synonym for "free" when the type is refcounted it makes sense to centralize this logic. Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20230415201811.343116-4-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index be44d765b7a4..b065de2cfcea 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -370,6 +370,9 @@ static inline void bpf_obj_init(const struct btf_record *rec, void *obj) return; for (i = 0; i < rec->cnt; i++) memset(obj + rec->fields[i].offset, 0, rec->fields[i].size); + + if (rec->refcount_off >= 0) + refcount_set((refcount_t *)(obj + rec->refcount_off), 1); } /* 'dst' must be a temporary buffer and should not point to memory that is being -- cgit v1.2.3 From d2dcc67df910dd85253a701b6a5b747f955d28f5 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Sat, 15 Apr 2023 13:18:07 -0700 Subject: bpf: Migrate bpf_rbtree_add and bpf_list_push_{front,back} to possibly fail Consider this code snippet: struct node { long key; bpf_list_node l; bpf_rb_node r; bpf_refcount ref; } int some_bpf_prog(void *ctx) { struct node *n = bpf_obj_new(/*...*/), *m; bpf_spin_lock(&glock); bpf_rbtree_add(&some_tree, &n->r, /* ... */); m = bpf_refcount_acquire(n); bpf_rbtree_add(&other_tree, &m->r, /* ... */); bpf_spin_unlock(&glock); /* ... */ } After bpf_refcount_acquire, n and m point to the same underlying memory, and that node's bpf_rb_node field is being used by the some_tree insert, so overwriting it as a result of the second insert is an error. In order to properly support refcounted nodes, the rbtree and list insert functions must be allowed to fail. This patch adds such support. The kfuncs bpf_rbtree_add, bpf_list_push_{front,back} are modified to return an int indicating success/failure, with 0 -> success, nonzero -> failure. bpf_obj_drop on failure ======================= Currently the only reason an insert can fail is the example above: the bpf_{list,rb}_node is already in use. When such a failure occurs, the insert kfuncs will bpf_obj_drop the input node. This allows the insert operations to logically fail without changing their verifier owning ref behavior, namely the unconditional release_reference of the input owning ref. With insert that always succeeds, ownership of the node is always passed to the collection, since the node always ends up in the collection. With a possibly-failed insert w/ bpf_obj_drop, ownership of the node is always passed either to the collection (success), or to bpf_obj_drop (failure). Regardless, it's correct to continue unconditionally releasing the input owning ref, as something is always taking ownership from the calling program on insert. Keeping owning ref behavior unchanged results in a nice default UX for insert functions that can fail. If the program's reaction to a failed insert is "fine, just get rid of this owning ref for me and let me go on with my business", then there's no reason to check for failure since that's default behavior. e.g.: long important_failures = 0; int some_bpf_prog(void *ctx) { struct node *n, *m, *o; /* all bpf_obj_new'd */ bpf_spin_lock(&glock); bpf_rbtree_add(&some_tree, &n->node, /* ... */); bpf_rbtree_add(&some_tree, &m->node, /* ... */); if (bpf_rbtree_add(&some_tree, &o->node, /* ... */)) { important_failures++; } bpf_spin_unlock(&glock); } If we instead chose to pass ownership back to the program on failed insert - by returning NULL on success or an owning ref on failure - programs would always have to do something with the returned ref on failure. The most likely action is probably "I'll just get rid of this owning ref and go about my business", which ideally would look like: if (n = bpf_rbtree_add(&some_tree, &n->node, /* ... */)) bpf_obj_drop(n); But bpf_obj_drop isn't allowed in a critical section and inserts must occur within one, so in reality error handling would become a hard-to-parse mess. For refcounted nodes, we can replicate the "pass ownership back to program on failure" logic with this patch's semantics, albeit in an ugly way: struct node *n = bpf_obj_new(/* ... */), *m; bpf_spin_lock(&glock); m = bpf_refcount_acquire(n); if (bpf_rbtree_add(&some_tree, &n->node, /* ... */)) { /* Do something with m */ } bpf_spin_unlock(&glock); bpf_obj_drop(m); bpf_refcount_acquire is used to simulate "return owning ref on failure". This should be an uncommon occurrence, though. Addition of two verifier-fixup'd args to collection inserts =========================================================== The actual bpf_obj_drop kfunc is bpf_obj_drop_impl(void *, struct btf_struct_meta *), with bpf_obj_drop macro populating the second arg with 0 and the verifier later filling in the arg during insn fixup. Because bpf_rbtree_add and bpf_list_push_{front,back} now might do bpf_obj_drop, these kfuncs need a btf_struct_meta parameter that can be passed to bpf_obj_drop_impl. Similarly, because the 'node' param to those insert functions is the bpf_{list,rb}_node within the node type, and bpf_obj_drop expects a pointer to the beginning of the node, the insert functions need to be able to find the beginning of the node struct. A second verifier-populated param is necessary: the offset of {list,rb}_node within the node type. These two new params allow the insert kfuncs to correctly call __bpf_obj_drop_impl: beginning_of_node = bpf_rb_node_ptr - offset if (already_inserted) __bpf_obj_drop_impl(beginning_of_node, btf_struct_meta->record); Similarly to other kfuncs with "hidden" verifier-populated params, the insert functions are renamed with _impl prefix and a macro is provided for common usage. For example, bpf_rbtree_add kfunc is now bpf_rbtree_add_impl and bpf_rbtree_add is now a macro which sets "hidden" args to 0. Due to the two new args BPF progs will need to be recompiled to work with the new _impl kfuncs. This patch also rewrites the "hidden argument" explanation to more directly say why the BPF program writer doesn't need to populate the arguments with anything meaningful. How does this new logic affect non-owning references? ===================================================== Currently, non-owning refs are valid until the end of the critical section in which they're created. We can make this guarantee because, if a non-owning ref exists, the referent was added to some collection. The collection will drop() its nodes when it goes away, but it can't go away while our program is accessing it, so that's not a problem. If the referent is removed from the collection in the same CS that it was added in, it can't be bpf_obj_drop'd until after CS end. Those are the only two ways to free the referent's memory and neither can happen until after the non-owning ref's lifetime ends. On first glance, having these collection insert functions potentially bpf_obj_drop their input seems like it breaks the "can't be bpf_obj_drop'd until after CS end" line of reasoning. But we care about the memory not being _freed_ until end of CS end, and a previous patch in the series modified bpf_obj_drop such that it doesn't free refcounted nodes until refcount == 0. So the statement can be more accurately rewritten as "can't be free'd until after CS end". We can prove that this rewritten statement holds for any non-owning reference produced by collection insert functions: * If the input to the insert function is _not_ refcounted * We have an owning reference to the input, and can conclude it isn't in any collection * Inserting a node in a collection turns owning refs into non-owning, and since our input type isn't refcounted, there's no way to obtain additional owning refs to the same underlying memory * Because our node isn't in any collection, the insert operation cannot fail, so bpf_obj_drop will not execute * If bpf_obj_drop is guaranteed not to execute, there's no risk of memory being free'd * Otherwise, the input to the insert function is refcounted * If the insert operation fails due to the node's list_head or rb_root already being in some collection, there was some previous successful insert which passed refcount to the collection * We have an owning reference to the input, it must have been acquired via bpf_refcount_acquire, which bumped the refcount * refcount must be >= 2 since there's a valid owning reference and the node is already in a collection * Insert triggering bpf_obj_drop will decr refcount to >= 1, never resulting in a free So although we may do bpf_obj_drop during the critical section, this will never result in memory being free'd, and no changes to non-owning ref logic are needed in this patch. Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20230415201811.343116-6-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index f03852b89d28..3dd29a53b711 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -464,7 +464,12 @@ struct bpf_insn_aux_data { */ struct bpf_loop_inline_state loop_inline_state; }; - u64 obj_new_size; /* remember the size of type passed to bpf_obj_new to rewrite R1 */ + union { + /* remember the size of type passed to bpf_obj_new to rewrite R1 */ + u64 obj_new_size; + /* remember the offset of node field within type to rewrite */ + u64 insert_off; + }; struct btf_struct_meta *kptr_struct_meta; u64 map_key_state; /* constant (32 bit) key tracking for maps */ int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ -- cgit v1.2.3 From 3e81740a90626024a9d9c6f9bfa3d36204dafefb Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Sat, 15 Apr 2023 13:18:10 -0700 Subject: bpf: Centralize btf_field-specific initialization logic All btf_fields in an object are 0-initialized by memset in bpf_obj_init. This might not be a valid initial state for some field types, in which case kfuncs that use the type will properly initialize their input if it's been 0-initialized. Some BPF graph collection types and kfuncs do this: bpf_list_{head,node} and bpf_rb_node. An earlier patch in this series added the bpf_refcount field, for which the 0 state indicates that the refcounted object should be free'd. bpf_obj_init treats this field specially, setting refcount to 1 instead of relying on scattered "refcount is 0? Must have just been initialized, let's set to 1" logic in kfuncs. This patch extends this treatment to list and rbtree field types, allowing most scattered initialization logic in kfuncs to be removed. Note that bpf_{list_head,rb_root} may be inside a BPF map, in which case they'll be 0-initialized without passing through the newly-added logic, so scattered initialization logic must remain for these collection root types. Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20230415201811.343116-9-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b065de2cfcea..18b592fde896 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -355,6 +355,34 @@ static inline u32 btf_field_type_align(enum btf_field_type type) } } +static inline void bpf_obj_init_field(const struct btf_field *field, void *addr) +{ + memset(addr, 0, field->size); + + switch (field->type) { + case BPF_REFCOUNT: + refcount_set((refcount_t *)addr, 1); + break; + case BPF_RB_NODE: + RB_CLEAR_NODE((struct rb_node *)addr); + break; + case BPF_LIST_HEAD: + case BPF_LIST_NODE: + INIT_LIST_HEAD((struct list_head *)addr); + break; + case BPF_RB_ROOT: + /* RB_ROOT_CACHED 0-inits, no need to do anything after memset */ + case BPF_SPIN_LOCK: + case BPF_TIMER: + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: + break; + default: + WARN_ON_ONCE(1); + return; + } +} + static inline bool btf_record_has_field(const struct btf_record *rec, enum btf_field_type type) { if (IS_ERR_OR_NULL(rec)) @@ -369,10 +397,7 @@ static inline void bpf_obj_init(const struct btf_record *rec, void *obj) if (IS_ERR_OR_NULL(rec)) return; for (i = 0; i < rec->cnt; i++) - memset(obj + rec->fields[i].offset, 0, rec->fields[i].size); - - if (rec->refcount_off >= 0) - refcount_set((refcount_t *)(obj + rec->refcount_off), 1); + bpf_obj_init_field(&rec->fields[i], obj + rec->fields[i].offset); } /* 'dst' must be a temporary buffer and should not point to memory that is being -- cgit v1.2.3 From 7b4ddf3920d247c2949073b9c274301c8131332a Mon Sep 17 00:00:00 2001 From: David Vernet Date: Sun, 16 Apr 2023 03:49:27 -0500 Subject: bpf: Remove KF_KPTR_GET kfunc flag We've managed to improve the UX for kptrs significantly over the last 9 months. All of the existing use cases which previously had KF_KPTR_GET kfuncs (struct bpf_cpumask *, struct task_struct *, and struct cgroup *) have all been updated to be synchronized using RCU. In other words, their KF_KPTR_GET kfuncs have been removed in favor of KF_RCU | KF_ACQUIRE kfuncs, with the pointers themselves also being readable from maps in an RCU read region thanks to the types being RCU safe. While KF_KPTR_GET was a logical starting point for kptrs, it's become clear that they're not the correct abstraction. KF_KPTR_GET is a flag that essentially does nothing other than enforcing that the argument to a function is a pointer to a referenced kptr map value. At first glance, that's a useful thing to guarantee to a kfunc. It gives kfuncs the ability to try and acquire a reference on that kptr without requiring the BPF prog to do something like this: struct kptr_type *in_map, *new = NULL; in_map = bpf_kptr_xchg(&map->value, NULL); if (in_map) { new = bpf_kptr_type_acquire(in_map); in_map = bpf_kptr_xchg(&map->value, in_map); if (in_map) bpf_kptr_type_release(in_map); } That's clearly a pretty ugly (and racy) UX, and if using KF_KPTR_GET is the only alternative, it's better than nothing. However, the problem with any KF_KPTR_GET kfunc lies in the fact that it always requires some kind of synchronization in order to safely do an opportunistic acquire of the kptr in the map. This is because a BPF program running on another CPU could do a bpf_kptr_xchg() on that map value, and free the kptr after it's been read by the KF_KPTR_GET kfunc. For example, the now-removed bpf_task_kptr_get() kfunc did the following: struct task_struct *bpf_task_kptr_get(struct task_struct **pp) { struct task_struct *p; rcu_read_lock(); p = READ_ONCE(*pp); /* If p is non-NULL, it could still be freed by another CPU, * so we have to do an opportunistic refcount_inc_not_zero() * and return NULL if the task will be freed after the * current RCU read region. */ |f (p && !refcount_inc_not_zero(&p->rcu_users)) p = NULL; rcu_read_unlock(); return p; } In other words, the kfunc uses RCU to ensure that the task remains valid after it's been peeked from the map. However, this is completely redundant with just defining a KF_RCU kfunc that itself does a refcount_inc_not_zero(), which is exactly what bpf_task_acquire() now does. So, the question of whether KF_KPTR_GET is useful is actually, "Are there any synchronization mechanisms / safety flags that are required by certain kptrs, but which are not provided by the verifier to kfuncs?" The answer to that question today is "No", because every kptr we currently care about is RCU protected. Even if the answer ever became "yes", the proper way to support that referenced kptr type would be to add support for whatever synchronization mechanism it requires in the verifier, rather than giving kfuncs a flag that says, "Here's a pointer to a referenced kptr in a map, do whatever you need to do." With all that said -- so as to allow us to consolidate the kfunc API, and simplify the verifier a bit, this patch removes KF_KPTR_GET, and all relevant logic from the verifier. Signed-off-by: David Vernet Link: https://lore.kernel.org/r/20230416084928.326135-3-void@manifault.com Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index 813227bff58a..508199e38415 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -18,7 +18,6 @@ #define KF_ACQUIRE (1 << 0) /* kfunc is an acquire function */ #define KF_RELEASE (1 << 1) /* kfunc is a release function */ #define KF_RET_NULL (1 << 2) /* kfunc returns a pointer that may be NULL */ -#define KF_KPTR_GET (1 << 3) /* kfunc returns reference to a kptr */ /* Trusted arguments are those which are guaranteed to be valid when passed to * the kfunc. It is used to enforce that pointers obtained from either acquire * kfuncs, or from the main kernel on a tracepoint or struct_ops callback -- cgit v1.2.3 From 59e498a3289f685261c076b998a8a2f8a516874f Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 17 Apr 2023 15:49:15 +0200 Subject: bpf: Set skb redirect and from_ingress info in __bpf_tx_skb MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are some use-cases where it is desirable to use bpf_redirect() in combination with ifb device, which currently is not supported, for example, around filtering inbound traffic with BPF to then push it to ifb which holds the qdisc for shaping in contrast to doing that on the egress device. Toke mentions the following case related to OpenWrt: Because there's not always a single egress on the other side. These are mainly home routers, which tend to have one or more WiFi devices bridged to one or more ethernet ports on the LAN side, and a single upstream WAN port. And the objective is to control the total amount of traffic going over the WAN link (in both directions), to deal with bufferbloat in the ISP network (which is sadly still all too prevalent). In this setup, the traffic can be split arbitrarily between the links on the LAN side, and the only "single bottleneck" is the WAN link. So we install both egress and ingress shapers on this, configured to something like 95-98% of the true link bandwidth, thus moving the queues into the qdisc layer in the router. It's usually necessary to set the ingress bandwidth shaper a bit lower than the egress due to being "downstream" of the bottleneck link, but it does work surprisingly well. We usually use something like a matchall filter to put all ingress traffic on the ifb, so doing the redirect from BPF has not been an immediate requirement thus far. However, it does seem a bit odd that this is not possible, and we do have a BPF-based filter that layers on top of this kind of setup, which currently uses u32 as the ingress filter and so it could presumably be improved to use BPF instead if that was available. Reported-by: Toke Høiland-Jørgensen Reported-by: Yafang Shao Reported-by: Tonghao Zhang Signed-off-by: Daniel Borkmann Acked-by: Yafang Shao Acked-by: Toke Høiland-Jørgensen Link: https://git.openwrt.org/?p=project/qosify.git;a=blob;f=README Link: https://lore.kernel.org/bpf/875y9yzbuy.fsf@toke.dk Link: https://lore.kernel.org/r/8cebc8b2b6e967e10cbafe2ffd6795050e74accd.1681739137.git.daniel@iogearbox.net Signed-off-by: Alexei Starovoitov --- include/linux/skbuff.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 494a23a976b0..9ff2e3d57329 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -5041,6 +5041,15 @@ static inline void skb_reset_redirect(struct sk_buff *skb) skb->redirected = 0; } +static inline void skb_set_redirected_noclear(struct sk_buff *skb, + bool from_ingress) +{ + skb->redirected = 1; +#ifdef CONFIG_NET_REDIRECT + skb->from_ingress = from_ingress; +#endif +} + static inline bool skb_csum_is_sctp(struct sk_buff *skb) { return skb->csum_not_inet; -- cgit v1.2.3 From 1210af3b99561bbe140af8d9a6b16d1bd0ba3fda Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 13 Apr 2023 15:29:19 +0300 Subject: net/mlx5e: Add IPsec packet offload tunnel bits Extend packet reformat types and flow table capabilities with IPsec packet offload tunnel bits. Reviewed-by: Simon Horman Signed-off-by: Leon Romanovsky Reviewed-by: Sridhar Samudrala Signed-off-by: Jakub Kicinski --- include/linux/mlx5/mlx5_ifc.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 6c84bf6eec85..20d00e09b168 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -463,9 +463,11 @@ struct mlx5_ifc_flow_table_prop_layout_bits { u8 max_ft_level[0x8]; u8 reformat_add_esp_trasport[0x1]; - u8 reserved_at_41[0x2]; + u8 reformat_l2_to_l3_esp_tunnel[0x1]; + u8 reserved_at_42[0x1]; u8 reformat_del_esp_trasport[0x1]; - u8 reserved_at_44[0x2]; + u8 reformat_l3_esp_tunnel_to_l2[0x1]; + u8 reserved_at_45[0x1]; u8 execute_aso[0x1]; u8 reserved_at_47[0x19]; @@ -6630,7 +6632,9 @@ enum mlx5_reformat_ctx_type { MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2 = 0x3, MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL = 0x4, MLX5_REFORMAT_TYPE_ADD_ESP_TRANSPORT_OVER_IPV4 = 0x5, + MLX5_REFORMAT_TYPE_L2_TO_L3_ESP_TUNNEL = 0x6, MLX5_REFORMAT_TYPE_DEL_ESP_TRANSPORT = 0x8, + MLX5_REFORMAT_TYPE_L3_ESP_TUNNEL_TO_L2 = 0x9, MLX5_REFORMAT_TYPE_ADD_ESP_TRANSPORT_OVER_IPV6 = 0xb, MLX5_REFORMAT_TYPE_INSERT_HDR = 0xf, MLX5_REFORMAT_TYPE_REMOVE_HDR = 0x10, -- cgit v1.2.3 From 980f0799a15c75403f1f9284a32b6056b9660144 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 18 Apr 2023 11:48:41 +0800 Subject: bonding: add software tx timestamping support Currently, bonding only obtain the timestamp (ts) information of the active slave, which is available only for modes 1, 5, and 6. For other modes, bonding only has software rx timestamping support. However, some users who use modes such as LACP also want tx timestamp support. To address this issue, let's check the ts information of each slave. If all slaves support tx timestamping, we can enable tx timestamping support for the bond. Add a note that the get_ts_info may be called with RCU, or rtnl or reference on the device in ethtool.h> Suggested-by: Miroslav Lichvar Signed-off-by: Hangbin Liu Acked-by: Jay Vosburgh Link: https://lore.kernel.org/r/20230418034841.2566262-1-liuhangbin@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 798d35890118..62b61527bcc4 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -711,6 +711,7 @@ struct ethtool_mm_stats { * @get_dump_data: Get dump data. * @set_dump: Set dump specific flags to the device. * @get_ts_info: Get the time stamping and PTP hardware clock capabilities. + * It may be called with RCU, or rtnl or reference on the device. * Drivers supporting transmit time stamps in software should set this to * ethtool_op_get_ts_info(). * @get_module_info: Get the size and type of the eeprom contained within -- cgit v1.2.3 From e5029edd53937a29801ef507cee12e657ff31ea9 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Mon, 17 Apr 2023 17:17:26 +0200 Subject: leds: Provide stubs for when CLASS_LED & NEW_LEDS are disabled Provide stubs for devm_led_classdev_register_ext() and led_init_default_state_get() so that LED drivers embedded within other drivers such as PHYs and Ethernet switches still build when LEDS_CLASS or NEW_LEDS are disabled. This also helps with Kconfig dependencies, which are somewhat hairy for phylib and mdio and only get worse when adding a dependency on LED_CLASS. Signed-off-by: Andrew Lunn Signed-off-by: Christian Marangi Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/leds.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/leds.h b/include/linux/leds.h index d71201a968b6..aa48e643f655 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -82,7 +82,15 @@ struct led_init_data { bool devname_mandatory; }; +#if IS_ENABLED(CONFIG_NEW_LEDS) enum led_default_state led_init_default_state_get(struct fwnode_handle *fwnode); +#else +static inline enum led_default_state +led_init_default_state_get(struct fwnode_handle *fwnode) +{ + return LEDS_DEFSTATE_OFF; +} +#endif struct led_hw_trigger_type { int dummy; @@ -217,9 +225,19 @@ static inline int led_classdev_register(struct device *parent, return led_classdev_register_ext(parent, led_cdev, NULL); } +#if IS_ENABLED(CONFIG_LEDS_CLASS) int devm_led_classdev_register_ext(struct device *parent, struct led_classdev *led_cdev, struct led_init_data *init_data); +#else +static inline int +devm_led_classdev_register_ext(struct device *parent, + struct led_classdev *led_cdev, + struct led_init_data *init_data) +{ + return 0; +} +#endif static inline int devm_led_classdev_register(struct device *parent, struct led_classdev *led_cdev) -- cgit v1.2.3 From 01e5b728e9e43ae444e0369695a5f72209906464 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Mon, 17 Apr 2023 17:17:27 +0200 Subject: net: phy: Add a binding for PHY LEDs Define common binding parsing for all PHY drivers with LEDs using phylib. Parse the DT as part of the phy_probe and add LEDs to the linux LED class infrastructure. For the moment, provide a dummy brightness function, which will later be replaced with a call into the PHY driver. This allows testing since the LED core might otherwise reject an LED whose brightness cannot be set. Add a dependency on LED_CLASS. It either needs to be built in, or not enabled, since a modular build can result in linker errors. Signed-off-by: Andrew Lunn Signed-off-by: Christian Marangi Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/phy.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 2f83cfc206e5..bd6b5e9bb729 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -600,6 +601,7 @@ struct macsec_ops; * @phy_num_led_triggers: Number of triggers in @phy_led_triggers * @led_link_trigger: LED trigger for link up/down * @last_triggered: last LED trigger for link speed + * @leds: list of PHY LED structures * @master_slave_set: User requested master/slave configuration * @master_slave_get: Current master/slave advertisement * @master_slave_state: Current master/slave configuration @@ -699,6 +701,7 @@ struct phy_device { struct phy_led_trigger *led_link_trigger; #endif + struct list_head leds; /* * Interrupt number for this PHY @@ -834,6 +837,19 @@ struct phy_plca_status { bool pst; }; +/** + * struct phy_led: An LED driven by the PHY + * + * @list: List of LEDs + * @led_cdev: Standard LED class structure + * @index: Number of the LED + */ +struct phy_led { + struct list_head list; + struct led_classdev led_cdev; + u8 index; +}; + /** * struct phy_driver - Driver structure for a particular PHY type * -- cgit v1.2.3 From 684818189b04b095b34964ed4a3ea5249a840eab Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Mon, 17 Apr 2023 17:17:28 +0200 Subject: net: phy: phy_device: Call into the PHY driver to set LED brightness Linux LEDs can be software controlled via the brightness file in /sys. LED drivers need to implement a brightness_set function which the core will call. Implement an intermediary in phy_device, which will call into the phy driver if it implements the necessary function. Signed-off-by: Andrew Lunn Signed-off-by: Christian Marangi Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/phy.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index bd6b5e9bb729..f3c7e3c99f24 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -841,15 +841,19 @@ struct phy_plca_status { * struct phy_led: An LED driven by the PHY * * @list: List of LEDs + * @phydev: PHY this LED is attached to * @led_cdev: Standard LED class structure * @index: Number of the LED */ struct phy_led { struct list_head list; + struct phy_device *phydev; struct led_classdev led_cdev; u8 index; }; +#define to_phy_led(d) container_of(d, struct phy_led, led_cdev) + /** * struct phy_driver - Driver structure for a particular PHY type * @@ -1072,6 +1076,15 @@ struct phy_driver { /** @get_plca_status: Return the current PLCA status info */ int (*get_plca_status)(struct phy_device *dev, struct phy_plca_status *plca_st); + + /** + * @led_brightness_set: Set a PHY LED brightness. Index + * indicates which of the PHYs led should be set. Value + * follows the standard LED class meaning, e.g. LED_OFF, + * LED_HALF, LED_FULL. + */ + int (*led_brightness_set)(struct phy_device *dev, + u8 index, enum led_brightness value); }; #define to_phy_driver(d) container_of(to_mdio_common_driver(d), \ struct phy_driver, mdiodrv) -- cgit v1.2.3 From 4e901018432e38eab35d2a352661ce4727795be1 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Mon, 17 Apr 2023 17:17:30 +0200 Subject: net: phy: phy_device: Call into the PHY driver to set LED blinking Linux LEDs can be requested to perform hardware accelerated blinking. Pass this to the PHY driver, if it implements the op. Signed-off-by: Andrew Lunn Signed-off-by: Christian Marangi Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/phy.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index f3c7e3c99f24..c5a0dc829714 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1085,6 +1085,18 @@ struct phy_driver { */ int (*led_brightness_set)(struct phy_device *dev, u8 index, enum led_brightness value); + + /** + * @led_blink_set: Set a PHY LED brightness. Index indicates + * which of the PHYs led should be configured to blink. Delays + * are in milliseconds and if both are zero then a sensible + * default should be chosen. The call should adjust the + * timings in that case and if it can't match the values + * specified exactly. + */ + int (*led_blink_set)(struct phy_device *dev, u8 index, + unsigned long *delay_on, + unsigned long *delay_off); }; #define to_phy_driver(d) container_of(to_mdio_common_driver(d), \ struct phy_driver, mdiodrv) -- cgit v1.2.3 From eb6fba7555a812c07aa984fb9e8e9b151a65ca16 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 17 Apr 2023 08:53:46 -0700 Subject: net: skbuff: hide wifi_acked when CONFIG_WIRELESS not set Datacenter kernel builds will very likely not include WIRELESS, so let them shave 2 bits off the skb by hiding the wifi fields. Reviewed-by: Florian Fainelli Reviewed-by: Eric Dumazet Signed-off-by: Jakub Kicinski Acked-by: Johannes Berg Signed-off-by: David S. Miller --- include/linux/skbuff.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a823ec3aa326..513f03b23a73 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -953,8 +953,10 @@ struct sk_buff { __u8 l4_hash:1; __u8 sw_hash:1; +#ifdef CONFIG_WIRELESS __u8 wifi_acked_valid:1; __u8 wifi_acked:1; +#endif __u8 no_fcs:1; /* Indicates the inner headers are valid in the skbuff. */ __u8 encapsulation:1; @@ -1187,6 +1189,15 @@ static inline unsigned int skb_napi_id(const struct sk_buff *skb) #endif } +static inline bool skb_wifi_acked_valid(const struct sk_buff *skb) +{ +#ifdef CONFIG_WIRELESS + return skb->wifi_acked_valid; +#else + return 0; +#endif +} + /** * skb_unref - decrement the skb's reference count * @skb: buffer -- cgit v1.2.3 From c24831a13ba2e472f874483525084da2f2feb890 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 17 Apr 2023 08:53:47 -0700 Subject: net: skbuff: hide csum_not_inet when CONFIG_IP_SCTP not set SCTP is not universally deployed, allow hiding its bit from the skb. Reviewed-by: Florian Fainelli Reviewed-by: Eric Dumazet Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/skbuff.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 513f03b23a73..98d6b48f4dcf 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -983,7 +983,9 @@ struct sk_buff { __u8 decrypted:1; #endif __u8 slow_gro:1; +#if IS_ENABLED(CONFIG_IP_SCTP) __u8 csum_not_inet:1; +#endif #ifdef CONFIG_NET_SCHED __u16 tc_index; /* traffic control index */ @@ -5060,7 +5062,19 @@ static inline void skb_reset_redirect(struct sk_buff *skb) static inline bool skb_csum_is_sctp(struct sk_buff *skb) { +#if IS_ENABLED(CONFIG_IP_SCTP) return skb->csum_not_inet; +#else + return 0; +#endif +} + +static inline void skb_reset_csum_not_inet(struct sk_buff *skb) +{ + skb->ip_summed = CHECKSUM_NONE; +#if IS_ENABLED(CONFIG_IP_SCTP) + skb->csum_not_inet = 0; +#endif } static inline void skb_set_kcov_handle(struct sk_buff *skb, -- cgit v1.2.3 From 4398f3f6d1380d9f71a484e1e5d869ba0eaf23d5 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 17 Apr 2023 08:53:48 -0700 Subject: net: skbuff: move alloc_cpu into a potential hole alloc_cpu is currently between 4 byte fields, so it's almost guaranteed to create a 2B hole. It has a knock on effect of creating a 4B hole after @end (and @end and @tail being in different cachelines). None of this matters hugely, but for kernel configs which don't enable all the features there may well be a 2B hole after the bitfield. Move alloc_cpu there. Reviewed-by: Florian Fainelli Reviewed-by: Eric Dumazet Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/skbuff.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 98d6b48f4dcf..2595b2cfba0d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -991,6 +991,8 @@ struct sk_buff { __u16 tc_index; /* traffic control index */ #endif + u16 alloc_cpu; + union { __wsum csum; struct { @@ -1014,7 +1016,6 @@ struct sk_buff { unsigned int sender_cpu; }; #endif - u16 alloc_cpu; #ifdef CONFIG_NETWORK_SECMARK __u32 secmark; #endif -- cgit v1.2.3 From 4c60d04c2888a542f96fe77ecbdfc242b484f943 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 17 Apr 2023 08:53:49 -0700 Subject: net: skbuff: push nf_trace down the bitfield nf_trace is a debug feature, AFAIU, and yet it sits oddly high in the sk_buff bitfield. Move it down, pushing up dst_pending_confirm and inner_protocol_type. Next change will make nf_trace optional (under Kconfig) and all optional fields should be placed after 2b fields to avoid 2b fields straddling bytes. dst_pending_confirm is L3, so it makes sense next to ignore_df. inner_protocol_type goes up just to keep the balance. Acked-by: Florian Westphal Reviewed-by: Florian Fainelli Reviewed-by: Eric Dumazet Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/skbuff.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2595b2cfba0d..3ae9e8868afa 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -934,7 +934,7 @@ struct sk_buff { /* public: */ __u8 pkt_type:3; /* see PKT_TYPE_MAX */ __u8 ignore_df:1; - __u8 nf_trace:1; + __u8 dst_pending_confirm:1; __u8 ip_summed:2; __u8 ooo_okay:1; @@ -949,7 +949,7 @@ struct sk_buff { __u8 remcsum_offload:1; __u8 csum_complete_sw:1; __u8 csum_level:2; - __u8 dst_pending_confirm:1; + __u8 inner_protocol_type:1; __u8 l4_hash:1; __u8 sw_hash:1; @@ -967,7 +967,7 @@ struct sk_buff { #endif __u8 ipvs_property:1; - __u8 inner_protocol_type:1; + __u8 nf_trace:1; #ifdef CONFIG_NET_SWITCHDEV __u8 offload_fwd_mark:1; __u8 offload_l3_fwd_mark:1; -- cgit v1.2.3 From 48d80c394d3d1afcb49d26398917f5be27bf44cb Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 17 Apr 2023 08:53:50 -0700 Subject: net: skbuff: hide nf_trace and ipvs_property Accesses to nf_trace and ipvs_property are already wrapped by ifdefs where necessary. Don't allocate the bits for those fields at all if possible. Acked-by: Florian Westphal Acked-by: Simon Horman Reviewed-by: Eric Dumazet Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/skbuff.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 3ae9e8868afa..5f120bbab9da 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -966,8 +966,12 @@ struct sk_buff { __u8 ndisc_nodetype:2; #endif +#if IS_ENABLED(CONFIG_IP_VS) __u8 ipvs_property:1; +#endif +#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || IS_ENABLED(CONFIG_NF_TABLES) __u8 nf_trace:1; +#endif #ifdef CONFIG_NET_SWITCHDEV __u8 offload_fwd_mark:1; __u8 offload_l3_fwd_mark:1; -- cgit v1.2.3 From b0bc615df488abd0e95107e4a9ecefb9bf8c250a Mon Sep 17 00:00:00 2001 From: Maher Sanalla Date: Tue, 21 Mar 2023 00:10:16 +0200 Subject: net/mlx5: Add vnic devlink health reporter to PFs/VFs Create a vnic devlink health reporter for PFs/VFs interfaces. The reporter's diagnose callback displays the values of vNIC/vport transport debug counters of PFs/VFs, as follows: $ devlink health diagnose pci/0000:08:00.0 reporter vnic vNIC env counters: total_error_queues: 0 send_queue_priority_update_flow: 0 comp_eq_overrun: 0 async_eq_overrun: 0 cq_overrun: 0 invalid_command: 0 quota_exceeded_command: 0 nic_receive_steering_discard: 0 Moreover, add documentation on the reporter functionality and the counters description. While at it, expose the vNIC counters diagnose function to be used by the downstream patch, which will reveal the counters for representor interfaces. Signed-off-by: Maher Sanalla Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 135a3c8d8237..5d25c4c73046 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -439,6 +439,7 @@ struct mlx5_core_health { struct work_struct report_work; struct devlink_health_reporter *fw_reporter; struct devlink_health_reporter *fw_fatal_reporter; + struct devlink_health_reporter *vnic_reporter; struct delayed_work update_fw_log_ts_work; }; -- cgit v1.2.3 From f9c895a72a390656f9582e048fdcc3d2cec1dd7c Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Wed, 22 Mar 2023 10:21:46 +0200 Subject: net/mlx5: Update op_mode to op_mod for port selection To be consistent with the other enum keys use OP_MOD instead of OP_MODE. Signed-off-by: Roi Dayan Reviewed-by: Maor Dickman Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 20d00e09b168..b42696d74c9f 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -69,7 +69,7 @@ enum { MLX5_SET_HCA_CAP_OP_MOD_ATOMIC = 0x3, MLX5_SET_HCA_CAP_OP_MOD_ROCE = 0x4, MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE2 = 0x20, - MLX5_SET_HCA_CAP_OP_MODE_PORT_SELECTION = 0x25, + MLX5_SET_HCA_CAP_OP_MOD_PORT_SELECTION = 0x25, }; enum { -- cgit v1.2.3 From 8fa66e4a1bdd41d55d7842928e60a40fed65715d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 19 Apr 2023 19:00:05 -0700 Subject: net: skbuff: update and rename __kfree_skb_defer() __kfree_skb_defer() uses the old naming where "defer" meant slab bulk free/alloc APIs. In the meantime we also made __kfree_skb_defer() feed the per-NAPI skb cache, which implies bulk APIs. So take away the 'defer' and add 'napi'. While at it add a drop reason. This only matters on the tx_action path, if the skb has a frag_list. But getting rid of a SKB_DROP_REASON_NOT_SPECIFIED seems like a net benefit so why not. Reviewed-by: Alexander Lobakin Link: https://lore.kernel.org/r/20230420020005.815854-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index db5973559042..03aa7ed076f0 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3253,7 +3253,7 @@ static inline struct sk_buff *napi_alloc_skb(struct napi_struct *napi, void napi_consume_skb(struct sk_buff *skb, int budget); void napi_skb_free_stolen_head(struct sk_buff *skb); -void __kfree_skb_defer(struct sk_buff *skb); +void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason); /** * __dev_alloc_pages - allocate page for network Rx -- cgit v1.2.3 From 5b8285cca6fed9bc5baabe2e5699a5a5c0d96371 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 19 Apr 2023 14:52:52 +0200 Subject: net: move dropreason.h to dropreason-core.h This will, after the next patch, hold only the core drop reasons and minimal infrastructure. Fix a small kernel-doc issue while at it, to avoid the move triggering a checker. Signed-off-by: Johannes Berg Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 +- include/linux/skbuff.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 203c0df2046c..a6a3e9457d6c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -52,7 +52,7 @@ #include #include #include -#include +#include struct netpoll_info; struct device; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 03aa7ed076f0..eb9e7bb76fa6 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -37,7 +37,7 @@ #include #endif #include -#include +#include /** * DOC: skb checksums -- cgit v1.2.3 From add7370a398930077c6bc257ef5016b040d476eb Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 19 Apr 2023 11:16:28 -0400 Subject: sctp: delete the nested flexible array params This patch deletes the flexible-array params[] from the structure sctp_inithdr, sctp_addiphdr and sctp_reconf_chunk to avoid some sparse warnings: # make C=2 CF="-Wflexible-array-nested" M=./net/sctp/ net/sctp/input.c: note: in included file (through include/net/sctp/structs.h, include/net/sctp/sctp.h): ./include/linux/sctp.h:278:29: warning: nested flexible array ./include/linux/sctp.h:675:30: warning: nested flexible array This warning is reported if a structure having a flexible array member is included by other structures. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/linux/sctp.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index 358dc08e0831..0ff36a2737a3 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -270,7 +270,7 @@ struct sctp_inithdr { __be16 num_outbound_streams; __be16 num_inbound_streams; __be32 initial_tsn; - __u8 params[]; + /* __u8 params[]; */ }; struct sctp_init_chunk { @@ -667,7 +667,7 @@ struct sctp_addip_param { struct sctp_addiphdr { __be32 serial; - __u8 params[]; + /* __u8 params[]; */ }; struct sctp_addip_chunk { @@ -742,7 +742,7 @@ struct sctp_infox { struct sctp_reconf_chunk { struct sctp_chunkhdr chunk_hdr; - __u8 params[]; + /* __u8 params[]; */ }; struct sctp_strreset_outreq { -- cgit v1.2.3 From 73175a042955e531ec355a8708585befa67a22db Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 19 Apr 2023 11:16:29 -0400 Subject: sctp: delete the nested flexible array skip This patch deletes the flexible-array skip[] from the structure sctp_ifwdtsn/fwdtsn_hdr to avoid some sparse warnings: # make C=2 CF="-Wflexible-array-nested" M=./net/sctp/ net/sctp/stream_interleave.c: note: in included file (through include/net/sctp/structs.h, include/net/sctp/sctp.h): ./include/linux/sctp.h:611:32: warning: nested flexible array ./include/linux/sctp.h:628:33: warning: nested flexible array Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/linux/sctp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index 0ff36a2737a3..9815b801fec0 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -603,7 +603,7 @@ struct sctp_fwdtsn_skip { struct sctp_fwdtsn_hdr { __be32 new_cum_tsn; - struct sctp_fwdtsn_skip skip[]; + /* struct sctp_fwdtsn_skip skip[]; */ }; struct sctp_fwdtsn_chunk { @@ -620,7 +620,7 @@ struct sctp_ifwdtsn_skip { struct sctp_ifwdtsn_hdr { __be32 new_cum_tsn; - struct sctp_ifwdtsn_skip skip[]; + /* struct sctp_ifwdtsn_skip skip[]; */ }; struct sctp_ifwdtsn_chunk { -- cgit v1.2.3 From 9789c1c6619e0a5eccfc31abe49b1ce5ca3cd11f Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 19 Apr 2023 11:16:30 -0400 Subject: sctp: delete the nested flexible array variable This patch deletes the flexible-array variable[] from the structure sctp_sackhdr and sctp_errhdr to avoid some sparse warnings: # make C=2 CF="-Wflexible-array-nested" M=./net/sctp/ net/sctp/sm_statefuns.c: note: in included file (through include/net/sctp/structs.h, include/net/sctp/sctp.h): ./include/linux/sctp.h:451:28: warning: nested flexible array ./include/linux/sctp.h:393:29: warning: nested flexible array Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/linux/sctp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index 9815b801fec0..01a0eb7e9fa1 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -385,7 +385,7 @@ struct sctp_sackhdr { __be32 a_rwnd; __be16 num_gap_ack_blocks; __be16 num_dup_tsns; - union sctp_sack_variable variable[]; + /* union sctp_sack_variable variable[]; */ }; struct sctp_sack_chunk { @@ -443,7 +443,7 @@ struct sctp_shutdown_chunk { struct sctp_errhdr { __be16 cause; __be16 length; - __u8 variable[]; + /* __u8 variable[]; */ }; struct sctp_operr_chunk { -- cgit v1.2.3 From 2ab399a931dddacdf7202cd4b49a5187154623d1 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 19 Apr 2023 11:16:32 -0400 Subject: sctp: delete the nested flexible array hmac This patch deletes the flexible-array hmac[] from the structure sctp_authhdr to avoid some sparse warnings: # make C=2 CF="-Wflexible-array-nested" M=./net/sctp/ net/sctp/auth.c: note: in included file (through include/net/sctp/structs.h, include/net/sctp/sctp.h): ./include/linux/sctp.h:735:29: warning: nested flexible array Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/linux/sctp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index 01a0eb7e9fa1..d182e8c41985 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -727,7 +727,7 @@ struct sctp_addip_chunk { struct sctp_authhdr { __be16 shkey_id; __be16 hmac_id; - __u8 hmac[]; + /* __u8 hmac[]; */ }; struct sctp_auth_chunk { -- cgit v1.2.3 From dbda0fba7a14f14835c34d59fd329cb90a887862 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 19 Apr 2023 11:16:33 -0400 Subject: sctp: delete the nested flexible array payload This patch deletes the flexible-array payload[] from the structure sctp_datahdr to avoid some sparse warnings: # make C=2 CF="-Wflexible-array-nested" M=./net/sctp/ net/sctp/socket.c: note: in included file (through include/net/sctp/structs.h, include/net/sctp/sctp.h): ./include/linux/sctp.h:230:29: warning: nested flexible array This member is not even used anywhere. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/linux/sctp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index d182e8c41985..836a7e200f39 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -222,7 +222,7 @@ struct sctp_datahdr { __be16 stream; __be16 ssn; __u32 ppid; - __u8 payload[]; + /* __u8 payload[]; */ }; struct sctp_data_chunk { -- cgit v1.2.3 From a714e3ec230892039b5d5ae6902b58bb084a15c1 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 19 Apr 2023 18:34:54 +0300 Subject: bridge: Add internal flags for per-{Port, VLAN} neighbor suppression Add two internal flags that will be used to enable / disable per-{Port, VLAN} neighbor suppression: 1. 'BR_NEIGH_VLAN_SUPPRESS': A per-port flag used to indicate that per-{Port, VLAN} neighbor suppression is enabled on the bridge port. When set, 'BR_NEIGH_SUPPRESS' has no effect. 2. 'BR_VLFLAG_NEIGH_SUPPRESS_ENABLED': A per-VLAN flag used to indicate that neighbor suppression is enabled on the given VLAN. Signed-off-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/if_bridge.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 1668ac4d7adc..3ff96ae31bf6 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -60,6 +60,7 @@ struct br_ip_list { #define BR_TX_FWD_OFFLOAD BIT(20) #define BR_PORT_LOCKED BIT(21) #define BR_PORT_MAB BIT(22) +#define BR_NEIGH_VLAN_SUPPRESS BIT(23) #define BR_DEFAULT_AGEING_TIME (300 * HZ) -- cgit v1.2.3 From 55435ea7729accb5b8a330de751836c4be524834 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Wed, 19 Apr 2023 10:04:14 -0700 Subject: pds_core: initial framework for pds_core PF driver This is the initial PCI driver framework for the new pds_core device driver and its family of devices. This does the very basics of registering for the new PF PCI device 1dd8:100c, setting up debugfs entries, and registering with devlink. Signed-off-by: Shannon Nelson Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/pds/pds_common.h | 14 + include/linux/pds/pds_core_if.h | 571 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 585 insertions(+) create mode 100644 include/linux/pds/pds_common.h create mode 100644 include/linux/pds/pds_core_if.h (limited to 'include/linux') diff --git a/include/linux/pds/pds_common.h b/include/linux/pds/pds_common.h new file mode 100644 index 000000000000..bd041a5170a6 --- /dev/null +++ b/include/linux/pds/pds_common.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) OR BSD-2-Clause */ +/* Copyright(c) 2023 Advanced Micro Devices, Inc. */ + +#ifndef _PDS_COMMON_H_ +#define _PDS_COMMON_H_ + +#define PDS_CORE_DRV_NAME "pds_core" + +/* the device's internal addressing uses up to 52 bits */ +#define PDS_CORE_ADDR_LEN 52 +#define PDS_CORE_ADDR_MASK (BIT_ULL(PDS_ADDR_LEN) - 1) +#define PDS_PAGE_SIZE 4096 + +#endif /* _PDS_COMMON_H_ */ diff --git a/include/linux/pds/pds_core_if.h b/include/linux/pds/pds_core_if.h new file mode 100644 index 000000000000..e838a2b90440 --- /dev/null +++ b/include/linux/pds/pds_core_if.h @@ -0,0 +1,571 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) OR BSD-2-Clause */ +/* Copyright(c) 2023 Advanced Micro Devices, Inc. */ + +#ifndef _PDS_CORE_IF_H_ +#define _PDS_CORE_IF_H_ + +#define PCI_VENDOR_ID_PENSANDO 0x1dd8 +#define PCI_DEVICE_ID_PENSANDO_CORE_PF 0x100c +#define PCI_DEVICE_ID_VIRTIO_NET_TRANS 0x1000 +#define PCI_DEVICE_ID_PENSANDO_IONIC_ETH_VF 0x1003 +#define PCI_DEVICE_ID_PENSANDO_VDPA_VF 0x100b +#define PDS_CORE_BARS_MAX 4 +#define PDS_CORE_PCI_BAR_DBELL 1 + +/* Bar0 */ +#define PDS_CORE_DEV_INFO_SIGNATURE 0x44455649 /* 'DEVI' */ +#define PDS_CORE_BAR0_SIZE 0x8000 +#define PDS_CORE_BAR0_DEV_INFO_REGS_OFFSET 0x0000 +#define PDS_CORE_BAR0_DEV_CMD_REGS_OFFSET 0x0800 +#define PDS_CORE_BAR0_DEV_CMD_DATA_REGS_OFFSET 0x0c00 +#define PDS_CORE_BAR0_INTR_STATUS_OFFSET 0x1000 +#define PDS_CORE_BAR0_INTR_CTRL_OFFSET 0x2000 +#define PDS_CORE_DEV_CMD_DONE 0x00000001 + +#define PDS_CORE_DEVCMD_TIMEOUT 5 + +#define PDS_CORE_CLIENT_ID 0 +#define PDS_CORE_ASIC_TYPE_CAPRI 0 + +/* + * enum pds_core_cmd_opcode - Device commands + */ +enum pds_core_cmd_opcode { + /* Core init */ + PDS_CORE_CMD_NOP = 0, + PDS_CORE_CMD_IDENTIFY = 1, + PDS_CORE_CMD_RESET = 2, + PDS_CORE_CMD_INIT = 3, + + PDS_CORE_CMD_FW_DOWNLOAD = 4, + PDS_CORE_CMD_FW_CONTROL = 5, + + /* SR/IOV commands */ + PDS_CORE_CMD_VF_GETATTR = 60, + PDS_CORE_CMD_VF_SETATTR = 61, + PDS_CORE_CMD_VF_CTRL = 62, + + /* Add commands before this line */ + PDS_CORE_CMD_MAX, + PDS_CORE_CMD_COUNT +}; + +/* + * enum pds_core_status_code - Device command return codes + */ +enum pds_core_status_code { + PDS_RC_SUCCESS = 0, /* Success */ + PDS_RC_EVERSION = 1, /* Incorrect version for request */ + PDS_RC_EOPCODE = 2, /* Invalid cmd opcode */ + PDS_RC_EIO = 3, /* I/O error */ + PDS_RC_EPERM = 4, /* Permission denied */ + PDS_RC_EQID = 5, /* Bad qid */ + PDS_RC_EQTYPE = 6, /* Bad qtype */ + PDS_RC_ENOENT = 7, /* No such element */ + PDS_RC_EINTR = 8, /* operation interrupted */ + PDS_RC_EAGAIN = 9, /* Try again */ + PDS_RC_ENOMEM = 10, /* Out of memory */ + PDS_RC_EFAULT = 11, /* Bad address */ + PDS_RC_EBUSY = 12, /* Device or resource busy */ + PDS_RC_EEXIST = 13, /* object already exists */ + PDS_RC_EINVAL = 14, /* Invalid argument */ + PDS_RC_ENOSPC = 15, /* No space left or alloc failure */ + PDS_RC_ERANGE = 16, /* Parameter out of range */ + PDS_RC_BAD_ADDR = 17, /* Descriptor contains a bad ptr */ + PDS_RC_DEV_CMD = 18, /* Device cmd attempted on AdminQ */ + PDS_RC_ENOSUPP = 19, /* Operation not supported */ + PDS_RC_ERROR = 29, /* Generic error */ + PDS_RC_ERDMA = 30, /* Generic RDMA error */ + PDS_RC_EVFID = 31, /* VF ID does not exist */ + PDS_RC_BAD_FW = 32, /* FW file is invalid or corrupted */ + PDS_RC_ECLIENT = 33, /* No such client id */ +}; + +/** + * struct pds_core_drv_identity - Driver identity information + * @drv_type: Driver type (enum pds_core_driver_type) + * @os_dist: OS distribution, numeric format + * @os_dist_str: OS distribution, string format + * @kernel_ver: Kernel version, numeric format + * @kernel_ver_str: Kernel version, string format + * @driver_ver_str: Driver version, string format + */ +struct pds_core_drv_identity { + __le32 drv_type; + __le32 os_dist; + char os_dist_str[128]; + __le32 kernel_ver; + char kernel_ver_str[32]; + char driver_ver_str[32]; +}; + +#define PDS_DEV_TYPE_MAX 16 +/** + * struct pds_core_dev_identity - Device identity information + * @version: Version of device identify + * @type: Identify type (0 for now) + * @state: Device state + * @rsvd: Word boundary padding + * @nlifs: Number of LIFs provisioned + * @nintrs: Number of interrupts provisioned + * @ndbpgs_per_lif: Number of doorbell pages per LIF + * @intr_coal_mult: Interrupt coalescing multiplication factor + * Scale user-supplied interrupt coalescing + * value in usecs to device units using: + * device units = usecs * mult / div + * @intr_coal_div: Interrupt coalescing division factor + * Scale user-supplied interrupt coalescing + * value in usecs to device units using: + * device units = usecs * mult / div + * @vif_types: How many of each VIF device type is supported + */ +struct pds_core_dev_identity { + u8 version; + u8 type; + u8 state; + u8 rsvd; + __le32 nlifs; + __le32 nintrs; + __le32 ndbpgs_per_lif; + __le32 intr_coal_mult; + __le32 intr_coal_div; + __le16 vif_types[PDS_DEV_TYPE_MAX]; +}; + +#define PDS_CORE_IDENTITY_VERSION_1 1 + +/** + * struct pds_core_dev_identify_cmd - Driver/device identify command + * @opcode: Opcode PDS_CORE_CMD_IDENTIFY + * @ver: Highest version of identify supported by driver + * + * Expects to find driver identification info (struct pds_core_drv_identity) + * in cmd_regs->data. Driver should keep the devcmd interface locked + * while preparing the driver info. + */ +struct pds_core_dev_identify_cmd { + u8 opcode; + u8 ver; +}; + +/** + * struct pds_core_dev_identify_comp - Device identify command completion + * @status: Status of the command (enum pds_core_status_code) + * @ver: Version of identify returned by device + * + * Device identification info (struct pds_core_dev_identity) can be found + * in cmd_regs->data. Driver should keep the devcmd interface locked + * while reading the results. + */ +struct pds_core_dev_identify_comp { + u8 status; + u8 ver; +}; + +/** + * struct pds_core_dev_reset_cmd - Device reset command + * @opcode: Opcode PDS_CORE_CMD_RESET + * + * Resets and clears all LIFs, VDevs, and VIFs on the device. + */ +struct pds_core_dev_reset_cmd { + u8 opcode; +}; + +/** + * struct pds_core_dev_reset_comp - Reset command completion + * @status: Status of the command (enum pds_core_status_code) + */ +struct pds_core_dev_reset_comp { + u8 status; +}; + +/* + * struct pds_core_dev_init_data - Pointers and info needed for the Core + * initialization PDS_CORE_CMD_INIT command. The in and out structs are + * overlays on the pds_core_dev_cmd_regs.data space for passing data down + * to the firmware on init, and then returning initialization results. + */ +struct pds_core_dev_init_data_in { + __le64 adminq_q_base; + __le64 adminq_cq_base; + __le64 notifyq_cq_base; + __le32 flags; + __le16 intr_index; + u8 adminq_ring_size; + u8 notifyq_ring_size; +}; + +struct pds_core_dev_init_data_out { + __le32 core_hw_index; + __le32 adminq_hw_index; + __le32 notifyq_hw_index; + u8 adminq_hw_type; + u8 notifyq_hw_type; +}; + +/** + * struct pds_core_dev_init_cmd - Core device initialize + * @opcode: opcode PDS_CORE_CMD_INIT + * + * Initializes the core device and sets up the AdminQ and NotifyQ. + * Expects to find initialization data (struct pds_core_dev_init_data_in) + * in cmd_regs->data. Driver should keep the devcmd interface locked + * while preparing the driver info. + */ +struct pds_core_dev_init_cmd { + u8 opcode; +}; + +/** + * struct pds_core_dev_init_comp - Core init completion + * @status: Status of the command (enum pds_core_status_code) + * + * Initialization result data (struct pds_core_dev_init_data_in) + * is found in cmd_regs->data. + */ +struct pds_core_dev_init_comp { + u8 status; +}; + +/** + * struct pds_core_fw_download_cmd - Firmware download command + * @opcode: opcode + * @rsvd: Word boundary padding + * @addr: DMA address of the firmware buffer + * @offset: offset of the firmware buffer within the full image + * @length: number of valid bytes in the firmware buffer + */ +struct pds_core_fw_download_cmd { + u8 opcode; + u8 rsvd[3]; + __le32 offset; + __le64 addr; + __le32 length; +}; + +/** + * struct pds_core_fw_download_comp - Firmware download completion + * @status: Status of the command (enum pds_core_status_code) + */ +struct pds_core_fw_download_comp { + u8 status; +}; + +/** + * enum pds_core_fw_control_oper - FW control operations + * @PDS_CORE_FW_INSTALL_ASYNC: Install firmware asynchronously + * @PDS_CORE_FW_INSTALL_STATUS: Firmware installation status + * @PDS_CORE_FW_ACTIVATE_ASYNC: Activate firmware asynchronously + * @PDS_CORE_FW_ACTIVATE_STATUS: Firmware activate status + * @PDS_CORE_FW_UPDATE_CLEANUP: Cleanup any firmware update leftovers + * @PDS_CORE_FW_GET_BOOT: Return current active firmware slot + * @PDS_CORE_FW_SET_BOOT: Set active firmware slot for next boot + * @PDS_CORE_FW_GET_LIST: Return list of installed firmware images + */ +enum pds_core_fw_control_oper { + PDS_CORE_FW_INSTALL_ASYNC = 0, + PDS_CORE_FW_INSTALL_STATUS = 1, + PDS_CORE_FW_ACTIVATE_ASYNC = 2, + PDS_CORE_FW_ACTIVATE_STATUS = 3, + PDS_CORE_FW_UPDATE_CLEANUP = 4, + PDS_CORE_FW_GET_BOOT = 5, + PDS_CORE_FW_SET_BOOT = 6, + PDS_CORE_FW_GET_LIST = 7, +}; + +enum pds_core_fw_slot { + PDS_CORE_FW_SLOT_INVALID = 0, + PDS_CORE_FW_SLOT_A = 1, + PDS_CORE_FW_SLOT_B = 2, + PDS_CORE_FW_SLOT_GOLD = 3, +}; + +/** + * struct pds_core_fw_control_cmd - Firmware control command + * @opcode: opcode + * @rsvd: Word boundary padding + * @oper: firmware control operation (enum pds_core_fw_control_oper) + * @slot: slot to operate on (enum pds_core_fw_slot) + */ +struct pds_core_fw_control_cmd { + u8 opcode; + u8 rsvd[3]; + u8 oper; + u8 slot; +}; + +/** + * struct pds_core_fw_control_comp - Firmware control copletion + * @status: Status of the command (enum pds_core_status_code) + * @rsvd: Word alignment space + * @slot: Slot number (enum pds_core_fw_slot) + * @rsvd1: Struct padding + * @color: Color bit + */ +struct pds_core_fw_control_comp { + u8 status; + u8 rsvd[3]; + u8 slot; + u8 rsvd1[10]; + u8 color; +}; + +struct pds_core_fw_name_info { +#define PDS_CORE_FWSLOT_BUFLEN 8 +#define PDS_CORE_FWVERS_BUFLEN 32 + char slotname[PDS_CORE_FWSLOT_BUFLEN]; + char fw_version[PDS_CORE_FWVERS_BUFLEN]; +}; + +struct pds_core_fw_list_info { +#define PDS_CORE_FWVERS_LIST_LEN 16 + u8 num_fw_slots; + struct pds_core_fw_name_info fw_names[PDS_CORE_FWVERS_LIST_LEN]; +} __packed; + +enum pds_core_vf_attr { + PDS_CORE_VF_ATTR_SPOOFCHK = 1, + PDS_CORE_VF_ATTR_TRUST = 2, + PDS_CORE_VF_ATTR_MAC = 3, + PDS_CORE_VF_ATTR_LINKSTATE = 4, + PDS_CORE_VF_ATTR_VLAN = 5, + PDS_CORE_VF_ATTR_RATE = 6, + PDS_CORE_VF_ATTR_STATSADDR = 7, +}; + +/** + * enum pds_core_vf_link_status - Virtual Function link status + * @PDS_CORE_VF_LINK_STATUS_AUTO: Use link state of the uplink + * @PDS_CORE_VF_LINK_STATUS_UP: Link always up + * @PDS_CORE_VF_LINK_STATUS_DOWN: Link always down + */ +enum pds_core_vf_link_status { + PDS_CORE_VF_LINK_STATUS_AUTO = 0, + PDS_CORE_VF_LINK_STATUS_UP = 1, + PDS_CORE_VF_LINK_STATUS_DOWN = 2, +}; + +/** + * struct pds_core_vf_setattr_cmd - Set VF attributes on the NIC + * @opcode: Opcode + * @attr: Attribute type (enum pds_core_vf_attr) + * @vf_index: VF index + * @macaddr: mac address + * @vlanid: vlan ID + * @maxrate: max Tx rate in Mbps + * @spoofchk: enable address spoof checking + * @trust: enable VF trust + * @linkstate: set link up or down + * @stats: stats addr struct + * @stats.pa: set DMA address for VF stats + * @stats.len: length of VF stats space + * @pad: force union to specific size + */ +struct pds_core_vf_setattr_cmd { + u8 opcode; + u8 attr; + __le16 vf_index; + union { + u8 macaddr[6]; + __le16 vlanid; + __le32 maxrate; + u8 spoofchk; + u8 trust; + u8 linkstate; + struct { + __le64 pa; + __le32 len; + } stats; + u8 pad[60]; + } __packed; +}; + +struct pds_core_vf_setattr_comp { + u8 status; + u8 attr; + __le16 vf_index; + __le16 comp_index; + u8 rsvd[9]; + u8 color; +}; + +/** + * struct pds_core_vf_getattr_cmd - Get VF attributes from the NIC + * @opcode: Opcode + * @attr: Attribute type (enum pds_core_vf_attr) + * @vf_index: VF index + */ +struct pds_core_vf_getattr_cmd { + u8 opcode; + u8 attr; + __le16 vf_index; +}; + +struct pds_core_vf_getattr_comp { + u8 status; + u8 attr; + __le16 vf_index; + union { + u8 macaddr[6]; + __le16 vlanid; + __le32 maxrate; + u8 spoofchk; + u8 trust; + u8 linkstate; + __le64 stats_pa; + u8 pad[11]; + } __packed; + u8 color; +}; + +enum pds_core_vf_ctrl_opcode { + PDS_CORE_VF_CTRL_START_ALL = 0, + PDS_CORE_VF_CTRL_START = 1, +}; + +/** + * struct pds_core_vf_ctrl_cmd - VF control command + * @opcode: Opcode for the command + * @ctrl_opcode: VF control operation type + * @vf_index: VF Index. It is unused if op START_ALL is used. + */ + +struct pds_core_vf_ctrl_cmd { + u8 opcode; + u8 ctrl_opcode; + __le16 vf_index; +}; + +/** + * struct pds_core_vf_ctrl_comp - VF_CTRL command completion. + * @status: Status of the command (enum pds_core_status_code) + */ +struct pds_core_vf_ctrl_comp { + u8 status; +}; + +/* + * union pds_core_dev_cmd - Overlay of core device command structures + */ +union pds_core_dev_cmd { + u8 opcode; + u32 words[16]; + + struct pds_core_dev_identify_cmd identify; + struct pds_core_dev_init_cmd init; + struct pds_core_dev_reset_cmd reset; + struct pds_core_fw_download_cmd fw_download; + struct pds_core_fw_control_cmd fw_control; + + struct pds_core_vf_setattr_cmd vf_setattr; + struct pds_core_vf_getattr_cmd vf_getattr; + struct pds_core_vf_ctrl_cmd vf_ctrl; +}; + +/* + * union pds_core_dev_comp - Overlay of core device completion structures + */ +union pds_core_dev_comp { + u8 status; + u8 bytes[16]; + + struct pds_core_dev_identify_comp identify; + struct pds_core_dev_reset_comp reset; + struct pds_core_dev_init_comp init; + struct pds_core_fw_download_comp fw_download; + struct pds_core_fw_control_comp fw_control; + + struct pds_core_vf_setattr_comp vf_setattr; + struct pds_core_vf_getattr_comp vf_getattr; + struct pds_core_vf_ctrl_comp vf_ctrl; +}; + +/** + * struct pds_core_dev_hwstamp_regs - Hardware current timestamp registers + * @tick_low: Low 32 bits of hardware timestamp + * @tick_high: High 32 bits of hardware timestamp + */ +struct pds_core_dev_hwstamp_regs { + u32 tick_low; + u32 tick_high; +}; + +/** + * struct pds_core_dev_info_regs - Device info register format (read-only) + * @signature: Signature value of 0x44455649 ('DEVI') + * @version: Current version of info + * @asic_type: Asic type + * @asic_rev: Asic revision + * @fw_status: Firmware status + * bit 0 - 1 = fw running + * bit 4-7 - 4 bit generation number, changes on fw restart + * @fw_heartbeat: Firmware heartbeat counter + * @serial_num: Serial number + * @fw_version: Firmware version + * @oprom_regs: oprom_regs to store oprom debug enable/disable and bmp + * @rsvd_pad1024: Struct padding + * @hwstamp: Hardware current timestamp registers + * @rsvd_pad2048: Struct padding + */ +struct pds_core_dev_info_regs { +#define PDS_CORE_DEVINFO_FWVERS_BUFLEN 32 +#define PDS_CORE_DEVINFO_SERIAL_BUFLEN 32 + u32 signature; + u8 version; + u8 asic_type; + u8 asic_rev; +#define PDS_CORE_FW_STS_F_STOPPED 0x00 +#define PDS_CORE_FW_STS_F_RUNNING 0x01 +#define PDS_CORE_FW_STS_F_GENERATION 0xF0 + u8 fw_status; + __le32 fw_heartbeat; + char fw_version[PDS_CORE_DEVINFO_FWVERS_BUFLEN]; + char serial_num[PDS_CORE_DEVINFO_SERIAL_BUFLEN]; + u8 oprom_regs[32]; /* reserved */ + u8 rsvd_pad1024[916]; + struct pds_core_dev_hwstamp_regs hwstamp; /* on 1k boundary */ + u8 rsvd_pad2048[1016]; +} __packed; + +/** + * struct pds_core_dev_cmd_regs - Device command register format (read-write) + * @doorbell: Device Cmd Doorbell, write-only + * Write a 1 to signal device to process cmd + * @done: Command completed indicator, poll for completion + * bit 0 == 1 when command is complete + * @cmd: Opcode-specific command bytes + * @comp: Opcode-specific response bytes + * @rsvd: Struct padding + * @data: Opcode-specific side-data + */ +struct pds_core_dev_cmd_regs { + u32 doorbell; + u32 done; + union pds_core_dev_cmd cmd; + union pds_core_dev_comp comp; + u8 rsvd[48]; + u32 data[478]; +} __packed; + +/** + * struct pds_core_dev_regs - Device register format for bar 0 page 0 + * @info: Device info registers + * @devcmd: Device command registers + */ +struct pds_core_dev_regs { + struct pds_core_dev_info_regs info; + struct pds_core_dev_cmd_regs devcmd; +} __packed; + +#ifndef __CHECKER__ +static_assert(sizeof(struct pds_core_drv_identity) <= 1912); +static_assert(sizeof(struct pds_core_dev_identity) <= 1912); +static_assert(sizeof(union pds_core_dev_cmd) == 64); +static_assert(sizeof(union pds_core_dev_comp) == 16); +static_assert(sizeof(struct pds_core_dev_info_regs) == 2048); +static_assert(sizeof(struct pds_core_dev_cmd_regs) == 2048); +static_assert(sizeof(struct pds_core_dev_regs) == 4096); +#endif /* __CHECKER__ */ + +#endif /* _PDS_CORE_IF_H_ */ -- cgit v1.2.3 From 523847df1b3718d6286dce0ed1c83742fe0ffa94 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Wed, 19 Apr 2023 10:04:15 -0700 Subject: pds_core: add devcmd device interfaces The devcmd interface is the basic connection to the device through the PCI BAR for low level identification and command services. This does the early device initialization and finds the identity data, and adds devcmd routines to be used by later driver bits. Signed-off-by: Shannon Nelson Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/pds/pds_common.h | 30 ++++++++ include/linux/pds/pds_intr.h | 163 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 193 insertions(+) create mode 100644 include/linux/pds/pds_intr.h (limited to 'include/linux') diff --git a/include/linux/pds/pds_common.h b/include/linux/pds/pds_common.h index bd041a5170a6..f0798ce01acf 100644 --- a/include/linux/pds/pds_common.h +++ b/include/linux/pds/pds_common.h @@ -11,4 +11,34 @@ #define PDS_CORE_ADDR_MASK (BIT_ULL(PDS_ADDR_LEN) - 1) #define PDS_PAGE_SIZE 4096 +enum pds_core_driver_type { + PDS_DRIVER_LINUX = 1, + PDS_DRIVER_WIN = 2, + PDS_DRIVER_DPDK = 3, + PDS_DRIVER_FREEBSD = 4, + PDS_DRIVER_IPXE = 5, + PDS_DRIVER_ESXI = 6, +}; + +#define PDS_CORE_IFNAMSIZ 16 + +/** + * enum pds_core_logical_qtype - Logical Queue Types + * @PDS_CORE_QTYPE_ADMINQ: Administrative Queue + * @PDS_CORE_QTYPE_NOTIFYQ: Notify Queue + * @PDS_CORE_QTYPE_RXQ: Receive Queue + * @PDS_CORE_QTYPE_TXQ: Transmit Queue + * @PDS_CORE_QTYPE_EQ: Event Queue + * @PDS_CORE_QTYPE_MAX: Max queue type supported + */ +enum pds_core_logical_qtype { + PDS_CORE_QTYPE_ADMINQ = 0, + PDS_CORE_QTYPE_NOTIFYQ = 1, + PDS_CORE_QTYPE_RXQ = 2, + PDS_CORE_QTYPE_TXQ = 3, + PDS_CORE_QTYPE_EQ = 4, + + PDS_CORE_QTYPE_MAX = 16 /* don't change - used in struct size */ +}; + #endif /* _PDS_COMMON_H_ */ diff --git a/include/linux/pds/pds_intr.h b/include/linux/pds/pds_intr.h new file mode 100644 index 000000000000..56277c37248c --- /dev/null +++ b/include/linux/pds/pds_intr.h @@ -0,0 +1,163 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) OR BSD-2-Clause */ +/* Copyright(c) 2023 Advanced Micro Devices, Inc. */ + +#ifndef _PDS_INTR_H_ +#define _PDS_INTR_H_ + +/* + * Interrupt control register + * @coal_init: Coalescing timer initial value, in + * device units. Use @identity->intr_coal_mult + * and @identity->intr_coal_div to convert from + * usecs to device units: + * + * coal_init = coal_usecs * coal_mutl / coal_div + * + * When an interrupt is sent the interrupt + * coalescing timer current value + * (@coalescing_curr) is initialized with this + * value and begins counting down. No more + * interrupts are sent until the coalescing + * timer reaches 0. When @coalescing_init=0 + * interrupt coalescing is effectively disabled + * and every interrupt assert results in an + * interrupt. Reset value: 0 + * @mask: Interrupt mask. When @mask=1 the interrupt + * resource will not send an interrupt. When + * @mask=0 the interrupt resource will send an + * interrupt if an interrupt event is pending + * or on the next interrupt assertion event. + * Reset value: 1 + * @credits: Interrupt credits. This register indicates + * how many interrupt events the hardware has + * sent. When written by software this + * register atomically decrements @int_credits + * by the value written. When @int_credits + * becomes 0 then the "pending interrupt" bit + * in the Interrupt Status register is cleared + * by the hardware and any pending but unsent + * interrupts are cleared. + * !!!IMPORTANT!!! This is a signed register. + * @flags: Interrupt control flags + * @unmask -- When this bit is written with a 1 + * the interrupt resource will set mask=0. + * @coal_timer_reset -- When this + * bit is written with a 1 the + * @coalescing_curr will be reloaded with + * @coalescing_init to reset the coalescing + * timer. + * @mask_on_assert: Automatically mask on assertion. When + * @mask_on_assert=1 the interrupt resource + * will set @mask=1 whenever an interrupt is + * sent. When using interrupts in Legacy + * Interrupt mode the driver must select + * @mask_on_assert=0 for proper interrupt + * operation. + * @coalescing_curr: Coalescing timer current value, in + * microseconds. When this value reaches 0 + * the interrupt resource is again eligible to + * send an interrupt. If an interrupt event + * is already pending when @coalescing_curr + * reaches 0 the pending interrupt will be + * sent, otherwise an interrupt will be sent + * on the next interrupt assertion event. + */ +struct pds_core_intr { + u32 coal_init; + u32 mask; + u16 credits; + u16 flags; +#define PDS_CORE_INTR_F_UNMASK 0x0001 +#define PDS_CORE_INTR_F_TIMER_RESET 0x0002 + u32 mask_on_assert; + u32 coalescing_curr; + u32 rsvd6[3]; +}; + +#ifndef __CHECKER__ +static_assert(sizeof(struct pds_core_intr) == 32); +#endif /* __CHECKER__ */ + +#define PDS_CORE_INTR_CTRL_REGS_MAX 2048 +#define PDS_CORE_INTR_CTRL_COAL_MAX 0x3F +#define PDS_CORE_INTR_INDEX_NOT_ASSIGNED -1 + +struct pds_core_intr_status { + u32 status[2]; +}; + +/** + * enum pds_core_intr_mask_vals - valid values for mask and mask_assert. + * @PDS_CORE_INTR_MASK_CLEAR: unmask interrupt. + * @PDS_CORE_INTR_MASK_SET: mask interrupt. + */ +enum pds_core_intr_mask_vals { + PDS_CORE_INTR_MASK_CLEAR = 0, + PDS_CORE_INTR_MASK_SET = 1, +}; + +/** + * enum pds_core_intr_credits_bits - Bitwise composition of credits values. + * @PDS_CORE_INTR_CRED_COUNT: bit mask of credit count, no shift needed. + * @PDS_CORE_INTR_CRED_COUNT_SIGNED: bit mask of credit count, including sign bit. + * @PDS_CORE_INTR_CRED_UNMASK: unmask the interrupt. + * @PDS_CORE_INTR_CRED_RESET_COALESCE: reset the coalesce timer. + * @PDS_CORE_INTR_CRED_REARM: unmask the and reset the timer. + */ +enum pds_core_intr_credits_bits { + PDS_CORE_INTR_CRED_COUNT = 0x7fffu, + PDS_CORE_INTR_CRED_COUNT_SIGNED = 0xffffu, + PDS_CORE_INTR_CRED_UNMASK = 0x10000u, + PDS_CORE_INTR_CRED_RESET_COALESCE = 0x20000u, + PDS_CORE_INTR_CRED_REARM = (PDS_CORE_INTR_CRED_UNMASK | + PDS_CORE_INTR_CRED_RESET_COALESCE), +}; + +static inline void +pds_core_intr_coal_init(struct pds_core_intr __iomem *intr_ctrl, u32 coal) +{ + iowrite32(coal, &intr_ctrl->coal_init); +} + +static inline void +pds_core_intr_mask(struct pds_core_intr __iomem *intr_ctrl, u32 mask) +{ + iowrite32(mask, &intr_ctrl->mask); +} + +static inline void +pds_core_intr_credits(struct pds_core_intr __iomem *intr_ctrl, + u32 cred, u32 flags) +{ + if (WARN_ON_ONCE(cred > PDS_CORE_INTR_CRED_COUNT)) { + cred = ioread32(&intr_ctrl->credits); + cred &= PDS_CORE_INTR_CRED_COUNT_SIGNED; + } + + iowrite32(cred | flags, &intr_ctrl->credits); +} + +static inline void +pds_core_intr_clean_flags(struct pds_core_intr __iomem *intr_ctrl, u32 flags) +{ + u32 cred; + + cred = ioread32(&intr_ctrl->credits); + cred &= PDS_CORE_INTR_CRED_COUNT_SIGNED; + cred |= flags; + iowrite32(cred, &intr_ctrl->credits); +} + +static inline void +pds_core_intr_clean(struct pds_core_intr __iomem *intr_ctrl) +{ + pds_core_intr_clean_flags(intr_ctrl, PDS_CORE_INTR_CRED_RESET_COALESCE); +} + +static inline void +pds_core_intr_mask_assert(struct pds_core_intr __iomem *intr_ctrl, u32 mask) +{ + iowrite32(mask, &intr_ctrl->mask_on_assert); +} + +#endif /* _PDS_INTR_H_ */ -- cgit v1.2.3 From 45d76f492938cdc27ddadc16e1e75103f4cfbf56 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Wed, 19 Apr 2023 10:04:18 -0700 Subject: pds_core: set up device and adminq Set up the basic adminq and notifyq queue structures. These are used mostly by the client drivers for feature configuration. These are essentially the same adminq and notifyq as in the ionic driver. Part of this includes querying for device identity and FW information, so we can make that available to devlink dev info. $ devlink dev info pci/0000:b5:00.0 pci/0000:b5:00.0: driver pds_core serial_number FLM18420073 versions: fixed: asic.id 0x0 asic.rev 0x0 running: fw 1.51.0-73 stored: fw.goldfw 1.15.9-C-22 fw.mainfwa 1.60.0-73 fw.mainfwb 1.60.0-57 Signed-off-by: Shannon Nelson Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/pds/pds_adminq.h | 638 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 638 insertions(+) create mode 100644 include/linux/pds/pds_adminq.h (limited to 'include/linux') diff --git a/include/linux/pds/pds_adminq.h b/include/linux/pds/pds_adminq.h new file mode 100644 index 000000000000..dd5fbe3ee141 --- /dev/null +++ b/include/linux/pds/pds_adminq.h @@ -0,0 +1,638 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2023 Advanced Micro Devices, Inc */ + +#ifndef _PDS_CORE_ADMINQ_H_ +#define _PDS_CORE_ADMINQ_H_ + +enum pds_core_adminq_flags { + PDS_AQ_FLAG_FASTPOLL = BIT(1), /* completion poll at 1ms */ +}; + +/* + * enum pds_core_adminq_opcode - AdminQ command opcodes + * These commands are only processed on AdminQ, not available in devcmd + */ +enum pds_core_adminq_opcode { + PDS_AQ_CMD_NOP = 0, + + /* Client control */ + PDS_AQ_CMD_CLIENT_REG = 6, + PDS_AQ_CMD_CLIENT_UNREG = 7, + PDS_AQ_CMD_CLIENT_CMD = 8, + + /* LIF commands */ + PDS_AQ_CMD_LIF_IDENTIFY = 20, + PDS_AQ_CMD_LIF_INIT = 21, + PDS_AQ_CMD_LIF_RESET = 22, + PDS_AQ_CMD_LIF_GETATTR = 23, + PDS_AQ_CMD_LIF_SETATTR = 24, + PDS_AQ_CMD_LIF_SETPHC = 25, + + PDS_AQ_CMD_RX_MODE_SET = 30, + PDS_AQ_CMD_RX_FILTER_ADD = 31, + PDS_AQ_CMD_RX_FILTER_DEL = 32, + + /* Queue commands */ + PDS_AQ_CMD_Q_IDENTIFY = 39, + PDS_AQ_CMD_Q_INIT = 40, + PDS_AQ_CMD_Q_CONTROL = 41, + + /* SR/IOV commands */ + PDS_AQ_CMD_VF_GETATTR = 60, + PDS_AQ_CMD_VF_SETATTR = 61, +}; + +/* + * enum pds_core_notifyq_opcode - NotifyQ event codes + */ +enum pds_core_notifyq_opcode { + PDS_EVENT_LINK_CHANGE = 1, + PDS_EVENT_RESET = 2, + PDS_EVENT_XCVR = 5, + PDS_EVENT_CLIENT = 6, +}; + +#define PDS_COMP_COLOR_MASK 0x80 + +/** + * struct pds_core_notifyq_event - Generic event reporting structure + * @eid: event number + * @ecode: event code + * + * This is the generic event report struct from which the other + * actual events will be formed. + */ +struct pds_core_notifyq_event { + __le64 eid; + __le16 ecode; +}; + +/** + * struct pds_core_link_change_event - Link change event notification + * @eid: event number + * @ecode: event code = PDS_EVENT_LINK_CHANGE + * @link_status: link up/down, with error bits + * @link_speed: speed of the network link + * + * Sent when the network link state changes between UP and DOWN + */ +struct pds_core_link_change_event { + __le64 eid; + __le16 ecode; + __le16 link_status; + __le32 link_speed; /* units of 1Mbps: e.g. 10000 = 10Gbps */ +}; + +/** + * struct pds_core_reset_event - Reset event notification + * @eid: event number + * @ecode: event code = PDS_EVENT_RESET + * @reset_code: reset type + * @state: 0=pending, 1=complete, 2=error + * + * Sent when the NIC or some subsystem is going to be or + * has been reset. + */ +struct pds_core_reset_event { + __le64 eid; + __le16 ecode; + u8 reset_code; + u8 state; +}; + +/** + * struct pds_core_client_event - Client event notification + * @eid: event number + * @ecode: event code = PDS_EVENT_CLIENT + * @client_id: client to sent event to + * @client_event: wrapped event struct for the client + * + * Sent when an event needs to be passed on to a client + */ +struct pds_core_client_event { + __le64 eid; + __le16 ecode; + __le16 client_id; + u8 client_event[54]; +}; + +/** + * struct pds_core_notifyq_cmd - Placeholder for building qcq + * @data: anonymous field for building the qcq + */ +struct pds_core_notifyq_cmd { + __le32 data; /* Not used but needed for qcq structure */ +}; + +/* + * union pds_core_notifyq_comp - Overlay of notifyq event structures + */ +union pds_core_notifyq_comp { + struct { + __le64 eid; + __le16 ecode; + }; + struct pds_core_notifyq_event event; + struct pds_core_link_change_event link_change; + struct pds_core_reset_event reset; + u8 data[64]; +}; + +#define PDS_DEVNAME_LEN 32 +/** + * struct pds_core_client_reg_cmd - Register a new client with DSC + * @opcode: opcode PDS_AQ_CMD_CLIENT_REG + * @rsvd: word boundary padding + * @devname: text name of client device + * @vif_type: what type of device (enum pds_core_vif_types) + * + * Tell the DSC of the new client, and receive a client_id from DSC. + */ +struct pds_core_client_reg_cmd { + u8 opcode; + u8 rsvd[3]; + char devname[PDS_DEVNAME_LEN]; + u8 vif_type; +}; + +/** + * struct pds_core_client_reg_comp - Client registration completion + * @status: Status of the command (enum pdc_core_status_code) + * @rsvd: Word boundary padding + * @comp_index: Index in the descriptor ring for which this is the completion + * @client_id: New id assigned by DSC + * @rsvd1: Word boundary padding + * @color: Color bit + */ +struct pds_core_client_reg_comp { + u8 status; + u8 rsvd; + __le16 comp_index; + __le16 client_id; + u8 rsvd1[9]; + u8 color; +}; + +/** + * struct pds_core_client_unreg_cmd - Unregister a client from DSC + * @opcode: opcode PDS_AQ_CMD_CLIENT_UNREG + * @rsvd: word boundary padding + * @client_id: id of client being removed + * + * Tell the DSC this client is going away and remove its context + * This uses the generic completion. + */ +struct pds_core_client_unreg_cmd { + u8 opcode; + u8 rsvd; + __le16 client_id; +}; + +/** + * struct pds_core_client_request_cmd - Pass along a wrapped client AdminQ cmd + * @opcode: opcode PDS_AQ_CMD_CLIENT_CMD + * @rsvd: word boundary padding + * @client_id: id of client being removed + * @client_cmd: the wrapped client command + * + * Proxy post an adminq command for the client. + * This uses the generic completion. + */ +struct pds_core_client_request_cmd { + u8 opcode; + u8 rsvd; + __le16 client_id; + u8 client_cmd[60]; +}; + +#define PDS_CORE_MAX_FRAGS 16 + +#define PDS_CORE_QCQ_F_INITED BIT(0) +#define PDS_CORE_QCQ_F_SG BIT(1) +#define PDS_CORE_QCQ_F_INTR BIT(2) +#define PDS_CORE_QCQ_F_TX_STATS BIT(3) +#define PDS_CORE_QCQ_F_RX_STATS BIT(4) +#define PDS_CORE_QCQ_F_NOTIFYQ BIT(5) +#define PDS_CORE_QCQ_F_CMB_RINGS BIT(6) +#define PDS_CORE_QCQ_F_CORE BIT(7) + +enum pds_core_lif_type { + PDS_CORE_LIF_TYPE_DEFAULT = 0, +}; + +/** + * union pds_core_lif_config - LIF configuration + * @state: LIF state (enum pds_core_lif_state) + * @rsvd: Word boundary padding + * @name: LIF name + * @rsvd2: Word boundary padding + * @features: LIF features active (enum pds_core_hw_features) + * @queue_count: Queue counts per queue-type + * @words: Full union buffer size + */ +union pds_core_lif_config { + struct { + u8 state; + u8 rsvd[3]; + char name[PDS_CORE_IFNAMSIZ]; + u8 rsvd2[12]; + __le64 features; + __le32 queue_count[PDS_CORE_QTYPE_MAX]; + } __packed; + __le32 words[64]; +}; + +/** + * struct pds_core_lif_status - LIF status register + * @eid: most recent NotifyQ event id + * @rsvd: full struct size + */ +struct pds_core_lif_status { + __le64 eid; + u8 rsvd[56]; +}; + +/** + * struct pds_core_lif_info - LIF info structure + * @config: LIF configuration structure + * @status: LIF status structure + */ +struct pds_core_lif_info { + union pds_core_lif_config config; + struct pds_core_lif_status status; +}; + +/** + * struct pds_core_lif_identity - LIF identity information (type-specific) + * @features: LIF features (see enum pds_core_hw_features) + * @version: Identify structure version + * @hw_index: LIF hardware index + * @rsvd: Word boundary padding + * @max_nb_sessions: Maximum number of sessions supported + * @rsvd2: buffer padding + * @config: LIF config struct with features, q counts + */ +struct pds_core_lif_identity { + __le64 features; + u8 version; + u8 hw_index; + u8 rsvd[2]; + __le32 max_nb_sessions; + u8 rsvd2[120]; + union pds_core_lif_config config; +}; + +/** + * struct pds_core_lif_identify_cmd - Get LIF identity info command + * @opcode: Opcode PDS_AQ_CMD_LIF_IDENTIFY + * @type: LIF type (enum pds_core_lif_type) + * @client_id: Client identifier + * @ver: Version of identify returned by device + * @rsvd: Word boundary padding + * @ident_pa: DMA address to receive identity info + * + * Firmware will copy LIF identity data (struct pds_core_lif_identity) + * into the buffer address given. + */ +struct pds_core_lif_identify_cmd { + u8 opcode; + u8 type; + __le16 client_id; + u8 ver; + u8 rsvd[3]; + __le64 ident_pa; +}; + +/** + * struct pds_core_lif_identify_comp - LIF identify command completion + * @status: Status of the command (enum pds_core_status_code) + * @ver: Version of identify returned by device + * @bytes: Bytes copied into the buffer + * @rsvd: Word boundary padding + * @color: Color bit + */ +struct pds_core_lif_identify_comp { + u8 status; + u8 ver; + __le16 bytes; + u8 rsvd[11]; + u8 color; +}; + +/** + * struct pds_core_lif_init_cmd - LIF init command + * @opcode: Opcode PDS_AQ_CMD_LIF_INIT + * @type: LIF type (enum pds_core_lif_type) + * @client_id: Client identifier + * @rsvd: Word boundary padding + * @info_pa: Destination address for LIF info (struct pds_core_lif_info) + */ +struct pds_core_lif_init_cmd { + u8 opcode; + u8 type; + __le16 client_id; + __le32 rsvd; + __le64 info_pa; +}; + +/** + * struct pds_core_lif_init_comp - LIF init command completion + * @status: Status of the command (enum pds_core_status_code) + * @rsvd: Word boundary padding + * @hw_index: Hardware index of the initialized LIF + * @rsvd1: Word boundary padding + * @color: Color bit + */ +struct pds_core_lif_init_comp { + u8 status; + u8 rsvd; + __le16 hw_index; + u8 rsvd1[11]; + u8 color; +}; + +/** + * struct pds_core_lif_reset_cmd - LIF reset command + * Will reset only the specified LIF. + * @opcode: Opcode PDS_AQ_CMD_LIF_RESET + * @rsvd: Word boundary padding + * @client_id: Client identifier + */ +struct pds_core_lif_reset_cmd { + u8 opcode; + u8 rsvd; + __le16 client_id; +}; + +/** + * enum pds_core_lif_attr - List of LIF attributes + * @PDS_CORE_LIF_ATTR_STATE: LIF state attribute + * @PDS_CORE_LIF_ATTR_NAME: LIF name attribute + * @PDS_CORE_LIF_ATTR_FEATURES: LIF features attribute + * @PDS_CORE_LIF_ATTR_STATS_CTRL: LIF statistics control attribute + */ +enum pds_core_lif_attr { + PDS_CORE_LIF_ATTR_STATE = 0, + PDS_CORE_LIF_ATTR_NAME = 1, + PDS_CORE_LIF_ATTR_FEATURES = 4, + PDS_CORE_LIF_ATTR_STATS_CTRL = 6, +}; + +/** + * struct pds_core_lif_setattr_cmd - Set LIF attributes on the NIC + * @opcode: Opcode PDS_AQ_CMD_LIF_SETATTR + * @attr: Attribute type (enum pds_core_lif_attr) + * @client_id: Client identifier + * @state: LIF state (enum pds_core_lif_state) + * @name: The name string, 0 terminated + * @features: Features (enum pds_core_hw_features) + * @stats_ctl: Stats control commands (enum pds_core_stats_ctl_cmd) + * @rsvd: Command Buffer padding + */ +struct pds_core_lif_setattr_cmd { + u8 opcode; + u8 attr; + __le16 client_id; + union { + u8 state; + char name[PDS_CORE_IFNAMSIZ]; + __le64 features; + u8 stats_ctl; + u8 rsvd[60]; + } __packed; +}; + +/** + * struct pds_core_lif_setattr_comp - LIF set attr command completion + * @status: Status of the command (enum pds_core_status_code) + * @rsvd: Word boundary padding + * @comp_index: Index in the descriptor ring for which this is the completion + * @features: Features (enum pds_core_hw_features) + * @rsvd2: Word boundary padding + * @color: Color bit + */ +struct pds_core_lif_setattr_comp { + u8 status; + u8 rsvd; + __le16 comp_index; + union { + __le64 features; + u8 rsvd2[11]; + } __packed; + u8 color; +}; + +/** + * struct pds_core_lif_getattr_cmd - Get LIF attributes from the NIC + * @opcode: Opcode PDS_AQ_CMD_LIF_GETATTR + * @attr: Attribute type (enum pds_core_lif_attr) + * @client_id: Client identifier + */ +struct pds_core_lif_getattr_cmd { + u8 opcode; + u8 attr; + __le16 client_id; +}; + +/** + * struct pds_core_lif_getattr_comp - LIF get attr command completion + * @status: Status of the command (enum pds_core_status_code) + * @rsvd: Word boundary padding + * @comp_index: Index in the descriptor ring for which this is the completion + * @state: LIF state (enum pds_core_lif_state) + * @name: LIF name string, 0 terminated + * @features: Features (enum pds_core_hw_features) + * @rsvd2: Word boundary padding + * @color: Color bit + */ +struct pds_core_lif_getattr_comp { + u8 status; + u8 rsvd; + __le16 comp_index; + union { + u8 state; + __le64 features; + u8 rsvd2[11]; + } __packed; + u8 color; +}; + +/** + * union pds_core_q_identity - Queue identity information + * @version: Queue type version that can be used with FW + * @supported: Bitfield of queue versions, first bit = ver 0 + * @rsvd: Word boundary padding + * @features: Queue features + * @desc_sz: Descriptor size + * @comp_sz: Completion descriptor size + * @rsvd2: Word boundary padding + */ +struct pds_core_q_identity { + u8 version; + u8 supported; + u8 rsvd[6]; +#define PDS_CORE_QIDENT_F_CQ 0x01 /* queue has completion ring */ + __le64 features; + __le16 desc_sz; + __le16 comp_sz; + u8 rsvd2[6]; +}; + +/** + * struct pds_core_q_identify_cmd - queue identify command + * @opcode: Opcode PDS_AQ_CMD_Q_IDENTIFY + * @type: Logical queue type (enum pds_core_logical_qtype) + * @client_id: Client identifier + * @ver: Highest queue type version that the driver supports + * @rsvd: Word boundary padding + * @ident_pa: DMA address to receive the data (struct pds_core_q_identity) + */ +struct pds_core_q_identify_cmd { + u8 opcode; + u8 type; + __le16 client_id; + u8 ver; + u8 rsvd[3]; + __le64 ident_pa; +}; + +/** + * struct pds_core_q_identify_comp - queue identify command completion + * @status: Status of the command (enum pds_core_status_code) + * @rsvd: Word boundary padding + * @comp_index: Index in the descriptor ring for which this is the completion + * @ver: Queue type version that can be used with FW + * @rsvd1: Word boundary padding + * @color: Color bit + */ +struct pds_core_q_identify_comp { + u8 status; + u8 rsvd; + __le16 comp_index; + u8 ver; + u8 rsvd1[10]; + u8 color; +}; + +/** + * struct pds_core_q_init_cmd - Queue init command + * @opcode: Opcode PDS_AQ_CMD_Q_INIT + * @type: Logical queue type + * @client_id: Client identifier + * @ver: Queue type version + * @rsvd: Word boundary padding + * @index: (LIF, qtype) relative admin queue index + * @intr_index: Interrupt control register index, or Event queue index + * @pid: Process ID + * @flags: + * IRQ: Interrupt requested on completion + * ENA: Enable the queue. If ENA=0 the queue is initialized + * but remains disabled, to be later enabled with the + * Queue Enable command. If ENA=1, then queue is + * initialized and then enabled. + * @cos: Class of service for this queue + * @ring_size: Queue ring size, encoded as a log2(size), in + * number of descriptors. The actual ring size is + * (1 << ring_size). For example, to select a ring size + * of 64 descriptors write ring_size = 6. The minimum + * ring_size value is 2 for a ring of 4 descriptors. + * The maximum ring_size value is 12 for a ring of 4k + * descriptors. Values of ring_size <2 and >12 are + * reserved. + * @ring_base: Queue ring base address + * @cq_ring_base: Completion queue ring base address + */ +struct pds_core_q_init_cmd { + u8 opcode; + u8 type; + __le16 client_id; + u8 ver; + u8 rsvd[3]; + __le32 index; + __le16 pid; + __le16 intr_index; + __le16 flags; +#define PDS_CORE_QINIT_F_IRQ 0x01 /* Request interrupt on completion */ +#define PDS_CORE_QINIT_F_ENA 0x02 /* Enable the queue */ + u8 cos; +#define PDS_CORE_QSIZE_MIN_LG2 2 +#define PDS_CORE_QSIZE_MAX_LG2 12 + u8 ring_size; + __le64 ring_base; + __le64 cq_ring_base; +} __packed; + +/** + * struct pds_core_q_init_comp - Queue init command completion + * @status: Status of the command (enum pds_core_status_code) + * @rsvd: Word boundary padding + * @comp_index: Index in the descriptor ring for which this is the completion + * @hw_index: Hardware Queue ID + * @hw_type: Hardware Queue type + * @rsvd2: Word boundary padding + * @color: Color + */ +struct pds_core_q_init_comp { + u8 status; + u8 rsvd; + __le16 comp_index; + __le32 hw_index; + u8 hw_type; + u8 rsvd2[6]; + u8 color; +}; + +union pds_core_adminq_cmd { + u8 opcode; + u8 bytes[64]; + + struct pds_core_client_reg_cmd client_reg; + struct pds_core_client_unreg_cmd client_unreg; + struct pds_core_client_request_cmd client_request; + + struct pds_core_lif_identify_cmd lif_ident; + struct pds_core_lif_init_cmd lif_init; + struct pds_core_lif_reset_cmd lif_reset; + struct pds_core_lif_setattr_cmd lif_setattr; + struct pds_core_lif_getattr_cmd lif_getattr; + + struct pds_core_q_identify_cmd q_ident; + struct pds_core_q_init_cmd q_init; +}; + +union pds_core_adminq_comp { + struct { + u8 status; + u8 rsvd; + __le16 comp_index; + u8 rsvd2[11]; + u8 color; + }; + u32 words[4]; + + struct pds_core_client_reg_comp client_reg; + + struct pds_core_lif_identify_comp lif_ident; + struct pds_core_lif_init_comp lif_init; + struct pds_core_lif_setattr_comp lif_setattr; + struct pds_core_lif_getattr_comp lif_getattr; + + struct pds_core_q_identify_comp q_ident; + struct pds_core_q_init_comp q_init; +}; + +#ifndef __CHECKER__ +static_assert(sizeof(union pds_core_adminq_cmd) == 64); +static_assert(sizeof(union pds_core_adminq_comp) == 16); +static_assert(sizeof(union pds_core_notifyq_comp) == 64); +#endif /* __CHECKER__ */ + +/* The color bit is a 'done' bit for the completion descriptors + * where the meaning alternates between '1' and '0' for alternating + * passes through the completion descriptor ring. + */ +static inline u8 pdsc_color_match(u8 color, u8 done_color) +{ + return (!!(color & PDS_COMP_COLOR_MASK)) == done_color; +} +#endif /* _PDS_CORE_ADMINQ_H_ */ -- cgit v1.2.3 From 01ba61b55b2041a39c54aefb3153c770dd59a0ef Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Wed, 19 Apr 2023 10:04:19 -0700 Subject: pds_core: Add adminq processing and commands Add the service routines for submitting and processing the adminq messages and for handling notifyq events. Signed-off-by: Shannon Nelson Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/pds/pds_adminq.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pds/pds_adminq.h b/include/linux/pds/pds_adminq.h index dd5fbe3ee141..98a60ce87b92 100644 --- a/include/linux/pds/pds_adminq.h +++ b/include/linux/pds/pds_adminq.h @@ -4,6 +4,8 @@ #ifndef _PDS_CORE_ADMINQ_H_ #define _PDS_CORE_ADMINQ_H_ +#define PDSC_ADMINQ_MAX_POLL_INTERVAL 256 + enum pds_core_adminq_flags { PDS_AQ_FLAG_FASTPOLL = BIT(1), /* completion poll at 1ms */ }; @@ -631,8 +633,15 @@ static_assert(sizeof(union pds_core_notifyq_comp) == 64); * where the meaning alternates between '1' and '0' for alternating * passes through the completion descriptor ring. */ -static inline u8 pdsc_color_match(u8 color, u8 done_color) +static inline bool pdsc_color_match(u8 color, bool done_color) { return (!!(color & PDS_COMP_COLOR_MASK)) == done_color; } + +struct pdsc; +int pdsc_adminq_post(struct pdsc *pdsc, + union pds_core_adminq_cmd *cmd, + union pds_core_adminq_comp *comp, + bool fast_poll); + #endif /* _PDS_CORE_ADMINQ_H_ */ -- cgit v1.2.3 From 65e0185ad764d2801811bb2e7c122e92557208c4 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Wed, 19 Apr 2023 10:04:21 -0700 Subject: pds_core: set up the VIF definitions and defaults The Virtual Interfaces (VIFs) supported by the DSC's configuration (vDPA, Eth, RDMA, etc) are reported in the dev_ident struct and made visible in debugfs. At this point only vDPA is supported in this driver so we only setup devices for that feature. Signed-off-by: Shannon Nelson Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/pds/pds_common.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pds/pds_common.h b/include/linux/pds/pds_common.h index f0798ce01acf..b2be14ebadb6 100644 --- a/include/linux/pds/pds_common.h +++ b/include/linux/pds/pds_common.h @@ -20,6 +20,25 @@ enum pds_core_driver_type { PDS_DRIVER_ESXI = 6, }; +enum pds_core_vif_types { + PDS_DEV_TYPE_CORE = 0, + PDS_DEV_TYPE_VDPA = 1, + PDS_DEV_TYPE_VFIO = 2, + PDS_DEV_TYPE_ETH = 3, + PDS_DEV_TYPE_RDMA = 4, + PDS_DEV_TYPE_LM = 5, + + /* new ones added before this line */ + PDS_DEV_TYPE_MAX = 16 /* don't change - used in struct size */ +}; + +#define PDS_DEV_TYPE_CORE_STR "Core" +#define PDS_DEV_TYPE_VDPA_STR "vDPA" +#define PDS_DEV_TYPE_VFIO_STR "VFio" +#define PDS_DEV_TYPE_ETH_STR "Eth" +#define PDS_DEV_TYPE_RDMA_STR "RDMA" +#define PDS_DEV_TYPE_LM_STR "LM" + #define PDS_CORE_IFNAMSIZ 16 /** -- cgit v1.2.3 From 4569cce43bc61e4cdd76597a1cf9b608846c18cc Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Wed, 19 Apr 2023 10:04:23 -0700 Subject: pds_core: add auxiliary_bus devices An auxiliary_bus device is created for each vDPA type VF at VF probe and destroyed at VF remove. The aux device name comes from the driver name + VIF type + the unique id assigned at PCI probe. The VFs are always removed on PF remove, so there should be no issues with VFs trying to access missing PF structures. The auxiliary_device names will look like "pds_core.vDPA.nn" where 'nn' is the VF's uid. Signed-off-by: Shannon Nelson Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/pds/pds_auxbus.h | 14 ++++++++++++++ include/linux/pds/pds_common.h | 1 + 2 files changed, 15 insertions(+) create mode 100644 include/linux/pds/pds_auxbus.h (limited to 'include/linux') diff --git a/include/linux/pds/pds_auxbus.h b/include/linux/pds/pds_auxbus.h new file mode 100644 index 000000000000..493f75b1995e --- /dev/null +++ b/include/linux/pds/pds_auxbus.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2023 Advanced Micro Devices, Inc */ + +#ifndef _PDSC_AUXBUS_H_ +#define _PDSC_AUXBUS_H_ + +#include + +struct pds_auxiliary_dev { + struct auxiliary_device aux_dev; + struct pci_dev *vf_pdev; + u16 client_id; +}; +#endif /* _PDSC_AUXBUS_H_ */ diff --git a/include/linux/pds/pds_common.h b/include/linux/pds/pds_common.h index b2be14ebadb6..961b3d02c69f 100644 --- a/include/linux/pds/pds_common.h +++ b/include/linux/pds/pds_common.h @@ -60,4 +60,5 @@ enum pds_core_logical_qtype { PDS_CORE_QTYPE_MAX = 16 /* don't change - used in struct size */ }; +void *pdsc_get_pf_struct(struct pci_dev *vf_pdev); #endif /* _PDS_COMMON_H_ */ -- cgit v1.2.3 From 10659034c622738bc1bfab8a76fc576c52d5acce Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Wed, 19 Apr 2023 10:04:25 -0700 Subject: pds_core: add the aux client API Add the client API operations for running adminq commands. The core registers the client with the FW, then the client has a context for requesting adminq services. We expect to add additional operations for other clients, including requesting additional private adminqs and IRQs, but don't have the need yet. Signed-off-by: Shannon Nelson Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/pds/pds_auxbus.h | 6 ++++++ include/linux/pds/pds_common.h | 2 ++ 2 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pds/pds_auxbus.h b/include/linux/pds/pds_auxbus.h index 493f75b1995e..214ef12302d0 100644 --- a/include/linux/pds/pds_auxbus.h +++ b/include/linux/pds/pds_auxbus.h @@ -11,4 +11,10 @@ struct pds_auxiliary_dev { struct pci_dev *vf_pdev; u16 client_id; }; + +int pds_client_adminq_cmd(struct pds_auxiliary_dev *padev, + union pds_core_adminq_cmd *req, + size_t req_len, + union pds_core_adminq_comp *resp, + u64 flags); #endif /* _PDSC_AUXBUS_H_ */ diff --git a/include/linux/pds/pds_common.h b/include/linux/pds/pds_common.h index 961b3d02c69f..4b37675fde3e 100644 --- a/include/linux/pds/pds_common.h +++ b/include/linux/pds/pds_common.h @@ -61,4 +61,6 @@ enum pds_core_logical_qtype { }; void *pdsc_get_pf_struct(struct pci_dev *vf_pdev); +int pds_client_register(struct pci_dev *pf_pdev, char *devname); +int pds_client_unregister(struct pci_dev *pf_pdev, u16 client_id); #endif /* _PDS_COMMON_H_ */ -- cgit v1.2.3 From d24c28278a01dc4c80d1470533c667cf406f0e88 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Wed, 19 Apr 2023 10:04:26 -0700 Subject: pds_core: publish events to the clients When the Core device gets an event from the device, or notices the device FW to be up or down, it needs to send those events on to the clients that have an event handler. Add the code to pass along the events to the clients. The entry points pdsc_register_notify() and pdsc_unregister_notify() are EXPORTed for other drivers that want to listen for these events. Signed-off-by: Shannon Nelson Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/pds/pds_common.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pds/pds_common.h b/include/linux/pds/pds_common.h index 4b37675fde3e..060331486d50 100644 --- a/include/linux/pds/pds_common.h +++ b/include/linux/pds/pds_common.h @@ -60,6 +60,8 @@ enum pds_core_logical_qtype { PDS_CORE_QTYPE_MAX = 16 /* don't change - used in struct size */ }; +int pdsc_register_notify(struct notifier_block *nb); +void pdsc_unregister_notify(struct notifier_block *nb); void *pdsc_get_pf_struct(struct pci_dev *vf_pdev); int pds_client_register(struct pci_dev *pf_pdev, char *devname); int pds_client_unregister(struct pci_dev *pf_pdev, u16 client_id); -- cgit v1.2.3 From 84601d6ee68ae820dec97450934797046d62db4b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 21 Apr 2023 19:02:54 +0200 Subject: bpf: add bpf_link support for BPF_NETFILTER programs Add bpf_link support skeleton. To keep this reviewable, no bpf program can be invoked yet, if a program is attached only a c-stub is called and not the actual bpf program. Defaults to 'y' if both netfilter and bpf syscall are enabled in kconfig. Uapi example usage: union bpf_attr attr = { }; attr.link_create.prog_fd = progfd; attr.link_create.attach_type = 0; /* unused */ attr.link_create.netfilter.pf = PF_INET; attr.link_create.netfilter.hooknum = NF_INET_LOCAL_IN; attr.link_create.netfilter.priority = -128; err = bpf(BPF_LINK_CREATE, &attr, sizeof(attr)); ... this would attach progfd to ipv4:input hook. Such hook gets removed automatically if the calling program exits. BPF_NETFILTER program invocation is added in followup change. NF_HOOK_OP_BPF enum will eventually be read from nfnetlink_hook, it allows to tell userspace which program is attached at the given hook when user runs 'nft hook list' command rather than just the priority and not-very-helpful 'this hook runs a bpf prog but I can't tell which one'. Will also be used to disallow registration of two bpf programs with same priority in a followup patch. v4: arm32 cmpxchg only supports 32bit operand s/prio/priority/ v3: restrict prog attachment to ip/ip6 for now, lets lift restrictions if more use cases pop up (arptables, ebtables, netdev ingress/egress etc). Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20230421170300.24115-2-fw@strlen.de Signed-off-by: Alexei Starovoitov --- include/linux/netfilter.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index c8e03bcaecaa..0762444e3767 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -80,6 +80,7 @@ typedef unsigned int nf_hookfn(void *priv, enum nf_hook_ops_type { NF_HOOK_OP_UNDEFINED, NF_HOOK_OP_NF_TABLES, + NF_HOOK_OP_BPF, }; struct nf_hook_ops { -- cgit v1.2.3 From fd9c663b9ad67dedfc9a3fd3429ddd3e83782b4d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 21 Apr 2023 19:02:55 +0200 Subject: bpf: minimal support for programs hooked into netfilter framework This adds minimal support for BPF_PROG_TYPE_NETFILTER bpf programs that will be invoked via the NF_HOOK() points in the ip stack. Invocation incurs an indirect call. This is not a necessity: Its possible to add 'DEFINE_BPF_DISPATCHER(nf_progs)' and handle the program invocation with the same method already done for xdp progs. This isn't done here to keep the size of this chunk down. Verifier restricts verdicts to either DROP or ACCEPT. Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20230421170300.24115-3-fw@strlen.de Signed-off-by: Alexei Starovoitov --- include/linux/bpf_types.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index d4ee3ccd3753..39a999abb0ce 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -79,6 +79,10 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LSM, lsm, #endif BPF_PROG_TYPE(BPF_PROG_TYPE_SYSCALL, bpf_syscall, void *, void *) +#ifdef CONFIG_NETFILTER +BPF_PROG_TYPE(BPF_PROG_TYPE_NETFILTER, netfilter, + struct bpf_nf_ctx, struct bpf_nf_ctx) +#endif BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops) -- cgit v1.2.3 From 2b99ef22e0d237e08bfc437e7d051f78f352aeb2 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 21 Apr 2023 19:02:59 +0200 Subject: bpf: add test_run support for netfilter program type add glue code so a bpf program can be run using userspace-provided netfilter state and packet/skb. Default is to use ipv4:output hook point, but this can be overridden by userspace. Userspace provided netfilter state is restricted, only hook and protocol families can be overridden and only to ipv4/ipv6. Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20230421170300.24115-7-fw@strlen.de Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 18b592fde896..e53ceee1df37 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2264,6 +2264,9 @@ int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); +int bpf_prog_test_run_nf(struct bpf_prog *prog, + const union bpf_attr *kattr, + union bpf_attr __user *uattr); bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info); -- cgit v1.2.3 From 9a32e9850686599ed194ccdceb6cd3dd56b2d9b9 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 13 Apr 2023 17:13:19 +0200 Subject: netfilter: nf_tables: don't write table validation state without mutex The ->cleanup callback needs to be removed, this doesn't work anymore as the transaction mutex is already released in the ->abort function. Just do it after a successful validation pass, this either happens from commit or abort phases where transaction mutex is held. Fixes: f102d66b335a ("netfilter: nf_tables: use dedicated mutex to guard transactions") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nfnetlink.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index 241e005f290a..e9a9ab34a7cc 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -45,7 +45,6 @@ struct nfnetlink_subsystem { int (*commit)(struct net *net, struct sk_buff *skb); int (*abort)(struct net *net, struct sk_buff *skb, enum nfnl_abort_action action); - void (*cleanup)(struct net *net); bool (*valid_genid)(struct net *net, u32 genid); }; -- cgit v1.2.3 From 6d26d985eeda89faedabbcf6607c37454b9691b0 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 22 Apr 2023 09:35:44 +0200 Subject: bpf: fix link failure with NETFILTER=y INET=n Explicitly check if NETFILTER_BPF_LINK is enabled, else configs that have NETFILTER=y but CONFIG_INET=n fail to link: > kernel/bpf/syscall.o: undefined reference to `netfilter_prog_ops' > kernel/bpf/verifier.o: undefined reference to `netfilter_verifier_ops' Fixes: fd9c663b9ad6 ("bpf: minimal support for programs hooked into netfilter framework") Reported-by: kernel test robot Link: https://lore.kernel.org/oe-kbuild-all/202304220903.fRZTJtxe-lkp@intel.com/ Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20230422073544.17634-1-fw@strlen.de Signed-off-by: Alexei Starovoitov --- include/linux/bpf_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 39a999abb0ce..fc0d6f32c687 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -79,7 +79,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LSM, lsm, #endif BPF_PROG_TYPE(BPF_PROG_TYPE_SYSCALL, bpf_syscall, void *, void *) -#ifdef CONFIG_NETFILTER +#ifdef CONFIG_NETFILTER_BPF_LINK BPF_PROG_TYPE(BPF_PROG_TYPE_NETFILTER, netfilter, struct bpf_nf_ctx, struct bpf_nf_ctx) #endif -- cgit v1.2.3 From 87eff2ec57b6d68d294013d8dd21e839a1175e3a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Apr 2023 09:43:57 +0000 Subject: net: optimize napi_threaded_poll() vs RPS/RFS We use napi_threaded_poll() in order to reduce our softirq dependency. We can add a followup of 821eba962d95 ("net: optimize napi_schedule_rps()") to further remove the need of firing NET_RX_SOFTIRQ whenever RPS/RFS are used. Signed-off-by: Eric Dumazet Acked-by: Paolo Abeni Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a6a3e9457d6c..08fbd4622ccf 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3194,7 +3194,10 @@ struct softnet_data { #ifdef CONFIG_RPS struct softnet_data *rps_ipi_list; #endif + bool in_net_rx_action; + bool in_napi_threaded_poll; + #ifdef CONFIG_NET_FLOW_LIMIT struct sd_flow_limit __rcu *flow_limit; #endif -- cgit v1.2.3 From f90615ada0b1e21a9d93ff89b04549fd7a92c92b Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 21 Apr 2023 01:55:53 +0300 Subject: net: vlan: don't adjust MAC header in __vlan_insert_inner_tag() unless set This is a preparatory change for the deletion of skb_reset_mac_header(skb) from __dev_queue_xmit(). After that deletion, skb_mac_header(skb) will no longer be set in TX paths, from which __vlan_insert_inner_tag() can still be called (perhaps indirectly). If we don't make this change, then an unset MAC header (equal to ~0U) will become set after the adjustment with VLAN_HLEN. Signed-off-by: Vladimir Oltean Reviewed-by: Eric Dumazet Reviewed-by: Simon Horman Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/if_vlan.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 6864b89ef868..90b76d63c11c 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -351,7 +351,8 @@ static inline int __vlan_insert_inner_tag(struct sk_buff *skb, /* Move the mac header sans proto to the beginning of the new header. */ if (likely(mac_len > ETH_TLEN)) memmove(skb->data, skb->data + VLAN_HLEN, mac_len - ETH_TLEN); - skb->mac_header -= VLAN_HLEN; + if (skb_mac_header_was_set(skb)) + skb->mac_header -= VLAN_HLEN; veth = (struct vlan_ethhdr *)(skb->data + mac_len - ETH_HLEN); -- cgit v1.2.3 From 1f5020acb33f926030f62563c86dffca35c7b701 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 21 Apr 2023 01:55:54 +0300 Subject: net: vlan: introduce skb_vlan_eth_hdr() Similar to skb_eth_hdr() introduced in commit 96cc4b69581d ("macvlan: do not assume mac_header is set in macvlan_broadcast()"), let's introduce a skb_vlan_eth_hdr() helper which can be used in TX-only code paths to get to the VLAN header based on skb->data rather than based on the skb_mac_header(skb). We also consolidate the drivers that dereference skb->data to go through this helper. Signed-off-by: Vladimir Oltean Reviewed-by: Eric Dumazet Reviewed-by: Simon Horman Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/if_vlan.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 90b76d63c11c..3698f2b391cd 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -62,6 +62,14 @@ static inline struct vlan_ethhdr *vlan_eth_hdr(const struct sk_buff *skb) return (struct vlan_ethhdr *)skb_mac_header(skb); } +/* Prefer this version in TX path, instead of + * skb_reset_mac_header() + vlan_eth_hdr() + */ +static inline struct vlan_ethhdr *skb_vlan_eth_hdr(const struct sk_buff *skb) +{ + return (struct vlan_ethhdr *)skb->data; +} + #define VLAN_PRIO_MASK 0xe000 /* Priority Code Point */ #define VLAN_PRIO_SHIFT 13 #define VLAN_CFI_MASK 0x1000 /* Canonical Format Indicator / Drop Eligible Indicator */ @@ -529,7 +537,7 @@ static inline void __vlan_hwaccel_put_tag(struct sk_buff *skb, */ static inline int __vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) { - struct vlan_ethhdr *veth = (struct vlan_ethhdr *)skb->data; + struct vlan_ethhdr *veth = skb_vlan_eth_hdr(skb); if (!eth_type_vlan(veth->h_vlan_proto)) return -EINVAL; @@ -713,7 +721,7 @@ static inline bool skb_vlan_tagged_multi(struct sk_buff *skb) if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN))) return false; - veh = (struct vlan_ethhdr *)skb->data; + veh = skb_vlan_eth_hdr(skb); protocol = veh->h_vlan_encapsulated_proto; } -- cgit v1.2.3 From 0bcf2e4aca6c29a07555b713f2fb461dc38d5977 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 21 Apr 2023 01:56:01 +0300 Subject: net: dsa: tag_ocelot: call only the relevant portion of __skb_vlan_pop() on TX ocelot_xmit_get_vlan_info() calls __skb_vlan_pop() as the most appropriate helper I could find which strips away a VLAN header. That's all I need it to do, but __skb_vlan_pop() has more logic, which will become incompatible with the future revert of commit 6d1ccff62780 ("net: reset mac header in dev_start_xmit()"). Namely, it performs a sanity check on skb_mac_header(), which will stop being set after the above revert, so it will return an error instead of removing the VLAN tag. ocelot_xmit_get_vlan_info() gets called in 2 circumstances: (1) the port is under a VLAN-aware bridge and the bridge sends VLAN-tagged packets (2) the port is under a VLAN-aware bridge and somebody else (an 8021q upper) sends VLAN-tagged packets (using a VID that isn't in the bridge vlan tables) In case (1), there is actually no bug to defend against, because br_dev_xmit() calls skb_reset_mac_header() and things continue to work. However, in case (2), illustrated using the commands below, it can be seen that our intervention is needed, since __skb_vlan_pop() complains: $ ip link add br0 type bridge vlan_filtering 1 && ip link set br0 up $ ip link set $eth master br0 && ip link set $eth up $ ip link add link $eth name $eth.100 type vlan id 100 && ip link set $eth.100 up $ ip addr add 192.168.100.1/24 dev $eth.100 I could fend off the checks in __skb_vlan_pop() with some skb_mac_header_was_set() calls, but seeing how few callers of __skb_vlan_pop() there are from TX paths, that seems rather unproductive. As an alternative solution, extract the bare minimum logic to strip a VLAN header, and move it to a new helper named vlan_remove_tag(), close to the definition of vlan_insert_tag(). Document it appropriately and make ocelot_xmit_get_vlan_info() call this smaller helper instead. Seeing that it doesn't appear illegal to test skb->protocol in the TX path, I guess it would be a good for vlan_remove_tag() to also absorb the vlan_set_encap_proto() function call. Signed-off-by: Vladimir Oltean Reviewed-by: Simon Horman Reviewed-by: Florian Fainelli Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/if_vlan.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 3698f2b391cd..0f40f379d75c 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -685,6 +685,27 @@ static inline void vlan_set_encap_proto(struct sk_buff *skb, skb->protocol = htons(ETH_P_802_2); } +/** + * vlan_remove_tag - remove outer VLAN tag from payload + * @skb: skbuff to remove tag from + * @vlan_tci: buffer to store value + * + * Expects the skb to contain a VLAN tag in the payload, and to have skb->data + * pointing at the MAC header. + * + * Returns a new pointer to skb->data, or NULL on failure to pull. + */ +static inline void *vlan_remove_tag(struct sk_buff *skb, u16 *vlan_tci) +{ + struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN); + + *vlan_tci = ntohs(vhdr->h_vlan_TCI); + + memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN); + vlan_set_encap_proto(skb, vhdr); + return __skb_pull(skb, VLAN_HLEN); +} + /** * skb_vlan_tagged - check if skb is vlan tagged. * @skb: skbuff to query -- cgit v1.2.3 From 29f93a687f3c435e94d026ee1fc8ad18ce56b7fb Mon Sep 17 00:00:00 2001 From: Neeraj Sanjay Kale Date: Thu, 16 Mar 2023 22:52:11 +0530 Subject: serdev: Replace all instances of ENOTSUPP with EOPNOTSUPP This replaces all instances of ENOTSUPP with EOPNOTSUPP since ENOTSUPP is not a standard error code. This will help maintain consistency in error codes when new serdev API's are added. Signed-off-by: Neeraj Sanjay Kale Reviewed-by: Simon Horman Signed-off-by: Luiz Augusto von Dentz --- include/linux/serdev.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/serdev.h b/include/linux/serdev.h index 5f6bfe4f6d95..babedb13785b 100644 --- a/include/linux/serdev.h +++ b/include/linux/serdev.h @@ -250,11 +250,11 @@ static inline int serdev_device_write_buf(struct serdev_device *serdev, static inline void serdev_device_wait_until_sent(struct serdev_device *sdev, long timeout) {} static inline int serdev_device_get_tiocm(struct serdev_device *serdev) { - return -ENOTSUPP; + return -EOPNOTSUPP; } static inline int serdev_device_set_tiocm(struct serdev_device *serdev, int set, int clear) { - return -ENOTSUPP; + return -EOPNOTSUPP; } static inline int serdev_device_write(struct serdev_device *sdev, const unsigned char *buf, size_t count, unsigned long timeout) -- cgit v1.2.3 From 8eaf839e4ac4feadf06e03eeff34059795450712 Mon Sep 17 00:00:00 2001 From: Neeraj Sanjay Kale Date: Thu, 16 Mar 2023 22:52:12 +0530 Subject: serdev: Add method to assert break signal over tty UART port Adds serdev_device_break_ctl() and an implementation for ttyport. This function simply calls the break_ctl in tty layer, which can assert a break signal over UART-TX line, if the tty and the underlying platform and UART peripheral supports this operation. Signed-off-by: Neeraj Sanjay Kale Reviewed-by: Simon Horman Signed-off-by: Luiz Augusto von Dentz --- include/linux/serdev.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/serdev.h b/include/linux/serdev.h index babedb13785b..f5f97fa25e8a 100644 --- a/include/linux/serdev.h +++ b/include/linux/serdev.h @@ -93,6 +93,7 @@ struct serdev_controller_ops { void (*wait_until_sent)(struct serdev_controller *, long); int (*get_tiocm)(struct serdev_controller *); int (*set_tiocm)(struct serdev_controller *, unsigned int, unsigned int); + int (*break_ctl)(struct serdev_controller *ctrl, unsigned int break_state); }; /** @@ -203,6 +204,7 @@ int serdev_device_write_buf(struct serdev_device *, const unsigned char *, size_ void serdev_device_wait_until_sent(struct serdev_device *, long); int serdev_device_get_tiocm(struct serdev_device *); int serdev_device_set_tiocm(struct serdev_device *, int, int); +int serdev_device_break_ctl(struct serdev_device *serdev, int break_state); void serdev_device_write_wakeup(struct serdev_device *); int serdev_device_write(struct serdev_device *, const unsigned char *, size_t, long); void serdev_device_write_flush(struct serdev_device *); @@ -256,6 +258,10 @@ static inline int serdev_device_set_tiocm(struct serdev_device *serdev, int set, { return -EOPNOTSUPP; } +static inline int serdev_device_break_ctl(struct serdev_device *serdev, int break_state) +{ + return -EOPNOTSUPP; +} static inline int serdev_device_write(struct serdev_device *sdev, const unsigned char *buf, size_t count, unsigned long timeout) { -- cgit v1.2.3