From dee872e124e8d5de22b68c58f6f6c3f5e8889160 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 14 Jan 2022 22:09:45 +0530 Subject: bpf: Populate kfunc BTF ID sets in struct btf This patch prepares the kernel to support putting all kinds of kfunc BTF ID sets in the struct btf itself. The various kernel subsystems will make register_btf_kfunc_id_set call in the initcalls (for built-in code and modules). The 'hook' is one of the many program types, e.g. XDP and TC/SCHED_CLS, STRUCT_OPS, and 'types' are check (allowed or not), acquire, release, and ret_null (with PTR_TO_BTF_ID_OR_NULL return type). A maximum of BTF_KFUNC_SET_MAX_CNT (32) kfunc BTF IDs are permitted in a set of certain hook and type for vmlinux sets, since they are allocated on demand, and otherwise set as NULL. Module sets can only be registered once per hook and type, hence they are directly assigned. A new btf_kfunc_id_set_contains function is exposed for use in verifier, this new method is faster than the existing list searching method, and is also automatic. It also lets other code not care whether the set is unallocated or not. Note that module code can only do single register_btf_kfunc_id_set call per hook. This is why sorting is only done for in-kernel vmlinux sets, because there might be multiple sets for the same hook and type that must be concatenated, hence sorting them is required to ensure bsearch in btf_id_set_contains continues to work correctly. Next commit will update the kernel users to make use of this infrastructure. Finally, add __maybe_unused annotation for BTF ID macros for the !CONFIG_DEBUG_INFO_BTF case, so that they don't produce warnings during build time. The previous patch is also needed to provide synchronization against initialization for module BTF's kfunc_set_tab introduced here, as described below: The kfunc_set_tab pointer in struct btf is write-once (if we consider the registration phase (comprised of multiple register_btf_kfunc_id_set calls) as a single operation). In this sense, once it has been fully prepared, it isn't modified, only used for lookup (from the verifier context). For btf_vmlinux, it is initialized fully during the do_initcalls phase, which happens fairly early in the boot process, before any processes are present. This also eliminates the possibility of bpf_check being called at that point, thus relieving us of ensuring any synchronization between the registration and lookup function (btf_kfunc_id_set_contains). However, the case for module BTF is a bit tricky. The BTF is parsed, prepared, and published from the MODULE_STATE_COMING notifier callback. After this, the module initcalls are invoked, where our registration function will be called to populate the kfunc_set_tab for module BTF. At this point, BTF may be available to userspace while its corresponding module is still intializing. A BTF fd can then be passed to verifier using bpf syscall (e.g. for kfunc call insn). Hence, there is a race window where verifier may concurrently try to lookup the kfunc_set_tab. To prevent this race, we must ensure the operations are serialized, or waiting for the __init functions to complete. In the earlier registration API, this race was alleviated as verifier bpf_check_mod_kfunc_call didn't find the kfunc BTF ID until it was added by the registration function (called usually at the end of module __init function after all module resources have been initialized). If the verifier made the check_kfunc_call before kfunc BTF ID was added to the list, it would fail verification (saying call isn't allowed). The access to list was protected using a mutex. Now, it would still fail verification, but for a different reason (returning ENXIO due to the failed btf_try_get_module call in add_kfunc_call), because if the __init call is in progress the module will be in the middle of MODULE_STATE_COMING -> MODULE_STATE_LIVE transition, and the BTF_MODULE_LIVE flag for btf_module instance will not be set, so the btf_try_get_module call will fail. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220114163953.1455836-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 39 +++++++++++++++++++++++++++++++++++++++ include/linux/btf_ids.h | 13 +++++++------ 2 files changed, 46 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/btf.h b/include/linux/btf.h index 0c74348cbc9d..c451f8e2612a 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -12,11 +12,33 @@ #define BTF_TYPE_EMIT(type) ((void)(type *)0) #define BTF_TYPE_EMIT_ENUM(enum_val) ((void)enum_val) +enum btf_kfunc_type { + BTF_KFUNC_TYPE_CHECK, + BTF_KFUNC_TYPE_ACQUIRE, + BTF_KFUNC_TYPE_RELEASE, + BTF_KFUNC_TYPE_RET_NULL, + BTF_KFUNC_TYPE_MAX, +}; + struct btf; struct btf_member; struct btf_type; union bpf_attr; struct btf_show; +struct btf_id_set; + +struct btf_kfunc_id_set { + struct module *owner; + union { + struct { + struct btf_id_set *check_set; + struct btf_id_set *acquire_set; + struct btf_id_set *release_set; + struct btf_id_set *ret_null_set; + }; + struct btf_id_set *sets[BTF_KFUNC_TYPE_MAX]; + }; +}; extern const struct file_operations btf_fops; @@ -307,6 +329,11 @@ const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); const char *btf_name_by_offset(const struct btf *btf, u32 offset); struct btf *btf_parse_vmlinux(void); struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog); +bool btf_kfunc_id_set_contains(const struct btf *btf, + enum bpf_prog_type prog_type, + enum btf_kfunc_type type, u32 kfunc_btf_id); +int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, + const struct btf_kfunc_id_set *s); #else static inline const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) @@ -318,6 +345,18 @@ static inline const char *btf_name_by_offset(const struct btf *btf, { return NULL; } +static inline bool btf_kfunc_id_set_contains(const struct btf *btf, + enum bpf_prog_type prog_type, + enum btf_kfunc_type type, + u32 kfunc_btf_id) +{ + return false; +} +static inline int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, + const struct btf_kfunc_id_set *s) +{ + return 0; +} #endif struct kfunc_btf_id_set { diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index 919c0fde1c51..bc5d9cc34e4c 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -11,6 +11,7 @@ struct btf_id_set { #ifdef CONFIG_DEBUG_INFO_BTF #include /* for __PASTE */ +#include /* for __maybe_unused */ /* * Following macros help to define lists of BTF IDs placed @@ -146,14 +147,14 @@ extern struct btf_id_set name; #else -#define BTF_ID_LIST(name) static u32 name[5]; +#define BTF_ID_LIST(name) static u32 __maybe_unused name[5]; #define BTF_ID(prefix, name) #define BTF_ID_UNUSED -#define BTF_ID_LIST_GLOBAL(name, n) u32 name[n]; -#define BTF_ID_LIST_SINGLE(name, prefix, typename) static u32 name[1]; -#define BTF_ID_LIST_GLOBAL_SINGLE(name, prefix, typename) u32 name[1]; -#define BTF_SET_START(name) static struct btf_id_set name = { 0 }; -#define BTF_SET_START_GLOBAL(name) static struct btf_id_set name = { 0 }; +#define BTF_ID_LIST_GLOBAL(name, n) u32 __maybe_unused name[n]; +#define BTF_ID_LIST_SINGLE(name, prefix, typename) static u32 __maybe_unused name[1]; +#define BTF_ID_LIST_GLOBAL_SINGLE(name, prefix, typename) u32 __maybe_unused name[1]; +#define BTF_SET_START(name) static struct btf_id_set __maybe_unused name = { 0 }; +#define BTF_SET_START_GLOBAL(name) static struct btf_id_set __maybe_unused name = { 0 }; #define BTF_SET_END(name) #endif /* CONFIG_DEBUG_INFO_BTF */ -- cgit v1.2.3 From b202d84422223b7222cba5031d182f20b37e146e Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 14 Jan 2022 22:09:46 +0530 Subject: bpf: Remove check_kfunc_call callback and old kfunc BTF ID API Completely remove the old code for check_kfunc_call to help it work with modules, and also the callback itself. The previous commit adds infrastructure to register all sets and put them in vmlinux or module BTF, and concatenates all related sets organized by the hook and the type. Once populated, these sets remain immutable for the lifetime of the struct btf. Also, since we don't need the 'owner' module anywhere when doing check_kfunc_call, drop the 'btf_modp' module parameter from find_kfunc_desc_btf. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220114163953.1455836-4-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 8 -------- include/linux/btf.h | 44 -------------------------------------------- 2 files changed, 52 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6e947cd91152..6d7346c54d83 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -573,7 +573,6 @@ struct bpf_verifier_ops { const struct btf_type *t, int off, int size, enum bpf_access_type atype, u32 *next_btf_id); - bool (*check_kfunc_call)(u32 kfunc_btf_id, struct module *owner); }; struct bpf_prog_offload_ops { @@ -1719,7 +1718,6 @@ int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); -bool bpf_prog_test_check_kfunc_call(u32 kfunc_id, struct module *owner); bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info); @@ -1971,12 +1969,6 @@ static inline int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, return -ENOTSUPP; } -static inline bool bpf_prog_test_check_kfunc_call(u32 kfunc_id, - struct module *owner) -{ - return false; -} - static inline void bpf_map_put(struct bpf_map *map) { } diff --git a/include/linux/btf.h b/include/linux/btf.h index c451f8e2612a..b12cfe3b12bb 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -359,48 +359,4 @@ static inline int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, } #endif -struct kfunc_btf_id_set { - struct list_head list; - struct btf_id_set *set; - struct module *owner; -}; - -struct kfunc_btf_id_list { - struct list_head list; - struct mutex mutex; -}; - -#ifdef CONFIG_DEBUG_INFO_BTF_MODULES -void register_kfunc_btf_id_set(struct kfunc_btf_id_list *l, - struct kfunc_btf_id_set *s); -void unregister_kfunc_btf_id_set(struct kfunc_btf_id_list *l, - struct kfunc_btf_id_set *s); -bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, u32 kfunc_id, - struct module *owner); - -extern struct kfunc_btf_id_list bpf_tcp_ca_kfunc_list; -extern struct kfunc_btf_id_list prog_test_kfunc_list; -#else -static inline void register_kfunc_btf_id_set(struct kfunc_btf_id_list *l, - struct kfunc_btf_id_set *s) -{ -} -static inline void unregister_kfunc_btf_id_set(struct kfunc_btf_id_list *l, - struct kfunc_btf_id_set *s) -{ -} -static inline bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, - u32 kfunc_id, struct module *owner) -{ - return false; -} - -static struct kfunc_btf_id_list bpf_tcp_ca_kfunc_list __maybe_unused; -static struct kfunc_btf_id_list prog_test_kfunc_list __maybe_unused; -#endif - -#define DEFINE_KFUNC_BTF_ID_SET(set, name) \ - struct kfunc_btf_id_set name = { LIST_HEAD_INIT(name.list), (set), \ - THIS_MODULE } - #endif -- cgit v1.2.3 From d583691c47dc0424ebe926000339a6d6cd590ff7 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 14 Jan 2022 22:09:47 +0530 Subject: bpf: Introduce mem, size argument pair support for kfunc BPF helpers can associate two adjacent arguments together to pass memory of certain size, using ARG_PTR_TO_MEM and ARG_CONST_SIZE arguments. Since we don't use bpf_func_proto for kfunc, we need to leverage BTF to implement similar support. The ARG_CONST_SIZE processing for helpers is refactored into a common check_mem_size_reg helper that is shared with kfunc as well. kfunc ptr_to_mem support follows logic similar to global functions, where verification is done as if pointer is not null, even when it may be null. This leads to a simple to follow rule for writing kfunc: always check the argument pointer for NULL, except when it is PTR_TO_CTX. Also, the PTR_TO_CTX case is also only safe when the helper expecting pointer to program ctx is not exposed to other programs where same struct is not ctx type. In that case, the type check will fall through to other cases and would permit passing other types of pointers, possibly NULL at runtime. Currently, we require the size argument to be suffixed with "__sz" in the parameter name. This information is then recorded in kernel BTF and verified during function argument checking. In the future we can use BTF tagging instead, and modify the kernel function definitions. This will be a purely kernel-side change. This allows us to have some form of backwards compatibility for structures that are passed in to the kernel function with their size, and allow variable length structures to be passed in if they are accompanied by a size parameter. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220114163953.1455836-5-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 143401d4c9d9..857fd687bdc2 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -521,6 +521,8 @@ bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt); int check_ctx_reg(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno); +int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + u32 regno); int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, u32 mem_size); -- cgit v1.2.3 From 5c073f26f9dc78a6c8194b23eac7537c9692c7d7 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 14 Jan 2022 22:09:48 +0530 Subject: bpf: Add reference tracking support to kfunc This patch adds verifier support for PTR_TO_BTF_ID return type of kfunc to be a reference, by reusing acquire_reference_state/release_reference support for existing in-kernel bpf helpers. We make use of the three kfunc types: - BTF_KFUNC_TYPE_ACQUIRE Return true if kfunc_btf_id is an acquire kfunc. This will acquire_reference_state for the returned PTR_TO_BTF_ID (this is the only allow return value). Note that acquire kfunc must always return a PTR_TO_BTF_ID{_OR_NULL}, otherwise the program is rejected. - BTF_KFUNC_TYPE_RELEASE Return true if kfunc_btf_id is a release kfunc. This will release the reference to the passed in PTR_TO_BTF_ID which has a reference state (from earlier acquire kfunc). The btf_check_func_arg_match returns the regno (of argument register, hence > 0) if the kfunc is a release kfunc, and a proper referenced PTR_TO_BTF_ID is being passed to it. This is similar to how helper call check uses bpf_call_arg_meta to store the ref_obj_id that is later used to release the reference. Similar to in-kernel helper, we only allow passing one referenced PTR_TO_BTF_ID as an argument. It can also be passed in to normal kfunc, but in case of release kfunc there must always be one PTR_TO_BTF_ID argument that is referenced. - BTF_KFUNC_TYPE_RET_NULL For kfunc returning PTR_TO_BTF_ID, tells if it can be NULL, hence force caller to mark the pointer not null (using check) before accessing it. Note that taking into account the case fixed by commit 93c230e3f5bd ("bpf: Enforce id generation for all may-be-null register type") we assign a non-zero id for mark_ptr_or_null_reg logic. Later, if more return types are supported by kfunc, which have a _OR_NULL variant, it might be better to move this id generation under a common reg_type_may_be_null check, similar to the case in the commit. Referenced PTR_TO_BTF_ID is currently only limited to kfunc, but can be extended in the future to other BPF helpers as well. For now, we can rely on the btf_struct_ids_match check to ensure we get the pointer to the expected struct type. In the future, care needs to be taken to avoid ambiguity for reference PTR_TO_BTF_ID passed to release function, in case multiple candidates can release same BTF ID. e.g. there might be two release kfuncs (or kfunc and helper): foo(struct abc *p); bar(struct abc *p); ... such that both release a PTR_TO_BTF_ID with btf_id of struct abc. In this case we would need to track the acquire function corresponding to the release function to avoid type confusion, and store this information in the register state so that an incorrect program can be rejected. This is not a problem right now, hence it is left as an exercise for the future patch introducing such a case in the kernel. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220114163953.1455836-6-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 857fd687bdc2..ac4797155412 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -566,4 +566,9 @@ static inline u32 type_flag(u32 type) return type & ~BPF_BASE_TYPE_MASK; } +static inline enum bpf_prog_type resolve_prog_type(struct bpf_prog *prog) +{ + return prog->aux->dst_prog ? prog->aux->dst_prog->type : prog->type; +} + #endif /* _LINUX_BPF_VERIFIER_H */ -- cgit v1.2.3 From b4c2b9593a1c4c3a718370e34af28e817fd5e5c6 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 14 Jan 2022 22:09:49 +0530 Subject: net/netfilter: Add unstable CT lookup helpers for XDP and TC-BPF This change adds conntrack lookup helpers using the unstable kfunc call interface for the XDP and TC-BPF hooks. The primary usecase is implementing a synproxy in XDP, see Maxim's patchset [0]. Export get_net_ns_by_id as nf_conntrack_bpf.c needs to call it. This object is only built when CONFIG_DEBUG_INFO_BTF_MODULES is enabled. [0]: https://lore.kernel.org/bpf/20211019144655.3483197-1-maximmi@nvidia.com Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220114163953.1455836-7-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/net/netfilter/nf_conntrack_bpf.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 include/net/netfilter/nf_conntrack_bpf.h (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_bpf.h b/include/net/netfilter/nf_conntrack_bpf.h new file mode 100644 index 000000000000..a473b56842c5 --- /dev/null +++ b/include/net/netfilter/nf_conntrack_bpf.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _NF_CONNTRACK_BPF_H +#define _NF_CONNTRACK_BPF_H + +#include +#include + +#if (IS_BUILTIN(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \ + (IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) + +extern int register_nf_conntrack_bpf(void); + +#else + +static inline int register_nf_conntrack_bpf(void) +{ + return 0; +} + +#endif + +#endif /* _NF_CONNTRACK_BPF_H */ -- cgit v1.2.3 From e40fbbf0572c5e41dc87ad79001748ed399ce32d Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Wed, 19 Jan 2022 11:44:40 +0000 Subject: uapi/bpf: Add missing description and returns for helper documentation Both description and returns section will become mandatory for helpers and syscalls in a later commit to generate man pages. This commit also adds in the documentation that BPF_PROG_RUN is an alias for BPF_PROG_TEST_RUN for anyone searching for the syscall in the generated man pages. Signed-off-by: Usama Arif Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220119114442.1452088-1-usama.arif@bytedance.com --- include/uapi/linux/bpf.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b0383d371b9a..a9c96c21330a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -330,6 +330,8 @@ union bpf_iter_link_info { * *ctx_out*, *data_in* and *data_out* must be NULL. * *repeat* must be zero. * + * BPF_PROG_RUN is an alias for BPF_PROG_TEST_RUN. + * * Return * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. @@ -1775,6 +1777,8 @@ union bpf_attr { * 0 on success, or a negative error in case of failure. * * u64 bpf_get_current_pid_tgid(void) + * Description + * Get the current pid and tgid. * Return * A 64-bit integer containing the current tgid and pid, and * created as such: @@ -1782,6 +1786,8 @@ union bpf_attr { * *current_task*\ **->pid**. * * u64 bpf_get_current_uid_gid(void) + * Description + * Get the current uid and gid. * Return * A 64-bit integer containing the current GID and UID, and * created as such: *current_gid* **<< 32 \|** *current_uid*. @@ -2256,6 +2262,8 @@ union bpf_attr { * The 32-bit hash. * * u64 bpf_get_current_task(void) + * Description + * Get the current task. * Return * A pointer to the current task struct. * @@ -2369,6 +2377,8 @@ union bpf_attr { * indicate that the hash is outdated and to trigger a * recalculation the next time the kernel tries to access this * hash or when the **bpf_get_hash_recalc**\ () helper is called. + * Return + * void. * * long bpf_get_numa_node_id(void) * Description @@ -2466,6 +2476,8 @@ union bpf_attr { * A 8-byte long unique number or 0 if *sk* is NULL. * * u32 bpf_get_socket_uid(struct sk_buff *skb) + * Description + * Get the owner UID of the socked associated to *skb*. * Return * The owner UID of the socket associated to *skb*. If the socket * is **NULL**, or if it is not a full socket (i.e. if it is a @@ -3240,6 +3252,9 @@ union bpf_attr { * The id is returned or 0 in case the id could not be retrieved. * * u64 bpf_get_current_cgroup_id(void) + * Description + * Get the current cgroup id based on the cgroup within which + * the current task is running. * Return * A 64-bit integer containing the current cgroup id based * on the cgroup within which the current task is running. -- cgit v1.2.3 From f10d059661968b01ef61a8b516775f95a18ab8ae Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Thu, 16 Dec 2021 02:04:25 +0000 Subject: bpf: Make BPF_PROG_RUN_ARRAY return -err instead of allow boolean Right now BPF_PROG_RUN_ARRAY and related macros return 1 or 0 for whether the prog array allows or rejects whatever is being hooked. The caller of these macros then return -EPERM or continue processing based on thw macro's return value. Unforunately this is inflexible, since -EPERM is the only err that can be returned. This patch should be a no-op; it prepares for the next patch. The returning of the -EPERM is moved to inside the macros, so the outer functions are directly returning what the macros returned if they are non-zero. Signed-off-by: YiFei Zhu Reviewed-by: Stanislav Fomichev Link: https://lore.kernel.org/r/788abcdca55886d1f43274c918eaa9f792a9f33b.1639619851.git.zhuyifei@google.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6d7346c54d83..83da1764fcfe 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1277,7 +1277,7 @@ static inline void bpf_reset_run_ctx(struct bpf_run_ctx *old_ctx) typedef u32 (*bpf_prog_run_fn)(const struct bpf_prog *prog, const void *ctx); -static __always_inline u32 +static __always_inline int BPF_PROG_RUN_ARRAY_CG_FLAGS(const struct bpf_prog_array __rcu *array_rcu, const void *ctx, bpf_prog_run_fn run_prog, u32 *ret_flags) @@ -1287,7 +1287,7 @@ BPF_PROG_RUN_ARRAY_CG_FLAGS(const struct bpf_prog_array __rcu *array_rcu, const struct bpf_prog_array *array; struct bpf_run_ctx *old_run_ctx; struct bpf_cg_run_ctx run_ctx; - u32 ret = 1; + int ret = 0; u32 func_ret; migrate_disable(); @@ -1298,7 +1298,8 @@ BPF_PROG_RUN_ARRAY_CG_FLAGS(const struct bpf_prog_array __rcu *array_rcu, while ((prog = READ_ONCE(item->prog))) { run_ctx.prog_item = item; func_ret = run_prog(prog, ctx); - ret &= (func_ret & 1); + if (!(func_ret & 1)) + ret = -EPERM; *(ret_flags) |= (func_ret >> 1); item++; } @@ -1308,7 +1309,7 @@ BPF_PROG_RUN_ARRAY_CG_FLAGS(const struct bpf_prog_array __rcu *array_rcu, return ret; } -static __always_inline u32 +static __always_inline int BPF_PROG_RUN_ARRAY_CG(const struct bpf_prog_array __rcu *array_rcu, const void *ctx, bpf_prog_run_fn run_prog) { @@ -1317,7 +1318,7 @@ BPF_PROG_RUN_ARRAY_CG(const struct bpf_prog_array __rcu *array_rcu, const struct bpf_prog_array *array; struct bpf_run_ctx *old_run_ctx; struct bpf_cg_run_ctx run_ctx; - u32 ret = 1; + int ret = 0; migrate_disable(); rcu_read_lock(); @@ -1326,7 +1327,8 @@ BPF_PROG_RUN_ARRAY_CG(const struct bpf_prog_array __rcu *array_rcu, old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); while ((prog = READ_ONCE(item->prog))) { run_ctx.prog_item = item; - ret &= run_prog(prog, ctx); + if (!run_prog(prog, ctx)) + ret = -EPERM; item++; } bpf_reset_run_ctx(old_run_ctx); @@ -1394,7 +1396,7 @@ out: u32 _ret; \ _ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(array, ctx, func, &_flags); \ _cn = _flags & BPF_RET_SET_CN; \ - if (_ret) \ + if (!_ret) \ _ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS); \ else \ _ret = (_cn ? NET_XMIT_DROP : -EPERM); \ -- cgit v1.2.3 From c4dcfdd406aa2167396ac215e351e5e4dfd7efe3 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Thu, 16 Dec 2021 02:04:26 +0000 Subject: bpf: Move getsockopt retval to struct bpf_cg_run_ctx The retval value is moved to struct bpf_cg_run_ctx for ease of access in different prog types with different context structs layouts. The helper implementation (to be added in a later patch in the series) can simply perform a container_of from current->bpf_ctx to retrieve bpf_cg_run_ctx. Unfortunately, there is no easy way to access the current task_struct via the verifier BPF bytecode rewrite, aside from possibly calling a helper, so a pointer to current task is added to struct bpf_sockopt_kern so that the rewritten BPF bytecode can access struct bpf_cg_run_ctx with an indirection. For backward compatibility, if a getsockopt program rejects a syscall by returning 0, an -EPERM will be generated, by having the BPF_PROG_RUN_ARRAY_CG family macros automatically set the retval to -EPERM. Unlike prior to this patch, this -EPERM will be visible to ctx->retval for any other hooks down the line in the prog array. Additionally, the restriction that getsockopt filters can only set the retval to 0 is removed, considering that certain getsockopt implementations may return optlen. Filters are now able to set the value arbitrarily. Signed-off-by: YiFei Zhu Reviewed-by: Stanislav Fomichev Link: https://lore.kernel.org/r/73b0325f5c29912ccea7ea57ec1ed4d388fc1d37.1639619851.git.zhuyifei@google.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 20 +++++++++++--------- include/linux/filter.h | 5 ++++- 2 files changed, 15 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 83da1764fcfe..7b0c11f414d0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1245,6 +1245,7 @@ struct bpf_run_ctx {}; struct bpf_cg_run_ctx { struct bpf_run_ctx run_ctx; const struct bpf_prog_array_item *prog_item; + int retval; }; struct bpf_trace_run_ctx { @@ -1280,16 +1281,16 @@ typedef u32 (*bpf_prog_run_fn)(const struct bpf_prog *prog, const void *ctx); static __always_inline int BPF_PROG_RUN_ARRAY_CG_FLAGS(const struct bpf_prog_array __rcu *array_rcu, const void *ctx, bpf_prog_run_fn run_prog, - u32 *ret_flags) + int retval, u32 *ret_flags) { const struct bpf_prog_array_item *item; const struct bpf_prog *prog; const struct bpf_prog_array *array; struct bpf_run_ctx *old_run_ctx; struct bpf_cg_run_ctx run_ctx; - int ret = 0; u32 func_ret; + run_ctx.retval = retval; migrate_disable(); rcu_read_lock(); array = rcu_dereference(array_rcu); @@ -1299,27 +1300,28 @@ BPF_PROG_RUN_ARRAY_CG_FLAGS(const struct bpf_prog_array __rcu *array_rcu, run_ctx.prog_item = item; func_ret = run_prog(prog, ctx); if (!(func_ret & 1)) - ret = -EPERM; + run_ctx.retval = -EPERM; *(ret_flags) |= (func_ret >> 1); item++; } bpf_reset_run_ctx(old_run_ctx); rcu_read_unlock(); migrate_enable(); - return ret; + return run_ctx.retval; } static __always_inline int BPF_PROG_RUN_ARRAY_CG(const struct bpf_prog_array __rcu *array_rcu, - const void *ctx, bpf_prog_run_fn run_prog) + const void *ctx, bpf_prog_run_fn run_prog, + int retval) { const struct bpf_prog_array_item *item; const struct bpf_prog *prog; const struct bpf_prog_array *array; struct bpf_run_ctx *old_run_ctx; struct bpf_cg_run_ctx run_ctx; - int ret = 0; + run_ctx.retval = retval; migrate_disable(); rcu_read_lock(); array = rcu_dereference(array_rcu); @@ -1328,13 +1330,13 @@ BPF_PROG_RUN_ARRAY_CG(const struct bpf_prog_array __rcu *array_rcu, while ((prog = READ_ONCE(item->prog))) { run_ctx.prog_item = item; if (!run_prog(prog, ctx)) - ret = -EPERM; + run_ctx.retval = -EPERM; item++; } bpf_reset_run_ctx(old_run_ctx); rcu_read_unlock(); migrate_enable(); - return ret; + return run_ctx.retval; } static __always_inline u32 @@ -1394,7 +1396,7 @@ out: u32 _flags = 0; \ bool _cn; \ u32 _ret; \ - _ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(array, ctx, func, &_flags); \ + _ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(array, ctx, func, 0, &_flags); \ _cn = _flags & BPF_RET_SET_CN; \ if (!_ret) \ _ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS); \ diff --git a/include/linux/filter.h b/include/linux/filter.h index 71fa57b88bfc..d23e999dc032 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1356,7 +1356,10 @@ struct bpf_sockopt_kern { s32 level; s32 optname; s32 optlen; - s32 retval; + /* for retval in struct bpf_cg_run_ctx */ + struct task_struct *current_task; + /* Temporary "register" for indirect stores to ppos. */ + u64 tmp_reg; }; int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len); -- cgit v1.2.3 From b44123b4a3dcad4664d3a0f72c011ffd4c9c4d93 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Thu, 16 Dec 2021 02:04:27 +0000 Subject: bpf: Add cgroup helpers bpf_{get,set}_retval to get/set syscall return value The helpers continue to use int for retval because all the hooks are int-returning rather than long-returning. The return value of bpf_set_retval is int for future-proofing, in case in the future there may be errors trying to set the retval. After the previous patch, if a program rejects a syscall by returning 0, an -EPERM will be generated no matter if the retval is already set to -err. This patch change it being forced only if retval is not -err. This is because we want to support, for example, invoking bpf_set_retval(-EINVAL) and return 0, and have the syscall return value be -EINVAL not -EPERM. For BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY, the prior behavior is that, if the return value is NET_XMIT_DROP, the packet is silently dropped. We preserve this behavior for backward compatibility reasons, so even if an errno is set, the errno does not return to caller. However, setting a non-err to retval cannot propagate so this is not allowed and we return a -EFAULT in that case. Signed-off-by: YiFei Zhu Reviewed-by: Stanislav Fomichev Link: https://lore.kernel.org/r/b4013fd5d16bed0b01977c1fafdeae12e1de61fb.1639619851.git.zhuyifei@google.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 10 ++++++---- include/uapi/linux/bpf.h | 18 ++++++++++++++++++ 2 files changed, 24 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7b0c11f414d0..dce54eb0aae8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1299,7 +1299,7 @@ BPF_PROG_RUN_ARRAY_CG_FLAGS(const struct bpf_prog_array __rcu *array_rcu, while ((prog = READ_ONCE(item->prog))) { run_ctx.prog_item = item; func_ret = run_prog(prog, ctx); - if (!(func_ret & 1)) + if (!(func_ret & 1) && !IS_ERR_VALUE((long)run_ctx.retval)) run_ctx.retval = -EPERM; *(ret_flags) |= (func_ret >> 1); item++; @@ -1329,7 +1329,7 @@ BPF_PROG_RUN_ARRAY_CG(const struct bpf_prog_array __rcu *array_rcu, old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); while ((prog = READ_ONCE(item->prog))) { run_ctx.prog_item = item; - if (!run_prog(prog, ctx)) + if (!run_prog(prog, ctx) && !IS_ERR_VALUE((long)run_ctx.retval)) run_ctx.retval = -EPERM; item++; } @@ -1389,7 +1389,7 @@ out: * 0: NET_XMIT_SUCCESS skb should be transmitted * 1: NET_XMIT_DROP skb should be dropped and cn * 2: NET_XMIT_CN skb should be transmitted and cn - * 3: -EPERM skb should be dropped + * 3: -err skb should be dropped */ #define BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(array, ctx, func) \ ({ \ @@ -1398,10 +1398,12 @@ out: u32 _ret; \ _ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(array, ctx, func, 0, &_flags); \ _cn = _flags & BPF_RET_SET_CN; \ + if (_ret && !IS_ERR_VALUE((long)_ret)) \ + _ret = -EFAULT; \ if (!_ret) \ _ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS); \ else \ - _ret = (_cn ? NET_XMIT_DROP : -EPERM); \ + _ret = (_cn ? NET_XMIT_DROP : _ret); \ _ret; \ }) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a9c96c21330a..fe2272defcd9 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5033,6 +5033,22 @@ union bpf_attr { * * Return * The number of arguments of the traced function. + * + * int bpf_get_retval(void) + * Description + * Get the syscall's return value that will be returned to userspace. + * + * This helper is currently supported by cgroup programs only. + * Return + * The syscall's return value. + * + * int bpf_set_retval(int retval) + * Description + * Set the syscall's return value that will be returned to userspace. + * + * This helper is currently supported by cgroup programs only. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5221,6 +5237,8 @@ union bpf_attr { FN(get_func_arg), \ FN(get_func_ret), \ FN(get_func_arg_cnt), \ + FN(get_retval), \ + FN(set_retval), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 3368aa357f3ba133ae65fc26c04d24a1447a3903 Mon Sep 17 00:00:00 2001 From: Manish Mandlik Date: Tue, 11 Jan 2022 08:14:25 -0800 Subject: Bluetooth: msft: Handle MSFT Monitor Device Event Whenever the controller starts/stops monitoring a bt device, it sends MSFT Monitor Device event. Add handler to read this vendor event. Test performed: - Verified by logs that the MSFT Monitor Device event is received from the controller whenever it starts/stops monitoring a device. Signed-off-by: Manish Mandlik Reviewed-by: Miao-chen Chou Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 586f69d084a2..639fb9f57ae7 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -258,6 +258,15 @@ struct adv_info { #define HCI_ADV_TX_POWER_NO_PREFERENCE 0x7F +struct monitored_device { + struct list_head list; + + bdaddr_t bdaddr; + __u8 addr_type; + __u16 handle; + bool notified; +}; + struct adv_pattern { struct list_head list; __u8 ad_type; @@ -591,6 +600,8 @@ struct hci_dev { struct delayed_work interleave_scan; + struct list_head monitored_devices; + #if IS_ENABLED(CONFIG_BT_LEDS) struct led_trigger *power_led; #endif -- cgit v1.2.3 From 8d7f167752c3e4c45a39e76ffa6f7209413d3fa6 Mon Sep 17 00:00:00 2001 From: Manish Mandlik Date: Tue, 11 Jan 2022 08:14:26 -0800 Subject: Bluetooth: mgmt: Add MGMT Adv Monitor Device Found/Lost events This patch introduces two new MGMT events for notifying the bluetoothd whenever the controller starts/stops monitoring a device. Test performed: - Verified by logs that the MSFT Monitor Device is received from the controller and the bluetoothd is notified whenever the controller starts/stops monitoring a device. Signed-off-by: Manish Mandlik Reviewed-by: Miao-chen Chou Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 3 +++ include/net/bluetooth/mgmt.h | 16 ++++++++++++++++ 2 files changed, 19 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 639fb9f57ae7..21eadb113a31 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -601,6 +601,7 @@ struct hci_dev { struct delayed_work interleave_scan; struct list_head monitored_devices; + bool advmon_pend_notify; #if IS_ENABLED(CONFIG_BT_LEDS) struct led_trigger *power_led; @@ -1858,6 +1859,8 @@ void mgmt_adv_monitor_removed(struct hci_dev *hdev, u16 handle); int mgmt_phy_configuration_changed(struct hci_dev *hdev, struct sock *skip); int mgmt_add_adv_patterns_monitor_complete(struct hci_dev *hdev, u8 status); int mgmt_remove_adv_monitor_complete(struct hci_dev *hdev, u8 status); +void mgmt_adv_monitor_device_lost(struct hci_dev *hdev, u16 handle, + bdaddr_t *bdaddr, u8 addr_type); u8 hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, u16 latency, u16 to_multiplier); diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index 107b25deae68..99266f7aebdc 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -1104,3 +1104,19 @@ struct mgmt_ev_controller_resume { #define MGMT_WAKE_REASON_NON_BT_WAKE 0x0 #define MGMT_WAKE_REASON_UNEXPECTED 0x1 #define MGMT_WAKE_REASON_REMOTE_WAKE 0x2 + +#define MGMT_EV_ADV_MONITOR_DEVICE_FOUND 0x002f +struct mgmt_ev_adv_monitor_device_found { + __le16 monitor_handle; + struct mgmt_addr_info addr; + __s8 rssi; + __le32 flags; + __le16 eir_len; + __u8 eir[0]; +} __packed; + +#define MGMT_EV_ADV_MONITOR_DEVICE_LOST 0x0030 +struct mgmt_ev_adv_monitor_device_lost { + __le16 monitor_handle; + struct mgmt_addr_info addr; +} __packed; -- cgit v1.2.3 From 748cd5729ac7421091316e32dcdffb0578563880 Mon Sep 17 00:00:00 2001 From: Di Zhu Date: Wed, 19 Jan 2022 09:40:04 +0800 Subject: bpf: support BPF_PROG_QUERY for progs attached to sockmap Right now there is no way to query whether BPF programs are attached to a sockmap or not. we can use the standard interface in libbpf to query, such as: bpf_prog_query(mapFd, BPF_SK_SKB_STREAM_PARSER, 0, NULL, ...); the mapFd is the fd of sockmap. Signed-off-by: Di Zhu Acked-by: Yonghong Song Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/r/20220119014005.1209-1-zhudi2@huawei.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index dce54eb0aae8..80e3387ea3af 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2069,6 +2069,9 @@ int bpf_prog_test_run_syscall(struct bpf_prog *prog, int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog); int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype); int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags); +int sock_map_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr); + void sock_map_unhash(struct sock *sk); void sock_map_close(struct sock *sk, long timeout); #else @@ -2122,6 +2125,12 @@ static inline int sock_map_update_elem_sys(struct bpf_map *map, void *key, void { return -EOPNOTSUPP; } + +static inline int sock_map_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return -EINVAL; +} #endif /* CONFIG_BPF_SYSCALL */ #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ -- cgit v1.2.3 From d16697cb6261d4cc23422e6b1cb2759df8aa76d0 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Fri, 21 Jan 2022 11:09:44 +0100 Subject: net: skbuff: add size metadata to skb_shared_info for xdp Introduce xdp_frags_size field in skb_shared_info data structure to store xdp_buff/xdp_frame frame paged size (xdp_frags_size will be used in xdp frags support). In order to not increase skb_shared_info size we will use a hole due to skb_shared_info alignment. Acked-by: Toke Hoiland-Jorgensen Acked-by: John Fastabend Acked-by: Jesper Dangaard Brouer Signed-off-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/8a849819a3e0a143d540f78a3a5add76e17e980d.1642758637.git.lorenzo@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/skbuff.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index bf11e1fbd69b..8131d0de7559 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -557,6 +557,7 @@ struct skb_shared_info { * Warning : all fields before dataref are cleared in __alloc_skb() */ atomic_t dataref; + unsigned int xdp_frags_size; /* Intermediate layers must ensure that destructor_arg * remains valid until skb destructor */ -- cgit v1.2.3 From 2e88d4ff03013937028f5397268b21e10cf68713 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Fri, 21 Jan 2022 11:09:45 +0100 Subject: xdp: introduce flags field in xdp_buff/xdp_frame Introduce flags field in xdp_frame and xdp_buffer data structures to define additional buffer features. At the moment the only supported buffer feature is frags bit (XDP_FLAGS_HAS_FRAGS). frags bit is used to specify if this is a linear buffer (XDP_FLAGS_HAS_FRAGS not set) or a frags frame (XDP_FLAGS_HAS_FRAGS set). In the latter case the driver is expected to initialize the skb_shared_info structure at the end of the first buffer to link together subsequent buffers belonging to the same frame. Acked-by: Toke Hoiland-Jorgensen Acked-by: John Fastabend Acked-by: Jesper Dangaard Brouer Signed-off-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/e389f14f3a162c0a5bc6a2e1aa8dd01a90be117d.1642758637.git.lorenzo@kernel.org Signed-off-by: Alexei Starovoitov --- include/net/xdp.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include') diff --git a/include/net/xdp.h b/include/net/xdp.h index 8f0812e4996d..485e9495a690 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -66,6 +66,10 @@ struct xdp_txq_info { struct net_device *dev; }; +enum xdp_buff_flags { + XDP_FLAGS_HAS_FRAGS = BIT(0), /* non-linear xdp buff */ +}; + struct xdp_buff { void *data; void *data_end; @@ -74,13 +78,30 @@ struct xdp_buff { struct xdp_rxq_info *rxq; struct xdp_txq_info *txq; u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/ + u32 flags; /* supported values defined in xdp_buff_flags */ }; +static __always_inline bool xdp_buff_has_frags(struct xdp_buff *xdp) +{ + return !!(xdp->flags & XDP_FLAGS_HAS_FRAGS); +} + +static __always_inline void xdp_buff_set_frags_flag(struct xdp_buff *xdp) +{ + xdp->flags |= XDP_FLAGS_HAS_FRAGS; +} + +static __always_inline void xdp_buff_clear_frags_flag(struct xdp_buff *xdp) +{ + xdp->flags &= ~XDP_FLAGS_HAS_FRAGS; +} + static __always_inline void xdp_init_buff(struct xdp_buff *xdp, u32 frame_sz, struct xdp_rxq_info *rxq) { xdp->frame_sz = frame_sz; xdp->rxq = rxq; + xdp->flags = 0; } static __always_inline void @@ -122,8 +143,14 @@ struct xdp_frame { */ struct xdp_mem_info mem; struct net_device *dev_rx; /* used by cpumap */ + u32 flags; /* supported values defined in xdp_buff_flags */ }; +static __always_inline bool xdp_frame_has_frags(struct xdp_frame *frame) +{ + return !!(frame->flags & XDP_FLAGS_HAS_FRAGS); +} + #define XDP_BULK_QUEUE_SIZE 16 struct xdp_frame_bulk { int count; @@ -180,6 +207,7 @@ void xdp_convert_frame_to_buff(struct xdp_frame *frame, struct xdp_buff *xdp) xdp->data_end = frame->data + frame->len; xdp->data_meta = frame->data - frame->metasize; xdp->frame_sz = frame->frame_sz; + xdp->flags = frame->flags; } static inline @@ -206,6 +234,7 @@ int xdp_update_frame_from_buff(struct xdp_buff *xdp, xdp_frame->headroom = headroom - sizeof(*xdp_frame); xdp_frame->metasize = metasize; xdp_frame->frame_sz = xdp->frame_sz; + xdp_frame->flags = xdp->flags; return 0; } -- cgit v1.2.3 From d65a1906b31246492449eafe9cace188cb59e26c Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Fri, 21 Jan 2022 11:09:48 +0100 Subject: net: xdp: add xdp_update_skb_shared_info utility routine Introduce xdp_update_skb_shared_info routine to update frags array metadata in skb_shared_info data structure converting to a skb from a xdp_buff or xdp_frame. According to the current skb_shared_info architecture in xdp_frame/xdp_buff and to the xdp frags support, there is no need to run skb_add_rx_frag() and reset frags array converting the buffer to a skb since the frag array will be in the same position for xdp_buff/xdp_frame and for the skb, we just need to update memory metadata. Introduce XDP_FLAGS_PF_MEMALLOC flag in xdp_buff_flags in order to mark the xdp_buff or xdp_frame as under memory-pressure if pages of the frags array are under memory pressure. Doing so we can avoid looping over all fragments in xdp_update_skb_shared_info routine. The driver is expected to set the flag constructing the xdp_buffer using xdp_buff_set_frag_pfmemalloc utility routine. Rely on xdp_update_skb_shared_info in __xdp_build_skb_from_frame routine converting the non-linear xdp_frame to a skb after performing a XDP_REDIRECT. Acked-by: Toke Hoiland-Jorgensen Acked-by: John Fastabend Acked-by: Jesper Dangaard Brouer Signed-off-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/bfd23fb8a8d7438724f7819c567cdf99ffd6226f.1642758637.git.lorenzo@kernel.org Signed-off-by: Alexei Starovoitov --- include/net/xdp.h | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/xdp.h b/include/net/xdp.h index 485e9495a690..1f8641ec658e 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -67,7 +67,10 @@ struct xdp_txq_info { }; enum xdp_buff_flags { - XDP_FLAGS_HAS_FRAGS = BIT(0), /* non-linear xdp buff */ + XDP_FLAGS_HAS_FRAGS = BIT(0), /* non-linear xdp buff */ + XDP_FLAGS_FRAGS_PF_MEMALLOC = BIT(1), /* xdp paged memory is under + * pressure + */ }; struct xdp_buff { @@ -96,6 +99,16 @@ static __always_inline void xdp_buff_clear_frags_flag(struct xdp_buff *xdp) xdp->flags &= ~XDP_FLAGS_HAS_FRAGS; } +static __always_inline bool xdp_buff_is_frag_pfmemalloc(struct xdp_buff *xdp) +{ + return !!(xdp->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC); +} + +static __always_inline void xdp_buff_set_frag_pfmemalloc(struct xdp_buff *xdp) +{ + xdp->flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC; +} + static __always_inline void xdp_init_buff(struct xdp_buff *xdp, u32 frame_sz, struct xdp_rxq_info *rxq) { @@ -151,6 +164,11 @@ static __always_inline bool xdp_frame_has_frags(struct xdp_frame *frame) return !!(frame->flags & XDP_FLAGS_HAS_FRAGS); } +static __always_inline bool xdp_frame_is_frag_pfmemalloc(struct xdp_frame *frame) +{ + return !!(frame->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC); +} + #define XDP_BULK_QUEUE_SIZE 16 struct xdp_frame_bulk { int count; @@ -186,6 +204,19 @@ static inline void xdp_scrub_frame(struct xdp_frame *frame) frame->dev_rx = NULL; } +static inline void +xdp_update_skb_shared_info(struct sk_buff *skb, u8 nr_frags, + unsigned int size, unsigned int truesize, + bool pfmemalloc) +{ + skb_shinfo(skb)->nr_frags = nr_frags; + + skb->len += size; + skb->data_len += size; + skb->truesize += truesize; + skb->pfmemalloc |= pfmemalloc; +} + /* Avoids inlining WARN macro in fast-path */ void xdp_warn(const char *msg, const char *func, const int line); #define XDP_WARN(msg) xdp_warn(msg, __func__, __LINE__) -- cgit v1.2.3 From 7c48cb0176c6d6d3b55029f7ff4ffa05faee6446 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Fri, 21 Jan 2022 11:09:50 +0100 Subject: xdp: add frags support to xdp_return_{buff/frame} Take into account if the received xdp_buff/xdp_frame is non-linear recycling/returning the frame memory to the allocator or into xdp_frame_bulk. Acked-by: Toke Hoiland-Jorgensen Acked-by: John Fastabend Signed-off-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/a961069febc868508ce1bdf5e53a343eb4e57cb2.1642758637.git.lorenzo@kernel.org Signed-off-by: Alexei Starovoitov --- include/net/xdp.h | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/xdp.h b/include/net/xdp.h index 1f8641ec658e..8463dea8b4db 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -306,10 +306,24 @@ void __xdp_release_frame(void *data, struct xdp_mem_info *mem); static inline void xdp_release_frame(struct xdp_frame *xdpf) { struct xdp_mem_info *mem = &xdpf->mem; + struct skb_shared_info *sinfo; + int i; /* Curr only page_pool needs this */ - if (mem->type == MEM_TYPE_PAGE_POOL) - __xdp_release_frame(xdpf->data, mem); + if (mem->type != MEM_TYPE_PAGE_POOL) + return; + + if (likely(!xdp_frame_has_frags(xdpf))) + goto out; + + sinfo = xdp_get_shared_info_from_frame(xdpf); + for (i = 0; i < sinfo->nr_frags; i++) { + struct page *page = skb_frag_page(&sinfo->frags[i]); + + __xdp_release_frame(page_address(page), mem); + } +out: + __xdp_release_frame(xdpf->data, mem); } int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, -- cgit v1.2.3 From c2f2cdbeffda7b153c19e0f3d73149c41026c0db Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Fri, 21 Jan 2022 11:09:52 +0100 Subject: bpf: introduce BPF_F_XDP_HAS_FRAGS flag in prog_flags loading the ebpf program Introduce BPF_F_XDP_HAS_FRAGS and the related field in bpf_prog_aux in order to notify the driver the loaded program support xdp frags. Acked-by: Toke Hoiland-Jorgensen Acked-by: John Fastabend Signed-off-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/db2e8075b7032a356003f407d1b0deb99adaa0ed.1642758637.git.lorenzo@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 5 +++++ 2 files changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 80e3387ea3af..e93ed028a030 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -933,6 +933,7 @@ struct bpf_prog_aux { bool func_proto_unreliable; bool sleepable; bool tail_call_reachable; + bool xdp_has_frags; struct hlist_node tramp_hlist; /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */ const struct btf_type *attach_func_proto; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index fe2272defcd9..945649c67e03 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1113,6 +1113,11 @@ enum bpf_link_type { */ #define BPF_F_SLEEPABLE (1U << 4) +/* If BPF_F_XDP_HAS_FRAGS is used in BPF_PROG_LOAD command, the loaded program + * fully support xdp frags. + */ +#define BPF_F_XDP_HAS_FRAGS (1U << 5) + /* When BPF ldimm64's insn[0].src_reg != 0 then this can have * the following extensions: * -- cgit v1.2.3 From 0165cc817075cf701e4289838f1d925ff1911b3e Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Fri, 21 Jan 2022 11:09:54 +0100 Subject: bpf: introduce bpf_xdp_get_buff_len helper Introduce bpf_xdp_get_buff_len helper in order to return the xdp buffer total size (linear and paged area) Acked-by: Toke Hoiland-Jorgensen Acked-by: John Fastabend Signed-off-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/aac9ac3504c84026cf66a3c71b7c5ae89bc991be.1642758637.git.lorenzo@kernel.org Signed-off-by: Alexei Starovoitov --- include/net/xdp.h | 14 ++++++++++++++ include/uapi/linux/bpf.h | 7 +++++++ 2 files changed, 21 insertions(+) (limited to 'include') diff --git a/include/net/xdp.h b/include/net/xdp.h index 8463dea8b4db..52b593321956 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -145,6 +145,20 @@ xdp_get_shared_info_from_buff(struct xdp_buff *xdp) return (struct skb_shared_info *)xdp_data_hard_end(xdp); } +static __always_inline unsigned int xdp_get_buff_len(struct xdp_buff *xdp) +{ + unsigned int len = xdp->data_end - xdp->data; + struct skb_shared_info *sinfo; + + if (likely(!xdp_buff_has_frags(xdp))) + goto out; + + sinfo = xdp_get_shared_info_from_buff(xdp); + len += sinfo->xdp_frags_size; +out: + return len; +} + struct xdp_frame { void *data; u16 len; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 945649c67e03..5a28772063f6 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5054,6 +5054,12 @@ union bpf_attr { * This helper is currently supported by cgroup programs only. * Return * 0 on success, or a negative error in case of failure. + * + * u64 bpf_xdp_get_buff_len(struct xdp_buff *xdp_md) + * Description + * Get the total size of a given xdp buff (linear and paged area) + * Return + * The total size of a given xdp buffer. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5244,6 +5250,7 @@ union bpf_attr { FN(get_func_arg_cnt), \ FN(get_retval), \ FN(set_retval), \ + FN(xdp_get_buff_len), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From bf25146a5595269810b1f47d048f114c5ff9f544 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Fri, 21 Jan 2022 11:09:55 +0100 Subject: bpf: add frags support to the bpf_xdp_adjust_tail() API This change adds support for tail growing and shrinking for XDP frags. When called on a non-linear packet with a grow request, it will work on the last fragment of the packet. So the maximum grow size is the last fragments tailroom, i.e. no new buffer will be allocated. A XDP frags capable driver is expected to set frag_size in xdp_rxq_info data structure to notify the XDP core the fragment size. frag_size set to 0 is interpreted by the XDP core as tail growing is not allowed. Introduce __xdp_rxq_info_reg utility routine to initialize frag_size field. When shrinking, it will work from the last fragment, all the way down to the base buffer depending on the shrinking size. It's important to mention that once you shrink down the fragment(s) are freed, so you can not grow again to the original size. Acked-by: Toke Hoiland-Jorgensen Acked-by: John Fastabend Acked-by: Jakub Kicinski Co-developed-by: Lorenzo Bianconi Signed-off-by: Lorenzo Bianconi Signed-off-by: Eelco Chaudron Link: https://lore.kernel.org/r/eabda3485dda4f2f158b477729337327e609461d.1642758637.git.lorenzo@kernel.org Signed-off-by: Alexei Starovoitov --- include/net/xdp.h | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/xdp.h b/include/net/xdp.h index 52b593321956..b7721c3e4d1f 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -60,6 +60,7 @@ struct xdp_rxq_info { u32 reg_state; struct xdp_mem_info mem; unsigned int napi_id; + u32 frag_size; } ____cacheline_aligned; /* perf critical, avoid false-sharing */ struct xdp_txq_info { @@ -304,6 +305,8 @@ struct xdp_frame *xdp_convert_buff_to_frame(struct xdp_buff *xdp) return xdp_frame; } +void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, + struct xdp_buff *xdp); void xdp_return_frame(struct xdp_frame *xdpf); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf); void xdp_return_buff(struct xdp_buff *xdp); @@ -340,8 +343,17 @@ out: __xdp_release_frame(xdpf->data, mem); } -int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, - struct net_device *dev, u32 queue_index, unsigned int napi_id); +int __xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, + struct net_device *dev, u32 queue_index, + unsigned int napi_id, u32 frag_size); +static inline int +xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, + struct net_device *dev, u32 queue_index, + unsigned int napi_id) +{ + return __xdp_rxq_info_reg(xdp_rxq, dev, queue_index, napi_id, 0); +} + void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq); void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq); bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq); -- cgit v1.2.3 From f45d5b6ce2e835834c94b8b700787984f02cd662 Mon Sep 17 00:00:00 2001 From: Toke Hoiland-Jorgensen Date: Fri, 21 Jan 2022 11:10:02 +0100 Subject: bpf: generalise tail call map compatibility check The check for tail call map compatibility ensures that tail calls only happen between maps of the same type. To ensure backwards compatibility for XDP frags we need a similar type of check for cpumap and devmap programs, so move the state from bpf_array_aux into bpf_map, add xdp_has_frags to the check, and apply the same check to cpumap and devmap. Acked-by: John Fastabend Co-developed-by: Lorenzo Bianconi Signed-off-by: Lorenzo Bianconi Signed-off-by: Toke Hoiland-Jorgensen Link: https://lore.kernel.org/r/f19fd97c0328a39927f3ad03e1ca6b43fd53cdfd.1642758637.git.lorenzo@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e93ed028a030..e8ec8d2f2fe3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -194,6 +194,17 @@ struct bpf_map { struct work_struct work; struct mutex freeze_mutex; atomic64_t writecnt; + /* 'Ownership' of program-containing map is claimed by the first program + * that is going to use this map or by the first program which FD is + * stored in the map to make sure that all callers and callees have the + * same prog type, JITed flag and xdp_has_frags flag. + */ + struct { + spinlock_t lock; + enum bpf_prog_type type; + bool jited; + bool xdp_has_frags; + } owner; }; static inline bool map_value_has_spin_lock(const struct bpf_map *map) @@ -994,16 +1005,6 @@ struct bpf_prog_aux { }; struct bpf_array_aux { - /* 'Ownership' of prog array is claimed by the first program that - * is going to use this map or by the first program which FD is - * stored in the map to make sure that all callers and callees have - * the same prog type and JITed flag. - */ - struct { - spinlock_t lock; - enum bpf_prog_type type; - bool jited; - } owner; /* Programs with direct jumps into programs part of this array. */ struct list_head poke_progs; struct bpf_map *map; @@ -1178,7 +1179,14 @@ struct bpf_event_entry { struct rcu_head rcu; }; -bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp); +static inline bool map_type_contains_progs(struct bpf_map *map) +{ + return map->map_type == BPF_MAP_TYPE_PROG_ARRAY || + map->map_type == BPF_MAP_TYPE_DEVMAP || + map->map_type == BPF_MAP_TYPE_CPUMAP; +} + +bool bpf_prog_map_compatible(struct bpf_map *map, const struct bpf_prog *fp); int bpf_prog_calc_tag(struct bpf_prog *fp); const struct bpf_func_proto *bpf_get_trace_printk_proto(void); -- cgit v1.2.3 From 3f364222d032eea6b245780e845ad213dab28cdd Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Fri, 21 Jan 2022 11:10:03 +0100 Subject: net: xdp: introduce bpf_xdp_pointer utility routine Similar to skb_header_pointer, introduce bpf_xdp_pointer utility routine to return a pointer to a given position in the xdp_buff if the requested area (offset + len) is contained in a contiguous memory area otherwise it will be copied in a bounce buffer provided by the caller. Similar to the tc counterpart, introduce the two following xdp helpers: - bpf_xdp_load_bytes - bpf_xdp_store_bytes Reviewed-by: Eelco Chaudron Acked-by: Toke Hoiland-Jorgensen Acked-by: John Fastabend Acked-by: Jakub Kicinski Signed-off-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/ab285c1efdd5b7a9d361348b1e7d3ef49f6382b3.1642758637.git.lorenzo@kernel.org Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 5a28772063f6..16a7574292a5 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5060,6 +5060,22 @@ union bpf_attr { * Get the total size of a given xdp buff (linear and paged area) * Return * The total size of a given xdp buffer. + * + * long bpf_xdp_load_bytes(struct xdp_buff *xdp_md, u32 offset, void *buf, u32 len) + * Description + * This helper is provided as an easy way to load data from a + * xdp buffer. It can be used to load *len* bytes from *offset* from + * the frame associated to *xdp_md*, into the buffer pointed by + * *buf*. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_xdp_store_bytes(struct xdp_buff *xdp_md, u32 offset, void *buf, u32 len) + * Description + * Store *len* bytes from buffer *buf* into the frame + * associated to *xdp_md*, at *offset*. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5251,6 +5267,8 @@ union bpf_attr { FN(get_retval), \ FN(set_retval), \ FN(xdp_get_buff_len), \ + FN(xdp_load_bytes), \ + FN(xdp_store_bytes), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From d5ebaa7c5f6f688959e8d40840b2249ede63b8ed Mon Sep 17 00:00:00 2001 From: Soenke Huster Date: Sun, 23 Jan 2022 15:06:24 +0100 Subject: Bluetooth: hci_event: Ignore multiple conn complete events When one of the three connection complete events is received multiple times for the same handle, the device is registered multiple times which leads to memory corruptions. Therefore, consequent events for a single connection are ignored. The conn->state can hold different values, therefore HCI_CONN_HANDLE_UNSET is introduced to identify new connections. To make sure the events do not contain this or another invalid handle HCI_CONN_HANDLE_MAX and checks are introduced. Buglink: https://bugzilla.kernel.org/show_bug.cgi?id=215497 Signed-off-by: Soenke Huster Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 21eadb113a31..f5caff1ddb29 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -303,6 +303,9 @@ struct adv_monitor { #define HCI_MAX_SHORT_NAME_LENGTH 10 +#define HCI_CONN_HANDLE_UNSET 0xffff +#define HCI_CONN_HANDLE_MAX 0x0eff + /* Min encryption key size to match with SMP */ #define HCI_MIN_ENC_KEY_SIZE 7 -- cgit v1.2.3 From 376040e47334c6dc6a939a32197acceb00fe4acf Mon Sep 17 00:00:00 2001 From: Kenny Yu Date: Mon, 24 Jan 2022 10:54:01 -0800 Subject: bpf: Add bpf_copy_from_user_task() helper This adds a helper for bpf programs to read the memory of other tasks. As an example use case at Meta, we are using a bpf task iterator program and this new helper to print C++ async stack traces for all threads of a given process. Signed-off-by: Kenny Yu Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20220124185403.468466-3-kennyyu@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 11 +++++++++++ 2 files changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8c92c974bd12..394305a5e02f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2243,6 +2243,7 @@ extern const struct bpf_func_proto bpf_kallsyms_lookup_name_proto; extern const struct bpf_func_proto bpf_find_vma_proto; extern const struct bpf_func_proto bpf_loop_proto; extern const struct bpf_func_proto bpf_strncmp_proto; +extern const struct bpf_func_proto bpf_copy_from_user_task_proto; const struct bpf_func_proto *tracing_prog_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 16a7574292a5..4a2f7041ebae 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5076,6 +5076,16 @@ union bpf_attr { * associated to *xdp_md*, at *offset*. * Return * 0 on success, or a negative error in case of failure. + * + * long bpf_copy_from_user_task(void *dst, u32 size, const void *user_ptr, struct task_struct *tsk, u64 flags) + * Description + * Read *size* bytes from user space address *user_ptr* in *tsk*'s + * address space, and stores the data in *dst*. *flags* is not + * used yet and is provided for future extensibility. This helper + * can only be used by sleepable programs. + * Return + * 0 on success, or a negative error in case of failure. On error + * *dst* buffer is zeroed out. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5269,6 +5279,7 @@ union bpf_attr { FN(xdp_get_buff_len), \ FN(xdp_load_bytes), \ FN(xdp_store_bytes), \ + FN(copy_from_user_task), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From d507204d3c5cc57d9a8bdf0a477615bb59ea1611 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 24 Jan 2022 12:24:52 -0800 Subject: tcp/dccp: add tw->tw_bslot We want to allow inet_twsk_kill() working even if netns has been dismantled/freed, to get rid of inet_twsk_purge(). This patch adds tw->tw_bslot to cache the bind bucket slot so that inet_twsk_kill() no longer needs to dereference twsk_net(tw) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_timewait_sock.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index dfd919b3119e..c221fe2b77dd 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -72,6 +72,7 @@ struct inet_timewait_sock { tw_tos : 8; u32 tw_txhash; u32 tw_priority; + u32 tw_bslot; /* bind bucket slot */ struct timer_list tw_timer; struct inet_bind_bucket *tw_tb; }; -- cgit v1.2.3 From 27dd35e02235902933626469a1c9198612d72564 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 24 Jan 2022 12:24:53 -0800 Subject: tcp/dccp: no longer use twsk_net(tw) from tw_timer_handler() We will soon get rid of inet_twsk_purge(). This means that tw_timer_handler() might fire after a netns has been dismantled/freed. Instead of adding a function (and data structure) to find a netns from tw->tw_net_cookie, just update the SNMP counters a bit earlier, when the netns is known to be alive. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_timewait_sock.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index c221fe2b77dd..b323db969b8b 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -65,10 +65,9 @@ struct inet_timewait_sock { /* these three are in inet_sock */ __be16 tw_sport; /* And these are ours. */ - unsigned int tw_kill : 1, - tw_transparent : 1, + unsigned int tw_transparent : 1, tw_flowlabel : 20, - tw_pad : 2, /* 2 bits hole */ + tw_pad : 3, /* 3 bits hole */ tw_tos : 8; u32 tw_txhash; u32 tw_priority; -- cgit v1.2.3 From 0dad4087a86a2cbe177404dc73f18ada26a2c390 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 24 Jan 2022 12:24:54 -0800 Subject: tcp/dccp: get rid of inet_twsk_purge() Prior patches in the series made sure tw_timer_handler() can be fired after netns has been dismantled/freed. We no longer have to scan a potentially big TCP ehash table at netns dismantle. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_timewait_sock.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index b323db969b8b..463ae5d33eb0 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -110,8 +110,6 @@ static inline void inet_twsk_reschedule(struct inet_timewait_sock *tw, int timeo void inet_twsk_deschedule_put(struct inet_timewait_sock *tw); -void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family); - static inline struct net *twsk_net(const struct inet_timewait_sock *twsk) { -- cgit v1.2.3 From a15c89c703d43490ea68ea4516553d4ea4f6b1e1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 24 Jan 2022 12:24:55 -0800 Subject: ipv4: do not use per netns icmp sockets Back in linux-2.6.25 (commit 4a6ad7a141cb "[NETNS]: Make icmp_sk per namespace."), we added private per-cpu/per-netns ipv4 icmp sockets. This adds memory and cpu costs, which do not seem needed. Now typical servers have 256 or more cores, this adds considerable tax to netns users. icmp sockets are used from BH context, are not receiving packets, and do not store any persistent state but the 'struct net' pointer. icmp_xmit_lock() already makes sure to lock the chosen per-cpu socket. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 78557643526e..639a31638159 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -70,7 +70,6 @@ struct netns_ipv4 { struct hlist_head *fib_table_hash; struct sock *fibnl; - struct sock * __percpu *icmp_sk; struct sock *mc_autojoin_sk; struct inet_peer_base *peers; -- cgit v1.2.3 From 6a17b961ec19cd61ca646a6655ab93e8f6fe15c0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 24 Jan 2022 12:24:56 -0800 Subject: ipv6: do not use per netns icmp sockets Back in linux-2.6.25 (commit 98c6d1b261e7 "[NETNS]: Make icmpv6_sk per namespace.", we added private per-cpu/per-netns ipv6 icmp sockets. This adds memory and cpu costs, which do not seem needed. Now typical servers have 256 or more cores, this adds considerable tax to netns users. icmp sockets are used from BH context, are not receiving packets, and do not store any persistent state but the 'struct net' pointer. icmpv6_xmit_lock() already makes sure to lock the chosen per-cpu socket. This patch has a considerable impact on the number of netns that the worker thread in cleanup_net() can dismantle per second, because ip6mr_sk_done() is no longer called, meaning we no longer acquire the rtnl mutex, competing with other threads adding new netns. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv6.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index a4b550380316..30cdfc4e1615 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -88,7 +88,6 @@ struct netns_ipv6 { struct fib6_table *fib6_local_tbl; struct fib_rules_ops *fib6_rules_ops; #endif - struct sock * __percpu *icmp_sk; struct sock *ndisc_sk; struct sock *tcp_sk; struct sock *igmp_sk; -- cgit v1.2.3 From 37ba017dcc3b1123206808979834655ddcf93251 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 24 Jan 2022 12:24:57 -0800 Subject: ipv4/tcp: do not use per netns ctl sockets TCP ipv4 uses per-cpu/per-netns ctl sockets in order to send RST and some ACK packets (on behalf of TIMEWAIT sockets). This adds memory and cpu costs, which do not seem needed. Now typical servers have 256 or more cores, this adds considerable tax to netns users. tcp sockets are used from BH context, are not receiving packets, and do not store any persistent state but the 'struct net' pointer in order to be able to use IPv4 output functions. Note that I attempted a related change in the past, that had to be hot-fixed in commit bdbbb8527b6f ("ipv4: tcp: get rid of ugly unicast_sock") This patch could very well surface old bugs, on layers not taking care of sk->sk_kern_sock properly. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 639a31638159..22b4c6df1d2b 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -73,7 +73,6 @@ struct netns_ipv4 { struct sock *mc_autojoin_sk; struct inet_peer_base *peers; - struct sock * __percpu *tcp_sk; struct fqdir *fqdir; u8 sysctl_icmp_echo_ignore_all; -- cgit v1.2.3 From be6ec5b7026620b931e0fa9287d24ad2cd2ab9b6 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Wed, 26 Jan 2022 10:25:55 +0000 Subject: net: xpcs: add support for retrieving supported interface modes Add a function to the xpcs driver to retrieve the supported PHY interface modes, which can be used by drivers to fill in phylink's supported_interfaces mask. We validate the interface bit index to ensure that it fits within the bitmap as xpcs lists PHY_INTERFACE_MODE_MAX in an entry. Tested-by: Wong Vee Khee # Intel EHL Signed-off-by: Russell King (Oracle) Signed-off-by: David S. Miller --- include/linux/pcs/pcs-xpcs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h index add077a81b21..3126a4924d92 100644 --- a/include/linux/pcs/pcs-xpcs.h +++ b/include/linux/pcs/pcs-xpcs.h @@ -33,6 +33,7 @@ int xpcs_do_config(struct dw_xpcs *xpcs, phy_interface_t interface, unsigned int mode); void xpcs_validate(struct dw_xpcs *xpcs, unsigned long *supported, struct phylink_link_state *state); +void xpcs_get_interfaces(struct dw_xpcs *xpcs, unsigned long *interfaces); int xpcs_config_eee(struct dw_xpcs *xpcs, int mult_fact_100ns, int enable); struct dw_xpcs *xpcs_create(struct mdio_device *mdiodev, -- cgit v1.2.3 From fe70fb74b56407c1a5be347258082f8abbe7956d Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Wed, 26 Jan 2022 10:26:10 +0000 Subject: net: stmmac/xpcs: convert to pcs_validate() stmmac explicitly calls the xpcs driver to validate the ethtool linkmodes. This is no longer necessary as phylink now supports validation through a PCS method. Convert both drivers to use this new mechanism. Tested-by: Wong Vee Khee # Intel EHL Signed-off-by: Russell King (Oracle) Signed-off-by: David S. Miller --- include/linux/pcs/pcs-xpcs.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h index 3126a4924d92..266eb26fb029 100644 --- a/include/linux/pcs/pcs-xpcs.h +++ b/include/linux/pcs/pcs-xpcs.h @@ -31,8 +31,6 @@ void xpcs_link_up(struct phylink_pcs *pcs, unsigned int mode, phy_interface_t interface, int speed, int duplex); int xpcs_do_config(struct dw_xpcs *xpcs, phy_interface_t interface, unsigned int mode); -void xpcs_validate(struct dw_xpcs *xpcs, unsigned long *supported, - struct phylink_link_state *state); void xpcs_get_interfaces(struct dw_xpcs *xpcs, unsigned long *interfaces); int xpcs_config_eee(struct dw_xpcs *xpcs, int mult_fact_100ns, int enable); -- cgit v1.2.3 From fbb8295248e1d6f576d444309fcf79356008eac1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 26 Jan 2022 10:07:14 -0800 Subject: tcp: allocate tcp_death_row outside of struct netns_ipv4 I forgot tcp had per netns tracking of timewait sockets, and their sysctl to change the limit. After 0dad4087a86a ("tcp/dccp: get rid of inet_twsk_purge()"), whole struct net can be freed before last tw socket is freed. We need to allocate a separate struct inet_timewait_death_row object per netns. tw_count becomes a refcount and gains associated debugging infrastructure. BUG: KASAN: use-after-free in inet_twsk_kill+0x358/0x3c0 net/ipv4/inet_timewait_sock.c:46 Read of size 8 at addr ffff88807d5f9f40 by task kworker/1:7/3690 CPU: 1 PID: 3690 Comm: kworker/1:7 Not tainted 5.16.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: events pwq_unbound_release_workfn Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 print_address_description.constprop.0.cold+0x8d/0x336 mm/kasan/report.c:255 __kasan_report mm/kasan/report.c:442 [inline] kasan_report.cold+0x83/0xdf mm/kasan/report.c:459 inet_twsk_kill+0x358/0x3c0 net/ipv4/inet_timewait_sock.c:46 call_timer_fn+0x1a5/0x6b0 kernel/time/timer.c:1421 expire_timers kernel/time/timer.c:1466 [inline] __run_timers.part.0+0x67c/0xa30 kernel/time/timer.c:1734 __run_timers kernel/time/timer.c:1715 [inline] run_timer_softirq+0xb3/0x1d0 kernel/time/timer.c:1747 __do_softirq+0x29b/0x9c2 kernel/softirq.c:558 invoke_softirq kernel/softirq.c:432 [inline] __irq_exit_rcu+0x123/0x180 kernel/softirq.c:637 irq_exit_rcu+0x5/0x20 kernel/softirq.c:649 sysvec_apic_timer_interrupt+0x93/0xc0 arch/x86/kernel/apic/apic.c:1097 asm_sysvec_apic_timer_interrupt+0x12/0x20 arch/x86/include/asm/idtentry.h:638 RIP: 0010:lockdep_unregister_key+0x1c9/0x250 kernel/locking/lockdep.c:6328 Code: 00 00 00 48 89 ee e8 46 fd ff ff 4c 89 f7 e8 5e c9 ff ff e8 09 cc ff ff 9c 58 f6 c4 02 75 26 41 f7 c4 00 02 00 00 74 01 fb 5b <5d> 41 5c 41 5d 41 5e 41 5f e9 19 4a 08 00 0f 0b 5b 5d 41 5c 41 5d RSP: 0018:ffffc90004077cb8 EFLAGS: 00000206 RAX: 0000000000000046 RBX: ffff88807b61b498 RCX: 0000000000000001 RDX: dffffc0000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: ffff888077027128 R08: 0000000000000001 R09: ffffffff8f1ea4fc R10: fffffbfff1ff93ee R11: 000000000000af1e R12: 0000000000000246 R13: 0000000000000000 R14: ffffffff8ffc89b8 R15: ffffffff90157fb0 wq_unregister_lockdep kernel/workqueue.c:3508 [inline] pwq_unbound_release_workfn+0x254/0x340 kernel/workqueue.c:3746 process_one_work+0x9ac/0x1650 kernel/workqueue.c:2307 worker_thread+0x657/0x1110 kernel/workqueue.c:2454 kthread+0x2e9/0x3a0 kernel/kthread.c:377 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295 Allocated by task 3635: kasan_save_stack+0x1e/0x50 mm/kasan/common.c:38 kasan_set_track mm/kasan/common.c:46 [inline] set_alloc_info mm/kasan/common.c:437 [inline] __kasan_slab_alloc+0x90/0xc0 mm/kasan/common.c:470 kasan_slab_alloc include/linux/kasan.h:260 [inline] slab_post_alloc_hook mm/slab.h:732 [inline] slab_alloc_node mm/slub.c:3230 [inline] slab_alloc mm/slub.c:3238 [inline] kmem_cache_alloc+0x202/0x3a0 mm/slub.c:3243 kmem_cache_zalloc include/linux/slab.h:705 [inline] net_alloc net/core/net_namespace.c:407 [inline] copy_net_ns+0x125/0x760 net/core/net_namespace.c:462 create_new_namespaces+0x3f6/0xb20 kernel/nsproxy.c:110 unshare_nsproxy_namespaces+0xc1/0x1f0 kernel/nsproxy.c:226 ksys_unshare+0x445/0x920 kernel/fork.c:3048 __do_sys_unshare kernel/fork.c:3119 [inline] __se_sys_unshare kernel/fork.c:3117 [inline] __x64_sys_unshare+0x2d/0x40 kernel/fork.c:3117 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae The buggy address belongs to the object at ffff88807d5f9a80 which belongs to the cache net_namespace of size 6528 The buggy address is located 1216 bytes inside of 6528-byte region [ffff88807d5f9a80, ffff88807d5fb400) The buggy address belongs to the page: page:ffffea0001f57e00 refcount:1 mapcount:0 mapping:0000000000000000 index:0xffff88807d5f9a80 pfn:0x7d5f8 head:ffffea0001f57e00 order:3 compound_mapcount:0 compound_pincount:0 memcg:ffff888070023001 flags: 0xfff00000010200(slab|head|node=0|zone=1|lastcpupid=0x7ff) raw: 00fff00000010200 ffff888010dd4f48 ffffea0001404e08 ffff8880118fd000 raw: ffff88807d5f9a80 0000000000040002 00000001ffffffff ffff888070023001 page dumped because: kasan: bad access detected page_owner tracks the page as allocated page last allocated via order 3, migratetype Unmovable, gfp_mask 0xd20c0(__GFP_IO|__GFP_FS|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP|__GFP_NOMEMALLOC), pid 3634, ts 119694798460, free_ts 119693556950 prep_new_page mm/page_alloc.c:2434 [inline] get_page_from_freelist+0xa72/0x2f50 mm/page_alloc.c:4165 __alloc_pages+0x1b2/0x500 mm/page_alloc.c:5389 alloc_pages+0x1aa/0x310 mm/mempolicy.c:2271 alloc_slab_page mm/slub.c:1799 [inline] allocate_slab mm/slub.c:1944 [inline] new_slab+0x28a/0x3b0 mm/slub.c:2004 ___slab_alloc+0x87c/0xe90 mm/slub.c:3018 __slab_alloc.constprop.0+0x4d/0xa0 mm/slub.c:3105 slab_alloc_node mm/slub.c:3196 [inline] slab_alloc mm/slub.c:3238 [inline] kmem_cache_alloc+0x35c/0x3a0 mm/slub.c:3243 kmem_cache_zalloc include/linux/slab.h:705 [inline] net_alloc net/core/net_namespace.c:407 [inline] copy_net_ns+0x125/0x760 net/core/net_namespace.c:462 create_new_namespaces+0x3f6/0xb20 kernel/nsproxy.c:110 unshare_nsproxy_namespaces+0xc1/0x1f0 kernel/nsproxy.c:226 ksys_unshare+0x445/0x920 kernel/fork.c:3048 __do_sys_unshare kernel/fork.c:3119 [inline] __se_sys_unshare kernel/fork.c:3117 [inline] __x64_sys_unshare+0x2d/0x40 kernel/fork.c:3117 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae page last free stack trace: reset_page_owner include/linux/page_owner.h:24 [inline] free_pages_prepare mm/page_alloc.c:1352 [inline] free_pcp_prepare+0x374/0x870 mm/page_alloc.c:1404 free_unref_page_prepare mm/page_alloc.c:3325 [inline] free_unref_page+0x19/0x690 mm/page_alloc.c:3404 skb_free_head net/core/skbuff.c:655 [inline] skb_release_data+0x65d/0x790 net/core/skbuff.c:677 skb_release_all net/core/skbuff.c:742 [inline] __kfree_skb net/core/skbuff.c:756 [inline] consume_skb net/core/skbuff.c:914 [inline] consume_skb+0xc2/0x160 net/core/skbuff.c:908 skb_free_datagram+0x1b/0x1f0 net/core/datagram.c:325 netlink_recvmsg+0x636/0xea0 net/netlink/af_netlink.c:1998 sock_recvmsg_nosec net/socket.c:948 [inline] sock_recvmsg net/socket.c:966 [inline] sock_recvmsg net/socket.c:962 [inline] ____sys_recvmsg+0x2c4/0x600 net/socket.c:2632 ___sys_recvmsg+0x127/0x200 net/socket.c:2674 __sys_recvmsg+0xe2/0x1a0 net/socket.c:2704 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae Memory state around the buggy address: ffff88807d5f9e00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88807d5f9e80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb >ffff88807d5f9f00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff88807d5f9f80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88807d5fa000: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb Fixes: 0dad4087a86a ("tcp/dccp: get rid of inet_twsk_purge()") Signed-off-by: Eric Dumazet Reported-by: syzbot Reported-by: Paolo Abeni Tested-by: Paolo Abeni Link: https://lore.kernel.org/r/20220126180714.845362-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/net/netns/ipv4.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 22b4c6df1d2b..94568e022001 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -31,18 +31,16 @@ struct ping_group_range { struct inet_hashinfo; struct inet_timewait_death_row { - atomic_t tw_count; - char tw_pad[L1_CACHE_BYTES - sizeof(atomic_t)]; + refcount_t tw_refcount; - struct inet_hashinfo *hashinfo; + struct inet_hashinfo *hashinfo ____cacheline_aligned_in_smp; int sysctl_max_tw_buckets; }; struct tcp_fastopen_context; struct netns_ipv4 { - /* Please keep tcp_death_row at first field in netns_ipv4 */ - struct inet_timewait_death_row tcp_death_row ____cacheline_aligned_in_smp; + struct inet_timewait_death_row *tcp_death_row; #ifdef CONFIG_SYSCTL struct ctl_table_header *forw_hdr; -- cgit v1.2.3 From 8033c6c2fed235b3d571b5a5ede302b752bc5c7d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 26 Jan 2022 10:54:12 -0800 Subject: bpf: remove unused static inlines Remove two dead stubs, sk_msg_clear_meta() was never used, use of xskq_cons_is_full() got replaced by xsk_tx_writeable() in v5.10. Signed-off-by: Jakub Kicinski Link: https://lore.kernel.org/r/20220126185412.2776254-1-kuba@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 10 ---------- include/linux/skmsg.h | 5 ----- 2 files changed, 15 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 394305a5e02f..2344f793c4dc 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1875,11 +1875,6 @@ static inline int bpf_obj_get_user(const char __user *pathname, int flags) return -EOPNOTSUPP; } -static inline bool dev_map_can_have_prog(struct bpf_map *map) -{ - return false; -} - static inline void __dev_flush(void) { } @@ -1943,11 +1938,6 @@ static inline int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu, return -EOPNOTSUPP; } -static inline bool cpu_map_prog_allowed(struct bpf_map *map) -{ - return false; -} - static inline struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type) { diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 18a717fe62eb..ddde5f620901 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -171,11 +171,6 @@ static inline u32 sk_msg_iter_dist(u32 start, u32 end) #define sk_msg_iter_next(msg, which) \ sk_msg_iter_var_next(msg->sg.which) -static inline void sk_msg_clear_meta(struct sk_msg *msg) -{ - memset(&msg->sg, 0, offsetofend(struct sk_msg_sg, copy)); -} - static inline void sk_msg_init(struct sk_msg *msg) { BUILD_BUG_ON(ARRAY_SIZE(msg->sg.data) - 1 != NR_MSG_FRAG_IDS); -- cgit v1.2.3 From 2e9589ff809e9232f689acd51da73390e135146a Mon Sep 17 00:00:00 2001 From: xu xin Date: Wed, 26 Jan 2022 07:10:58 +0000 Subject: ipv4: Namespaceify min_adv_mss sysctl knob Different netns has different requirement on the setting of min_adv_mss sysctl which the advertised MSS will be never lower than. Enable min_adv_mss to be configured per network namespace. Signed-off-by: xu xin Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 94568e022001..f0687867b5cd 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -83,6 +83,7 @@ struct netns_ipv4 { u32 ip_rt_min_pmtu; int ip_rt_mtu_expires; + int ip_rt_min_advmss; struct local_ports ip_local_ports; -- cgit v1.2.3 From bd5daba2d02479729526d35de85807aadf6fba20 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 26 Jan 2022 11:10:55 -0800 Subject: mii: remove mii_lpa_to_linkmode_lpa_sgmii() The only caller of mii_lpa_to_linkmode_lpa_sgmii() disappeared in v5.10. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/mii.h | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'include') diff --git a/include/linux/mii.h b/include/linux/mii.h index 12ea29e04293..b8a1a17a87dd 100644 --- a/include/linux/mii.h +++ b/include/linux/mii.h @@ -387,23 +387,6 @@ mii_lpa_mod_linkmode_lpa_sgmii(unsigned long *lp_advertising, u32 lpa) speed_duplex == LPA_SGMII_10FULL); } -/** - * mii_lpa_to_linkmode_adv_sgmii - * @advertising: pointer to destination link mode. - * @lpa: value of the MII_LPA register - * - * A small helper function that translates MII_ADVERTISE bits - * to linkmode advertisement settings when in SGMII mode. - * Clears the old value of advertising. - */ -static inline void mii_lpa_to_linkmode_lpa_sgmii(unsigned long *lp_advertising, - u32 lpa) -{ - linkmode_zero(lp_advertising); - - mii_lpa_mod_linkmode_lpa_sgmii(lp_advertising, lpa); -} - /** * mii_adv_mod_linkmode_adv_t * @advertising:pointer to destination link mode. -- cgit v1.2.3 From b1755400b4be33dbd286272b153579631be2e2ca Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 26 Jan 2022 11:10:57 -0800 Subject: net: remove net_invalid_timestamp() No callers since v3.15. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/skbuff.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 8131d0de7559..60bd2347708c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3899,11 +3899,6 @@ static inline ktime_t net_timedelta(ktime_t t) return ktime_sub(ktime_get_real(), t); } -static inline ktime_t net_invalid_timestamp(void) -{ - return 0; -} - static inline u8 skb_metadata_len(const struct sk_buff *skb) { return skb_shinfo(skb)->meta_len; -- cgit v1.2.3 From 08dfa5a19e1f4344ce5d3a5eed4c5529adafe0dc Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 26 Jan 2022 11:10:58 -0800 Subject: net: remove linkmode_change_bit() No callers since v5.7, the initial use case seems pretty esoteric so removing this should not harm the completeness of the API. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/linkmode.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/linkmode.h b/include/linux/linkmode.h index f8397f300fcd..15e0e0209da4 100644 --- a/include/linux/linkmode.h +++ b/include/linux/linkmode.h @@ -66,11 +66,6 @@ static inline void linkmode_mod_bit(int nr, volatile unsigned long *addr, linkmode_clear_bit(nr, addr); } -static inline void linkmode_change_bit(int nr, volatile unsigned long *addr) -{ - __change_bit(nr, addr); -} - static inline int linkmode_test_bit(int nr, const volatile unsigned long *addr) { return test_bit(nr, addr); -- cgit v1.2.3 From 8b0fdcdc3a7d44aff907f0103f5ffb86b12bfe71 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 26 Jan 2022 11:10:59 -0800 Subject: net: remove bond_slave_has_mac_rcu() No caller since v3.16. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/bonding.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'include') diff --git a/include/net/bonding.h b/include/net/bonding.h index 83cfd2d70247..7dead855a72d 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -698,20 +698,6 @@ static inline struct slave *bond_slave_has_mac(struct bonding *bond, return NULL; } -/* Caller must hold rcu_read_lock() for read */ -static inline struct slave *bond_slave_has_mac_rcu(struct bonding *bond, - const u8 *mac) -{ - struct list_head *iter; - struct slave *tmp; - - bond_for_each_slave_rcu(bond, tmp, iter) - if (ether_addr_equal_64bits(mac, tmp->dev->dev_addr)) - return tmp; - - return NULL; -} - /* Caller must hold rcu_read_lock() for read */ static inline bool bond_slave_has_mac_rx(struct bonding *bond, const u8 *mac) { -- cgit v1.2.3 From 560e08eda7969c3ef0639ab05f718be03a49d387 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 26 Jan 2022 11:11:00 -0800 Subject: net: ax25: remove route refcount Nothing takes the refcount since v4.9. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/ax25.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include') diff --git a/include/net/ax25.h b/include/net/ax25.h index 526e49589197..cb628c5d7c5b 100644 --- a/include/net/ax25.h +++ b/include/net/ax25.h @@ -187,18 +187,12 @@ typedef struct { typedef struct ax25_route { struct ax25_route *next; - refcount_t refcount; ax25_address callsign; struct net_device *dev; ax25_digi *digipeat; char ip_mode; } ax25_route; -static inline void ax25_hold_route(ax25_route *ax25_rt) -{ - refcount_inc(&ax25_rt->refcount); -} - void __ax25_put_route(ax25_route *ax25_rt); extern rwlock_t ax25_route_lock; @@ -213,12 +207,6 @@ static inline void ax25_route_lock_unuse(void) read_unlock(&ax25_route_lock); } -static inline void ax25_put_route(ax25_route *ax25_rt) -{ - if (refcount_dec_and_test(&ax25_rt->refcount)) - __ax25_put_route(ax25_rt); -} - typedef struct { char slave; /* slave_mode? */ struct timer_list slave_timer; /* timeout timer */ -- cgit v1.2.3 From 8b2d546e23bb3588f897089368b5f09e49f7762d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 26 Jan 2022 11:11:02 -0800 Subject: ipv6: remove inet6_rsk() and tcp_twsk_ipv6only() The stubs under !CONFIG_IPV6 were missed when real functions got deleted ca. v3.13. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/ipv6.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index a59d25f19385..1e0f8a31f3de 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -371,19 +371,12 @@ static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk) return NULL; } -static inline struct inet6_request_sock * - inet6_rsk(const struct request_sock *rsk) -{ - return NULL; -} - static inline struct raw6_sock *raw6_sk(const struct sock *sk) { return NULL; } #define inet6_rcv_saddr(__sk) NULL -#define tcp_twsk_ipv6only(__sk) 0 #define inet_v6_ipv6only(__sk) 0 #endif /* IS_ENABLED(CONFIG_IPV6) */ #endif /* _IPV6_H */ -- cgit v1.2.3 From cc81df835c25b108248bad24021a21e77cbb84ac Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 26 Jan 2022 11:11:04 -0800 Subject: udp: remove inner_udp_hdr() Not used since added in v3.8. Signed-off-by: Jakub Kicinski Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/udp.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/udp.h b/include/linux/udp.h index ae66dadd8543..254a2654400f 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -23,11 +23,6 @@ static inline struct udphdr *udp_hdr(const struct sk_buff *skb) return (struct udphdr *)skb_transport_header(skb); } -static inline struct udphdr *inner_udp_hdr(const struct sk_buff *skb) -{ - return (struct udphdr *)skb_inner_transport_header(skb); -} - #define UDP_HTABLE_SIZE_MIN (CONFIG_BASE_SMALL ? 128 : 256) static inline u32 udp_hashfn(const struct net *net, u32 num, u32 mask) -- cgit v1.2.3 From 937fca918aacf54f1c9cb00d16d4e999a0569be0 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 26 Jan 2022 11:11:05 -0800 Subject: udplite: remove udplite_csum_outgoing() Not used since v4.0. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/udplite.h | 43 ------------------------------------------- 1 file changed, 43 deletions(-) (limited to 'include') diff --git a/include/net/udplite.h b/include/net/udplite.h index 9185e45b997f..a3c53110d30b 100644 --- a/include/net/udplite.h +++ b/include/net/udplite.h @@ -70,49 +70,6 @@ static inline int udplite_checksum_init(struct sk_buff *skb, struct udphdr *uh) return 0; } -/* Slow-path computation of checksum. Socket is locked. */ -static inline __wsum udplite_csum_outgoing(struct sock *sk, struct sk_buff *skb) -{ - const struct udp_sock *up = udp_sk(skb->sk); - int cscov = up->len; - __wsum csum = 0; - - if (up->pcflag & UDPLITE_SEND_CC) { - /* - * Sender has set `partial coverage' option on UDP-Lite socket. - * The special case "up->pcslen == 0" signifies full coverage. - */ - if (up->pcslen < up->len) { - if (0 < up->pcslen) - cscov = up->pcslen; - udp_hdr(skb)->len = htons(up->pcslen); - } - /* - * NOTE: Causes for the error case `up->pcslen > up->len': - * (i) Application error (will not be penalized). - * (ii) Payload too big for send buffer: data is split - * into several packets, each with its own header. - * In this case (e.g. last segment), coverage may - * exceed packet length. - * Since packets with coverage length > packet length are - * illegal, we fall back to the defaults here. - */ - } - - skb->ip_summed = CHECKSUM_NONE; /* no HW support for checksumming */ - - skb_queue_walk(&sk->sk_write_queue, skb) { - const int off = skb_transport_offset(skb); - const int len = skb->len - off; - - csum = skb_checksum(skb, off, (cscov > len)? len : cscov, csum); - - if ((cscov -= len) <= 0) - break; - } - return csum; -} - /* Fast-path computation of checksum. Socket may not be locked. */ static inline __wsum udplite_csum(struct sk_buff *skb) { -- cgit v1.2.3 From d59a67f2f3f39012271ed3d11c338706a011c5c2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 26 Jan 2022 11:11:06 -0800 Subject: netlink: remove nl_set_extack_cookie_u32() Not used since v5.10. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/netlink.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 1ec631838af9..bda1c385cffb 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -135,15 +135,6 @@ static inline void nl_set_extack_cookie_u64(struct netlink_ext_ack *extack, extack->cookie_len = sizeof(cookie); } -static inline void nl_set_extack_cookie_u32(struct netlink_ext_ack *extack, - u32 cookie) -{ - if (!extack) - return; - memcpy(extack->cookie, &cookie, sizeof(cookie)); - extack->cookie_len = sizeof(cookie); -} - void netlink_kernel_release(struct sock *sk); int __netlink_change_ngroups(struct sock *sk, unsigned int groups); int netlink_change_ngroups(struct sock *sk, unsigned int groups); -- cgit v1.2.3 From 98b608629746946ecc6cda70bf0fd047785a1197 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 26 Jan 2022 11:11:07 -0800 Subject: net: sched: remove psched_tdiff_bounded() Not used since v3.9. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/pkt_sched.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 9e7b21c0b3a6..44a35531952e 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -63,12 +63,6 @@ static inline psched_time_t psched_get_time(void) return PSCHED_NS2TICKS(ktime_get_ns()); } -static inline psched_tdiff_t -psched_tdiff_bounded(psched_time_t tv1, psched_time_t tv2, psched_time_t bound) -{ - return min(tv1 - tv2, bound); -} - struct qdisc_watchdog { u64 last_expires; struct hrtimer timer; -- cgit v1.2.3 From a459bc9a3a68f2975ee6661fb9c86126a5636b25 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 26 Jan 2022 11:11:08 -0800 Subject: net: sched: remove qdisc_qlen_cpu() Never used since it was added in v5.2. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/sch_generic.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 472843eedbae..9bab396c1f3b 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -518,11 +518,6 @@ static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz) BUILD_BUG_ON(sizeof(qcb->data) < sz); } -static inline int qdisc_qlen_cpu(const struct Qdisc *q) -{ - return this_cpu_ptr(q->cpu_qstats)->qlen; -} - static inline int qdisc_qlen(const struct Qdisc *q) { return q->q.qlen; -- cgit v1.2.3 From d1bc532e99becf104635ed4da6fefa306f452321 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Tue, 25 Jan 2022 17:04:43 +0100 Subject: i40e: xsk: Move tmp desc array from driver to pool Move desc_array from the driver to the pool. The reason behind this is that we can then reuse this array as a temporary storage for descriptors in all zero-copy drivers that use the batched interface. This will make it easier to add batching to more drivers. i40e is the only driver that has a batched Tx zero-copy implementation, so no need to touch any other driver. Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Reviewed-by: Alexander Lobakin Link: https://lore.kernel.org/bpf/20220125160446.78976-6-maciej.fijalkowski@intel.com --- include/net/xdp_sock_drv.h | 5 ++--- include/net/xsk_buff_pool.h | 1 + 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 443d45951564..4aa031849668 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -13,7 +13,7 @@ void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries); bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc); -u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *desc, u32 max); +u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max); void xsk_tx_release(struct xsk_buff_pool *pool); struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev, u16 queue_id); @@ -142,8 +142,7 @@ static inline bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, return false; } -static inline u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *desc, - u32 max) +static inline u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max) { return 0; } diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index ddeefc4a1040..5554ee75e7da 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -60,6 +60,7 @@ struct xsk_buff_pool { */ dma_addr_t *dma_pages; struct xdp_buff_xsk *heads; + struct xdp_desc *tx_descs; u64 chunk_mask; u64 addrs_cnt; u32 free_list_cnt; -- cgit v1.2.3 From 46531a30364bd483bfa1b041c15d42a196e77e93 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 27 Jan 2022 14:09:13 +0000 Subject: cgroup/bpf: fast path skb BPF filtering Even though there is a static key protecting from overhead from cgroup-bpf skb filtering when there is nothing attached, in many cases it's not enough as registering a filter for one type will ruin the fast path for all others. It's observed in production servers I've looked at but also in laptops, where registration is done during init by systemd or something else. Add a per-socket fast path check guarding from such overhead. This affects both receive and transmit paths of TCP, UDP and other protocols. It showed ~1% tx/s improvement in small payload UDP send benchmarks using a real NIC and in a server environment and the number jumps to 2-3% for preemtible kernels. Reviewed-by: Stanislav Fomichev Signed-off-by: Pavel Begunkov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/r/d8c58857113185a764927a46f4b5a058d36d3ec3.1643292455.git.asml.silence@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf-cgroup.h | 24 ++++++++++++++++++++---- include/linux/bpf.h | 13 +++++++++++++ 2 files changed, 33 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index b525d8cdc25b..88a51b242adc 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -8,6 +8,7 @@ #include #include #include +#include #include struct sock; @@ -165,11 +166,23 @@ int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, void *value, u64 flags); +/* Opportunistic check to see whether we have any BPF program attached*/ +static inline bool cgroup_bpf_sock_enabled(struct sock *sk, + enum cgroup_bpf_attach_type type) +{ + struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_prog_array *array; + + array = rcu_access_pointer(cgrp->bpf.effective[type]); + return array != &bpf_empty_prog_array.hdr; +} + /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */ #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(CGROUP_INET_INGRESS)) \ + if (cgroup_bpf_enabled(CGROUP_INET_INGRESS) && \ + cgroup_bpf_sock_enabled(sk, CGROUP_INET_INGRESS)) \ __ret = __cgroup_bpf_run_filter_skb(sk, skb, \ CGROUP_INET_INGRESS); \ \ @@ -181,7 +194,8 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, int __ret = 0; \ if (cgroup_bpf_enabled(CGROUP_INET_EGRESS) && sk && sk == skb->sk) { \ typeof(sk) __sk = sk_to_full_sk(sk); \ - if (sk_fullsock(__sk)) \ + if (sk_fullsock(__sk) && \ + cgroup_bpf_sock_enabled(__sk, CGROUP_INET_EGRESS)) \ __ret = __cgroup_bpf_run_filter_skb(__sk, skb, \ CGROUP_INET_EGRESS); \ } \ @@ -347,7 +361,8 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, kernel_optval) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled(CGROUP_SETSOCKOPT)) \ + if (cgroup_bpf_enabled(CGROUP_SETSOCKOPT) && \ + cgroup_bpf_sock_enabled(sock, CGROUP_SETSOCKOPT)) \ __ret = __cgroup_bpf_run_filter_setsockopt(sock, level, \ optname, optval, \ optlen, \ @@ -367,7 +382,8 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, max_optlen, retval) \ ({ \ int __ret = retval; \ - if (cgroup_bpf_enabled(CGROUP_GETSOCKOPT)) \ + if (cgroup_bpf_enabled(CGROUP_GETSOCKOPT) && \ + cgroup_bpf_sock_enabled(sock, CGROUP_GETSOCKOPT)) \ if (!(sock)->sk_prot->bpf_bypass_getsockopt || \ !INDIRECT_CALL_INET_1((sock)->sk_prot->bpf_bypass_getsockopt, \ tcp_bpf_bypass_getsockopt, \ diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2344f793c4dc..e3b82ce51445 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1233,6 +1233,19 @@ struct bpf_prog_array { struct bpf_prog_array_item items[]; }; +struct bpf_empty_prog_array { + struct bpf_prog_array hdr; + struct bpf_prog *null_prog; +}; + +/* to avoid allocating empty bpf_prog_array for cgroups that + * don't have bpf program attached use one global 'bpf_empty_prog_array' + * It will not be modified the caller of bpf_prog_array_alloc() + * (since caller requested prog_cnt == 0) + * that pointer should be 'freed' by bpf_prog_array_free() + */ +extern struct bpf_empty_prog_array bpf_empty_prog_array; + struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags); void bpf_prog_array_free(struct bpf_prog_array *progs); int bpf_prog_array_length(struct bpf_prog_array *progs); -- cgit v1.2.3 From 7472d5a642c94a0ee1882ff3038de72ffe803a01 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 27 Jan 2022 07:46:00 -0800 Subject: compiler_types: define __user as __attribute__((btf_type_tag("user"))) The __user attribute is currently mainly used by sparse for type checking. The attribute indicates whether a memory access is in user memory address space or not. Such information is important during tracing kernel internal functions or data structures as accessing user memory often has different mechanisms compared to accessing kernel memory. For example, the perf-probe needs explicit command line specification to indicate a particular argument or string in user-space memory ([1], [2], [3]). Currently, vmlinux BTF is available in kernel with many distributions. If __user attribute information is available in vmlinux BTF, the explicit user memory access information from users will not be necessary as the kernel can figure it out by itself with vmlinux BTF. Besides the above possible use for perf/probe, another use case is for bpf verifier. Currently, for bpf BPF_PROG_TYPE_TRACING type of bpf programs, users can write direct code like p->m1->m2 and "p" could be a function parameter. Without __user information in BTF, the verifier will assume p->m1 accessing kernel memory and will generate normal loads. Let us say "p" actually tagged with __user in the source code. In such cases, p->m1 is actually accessing user memory and direct load is not right and may produce incorrect result. For such cases, bpf_probe_read_user() will be the correct way to read p->m1. To support encoding __user information in BTF, a new attribute __attribute__((btf_type_tag(""))) is implemented in clang ([4]). For example, if we have #define __user __attribute__((btf_type_tag("user"))) during kernel compilation, the attribute "user" information will be preserved in dwarf. After pahole converting dwarf to BTF, __user information will be available in vmlinux BTF. The following is an example with latest upstream clang (clang14) and pahole 1.23: [$ ~] cat test.c #define __user __attribute__((btf_type_tag("user"))) int foo(int __user *arg) { return *arg; } [$ ~] clang -O2 -g -c test.c [$ ~] pahole -JV test.o ... [1] INT int size=4 nr_bits=32 encoding=SIGNED [2] TYPE_TAG user type_id=1 [3] PTR (anon) type_id=2 [4] FUNC_PROTO (anon) return=1 args=(3 arg) [5] FUNC foo type_id=4 [$ ~] You can see for the function argument "int __user *arg", its type is described as PTR -> TYPE_TAG(user) -> INT The kernel can use this information for bpf verification or other use cases. Current btf_type_tag is only supported in clang (>= clang14) and pahole (>= 1.23). gcc support is also proposed and under development ([5]). [1] http://lkml.kernel.org/r/155789874562.26965.10836126971405890891.stgit@devnote2 [2] http://lkml.kernel.org/r/155789872187.26965.4468456816590888687.stgit@devnote2 [3] http://lkml.kernel.org/r/155789871009.26965.14167558859557329331.stgit@devnote2 [4] https://reviews.llvm.org/D111199 [5] https://lore.kernel.org/bpf/0cbeb2fb-1a18-f690-e360-24b1c90c2a91@fb.com/ Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20220127154600.652613-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/compiler_types.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 3c1795fdb568..3f31ff400432 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -31,6 +31,9 @@ static inline void __chk_io_ptr(const volatile void __iomem *ptr) { } # define __kernel # ifdef STRUCTLEAK_PLUGIN # define __user __attribute__((user)) +# elif defined(CONFIG_DEBUG_INFO_BTF) && defined(CONFIG_PAHOLE_HAS_BTF_TAG) && \ + __has_attribute(btf_type_tag) +# define __user __attribute__((btf_type_tag("user"))) # else # define __user # endif -- cgit v1.2.3 From c6f1bfe89ac95dc829dcb4ed54780da134ac5fce Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 27 Jan 2022 07:46:06 -0800 Subject: bpf: reject program if a __user tagged memory accessed in kernel way BPF verifier supports direct memory access for BPF_PROG_TYPE_TRACING type of bpf programs, e.g., a->b. If "a" is a pointer pointing to kernel memory, bpf verifier will allow user to write code in C like a->b and the verifier will translate it to a kernel load properly. If "a" is a pointer to user memory, it is expected that bpf developer should be bpf_probe_read_user() helper to get the value a->b. Without utilizing BTF __user tagging information, current verifier will assume that a->b is a kernel memory access and this may generate incorrect result. Now BTF contains __user information, it can check whether the pointer points to a user memory or not. If it is, the verifier can reject the program and force users to use bpf_probe_read_user() helper explicitly. In the future, we can easily extend btf_add_space for other address space tagging, for example, rcu/percpu etc. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20220127154606.654961-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 9 ++++++--- include/linux/btf.h | 5 +++++ 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e3b82ce51445..6eb0b180d33b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -332,7 +332,10 @@ enum bpf_type_flag { */ MEM_ALLOC = BIT(2 + BPF_BASE_TYPE_BITS), - __BPF_TYPE_LAST_FLAG = MEM_ALLOC, + /* MEM is in user address space. */ + MEM_USER = BIT(3 + BPF_BASE_TYPE_BITS), + + __BPF_TYPE_LAST_FLAG = MEM_USER, }; /* Max number of base types. */ @@ -588,7 +591,7 @@ struct bpf_verifier_ops { const struct btf *btf, const struct btf_type *t, int off, int size, enum bpf_access_type atype, - u32 *next_btf_id); + u32 *next_btf_id, enum bpf_type_flag *flag); }; struct bpf_prog_offload_ops { @@ -1780,7 +1783,7 @@ static inline bool bpf_tracing_btf_ctx_access(int off, int size, int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, int off, int size, enum bpf_access_type atype, - u32 *next_btf_id); + u32 *next_btf_id, enum bpf_type_flag *flag); bool btf_struct_ids_match(struct bpf_verifier_log *log, const struct btf *btf, u32 id, int off, const struct btf *need_btf, u32 need_type_id); diff --git a/include/linux/btf.h b/include/linux/btf.h index b12cfe3b12bb..f6c43dd513fa 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -238,6 +238,11 @@ static inline bool btf_type_is_var(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_VAR; } +static inline bool btf_type_is_type_tag(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_TYPE_TAG; +} + /* union is only a special case of struct: * all its offsetof(member) == 0 */ -- cgit v1.2.3 From 9059b04b4108be71397941f4665d5aa79783125a Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Sun, 9 Jan 2022 21:46:34 +0200 Subject: net/mlx5: Remove unused TIR modify bitmask enums struct mlx5_ifc_modify_tir_bitmask_bits is used for the bitmask of MODIFY_TIR operations. Remove the unused bitmask enums. Signed-off-by: Tariq Toukan Reviewed-by: Gal Pressman Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 598ac3bcc901..27145c4d6820 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -63,13 +63,6 @@ enum { MLX5_EVENT_TYPE_CODING_FPGA_QP_ERROR = 0x21 }; -enum { - MLX5_MODIFY_TIR_BITMASK_LRO = 0x0, - MLX5_MODIFY_TIR_BITMASK_INDIRECT_TABLE = 0x1, - MLX5_MODIFY_TIR_BITMASK_HASH = 0x2, - MLX5_MODIFY_TIR_BITMASK_TUNNELED_OFFLOAD_EN = 0x3 -}; - enum { MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE = 0x0, MLX5_SET_HCA_CAP_OP_MOD_ODP = 0x2, -- cgit v1.2.3 From f37a4cc6bb0ba08c2d9fd7d18a1da87161cbb7f9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 27 Jan 2022 00:36:28 +0000 Subject: udp6: pass flow in ip6_make_skb together with cork Another preparation patch. inet_cork_full already contains a field for iflow, so we can avoid passing a separate struct iflow6 into __ip6_append_data() and ip6_make_skb(), and use the flow stored in inet_cork_full. Make sure callers set cork->fl, i.e. we init it in ip6_append_data() and before calling ip6_make_skb(). Signed-off-by: Pavel Begunkov Reviewed-by: Willem de Bruijn Signed-off-by: Jakub Kicinski --- include/net/ipv6.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 3afcb128e064..5e0b56d66724 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1020,7 +1020,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, - struct ipcm6_cookie *ipc6, struct flowi6 *fl6, + struct ipcm6_cookie *ipc6, struct rt6_info *rt, unsigned int flags, struct inet_cork_full *cork); -- cgit v1.2.3 From 31ed2261e88fbd1eb62fc870ef2c6b2cf2951336 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 27 Jan 2022 00:36:31 +0000 Subject: ipv6: partially inline ipv6_fixup_options Inline a part of ipv6_fixup_options() to avoid extra overhead on function call if opt is NULL. Signed-off-by: Pavel Begunkov Reviewed-by: Willem de Bruijn Signed-off-by: Jakub Kicinski --- include/net/ipv6.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 5e0b56d66724..082f30256f59 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -437,8 +437,16 @@ struct ipv6_txoptions *ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt, int newtype, struct ipv6_opt_hdr *newopt); -struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space, - struct ipv6_txoptions *opt); +struct ipv6_txoptions *__ipv6_fixup_options(struct ipv6_txoptions *opt_space, + struct ipv6_txoptions *opt); + +static inline struct ipv6_txoptions * +ipv6_fixup_options(struct ipv6_txoptions *opt_space, struct ipv6_txoptions *opt) +{ + if (!opt) + return NULL; + return __ipv6_fixup_options(opt_space, opt); +} bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb, const struct inet6_skb_parm *opt); -- cgit v1.2.3 From c265a3a6690b093e542aceb4a7bf8bf6c577e42b Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Tue, 25 Jan 2022 13:25:40 +0100 Subject: net: mac802154: Explain the use of ieee802154_wake/stop_queue() It is not straightforward to the newcomer that a single skb can currently be sent at a time and that the internal process is to stop the queue when processing a frame before re-enabling it. Make this clear by documenting the ieee802154_wake/stop_queue() helpers. Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20220125122540.855604-4-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- include/net/mac802154.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/net/mac802154.h b/include/net/mac802154.h index d524ffb9eb25..2c3bbc6645ba 100644 --- a/include/net/mac802154.h +++ b/include/net/mac802154.h @@ -464,6 +464,12 @@ void ieee802154_rx_irqsafe(struct ieee802154_hw *hw, struct sk_buff *skb, * ieee802154_wake_queue - wake ieee802154 queue * @hw: pointer as obtained from ieee802154_alloc_hw(). * + * Tranceivers usually have either one transmit framebuffer or one framebuffer + * for both transmitting and receiving. Hence, the core currently only handles + * one frame at a time for each phy, which means we had to stop the queue to + * avoid new skb to come during the transmission. The queue then needs to be + * woken up after the operation. + * * Drivers should use this function instead of netif_wake_queue. */ void ieee802154_wake_queue(struct ieee802154_hw *hw); @@ -472,6 +478,12 @@ void ieee802154_wake_queue(struct ieee802154_hw *hw); * ieee802154_stop_queue - stop ieee802154 queue * @hw: pointer as obtained from ieee802154_alloc_hw(). * + * Tranceivers usually have either one transmit framebuffer or one framebuffer + * for both transmitting and receiving. Hence, the core currently only handles + * one frame at a time for each phy, which means we need to tell upper layers to + * stop giving us new skbs while we are busy with the transmitted one. The queue + * must then be stopped before transmitting. + * * Drivers should use this function instead of netif_stop_queue. */ void ieee802154_stop_queue(struct ieee802154_hw *hw); -- cgit v1.2.3 From b5b3d10ef638fcadc56609961875bf157a92aaa4 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 27 Jan 2022 08:33:49 -0800 Subject: net: mii: remove mii_lpa_mod_linkmode_lpa_sgmii() Vladimir points out that since we removed mii_lpa_to_linkmode_lpa_sgmii(), mii_lpa_mod_linkmode_lpa_sgmii() is also no longer called. Suggested-by: Vladimir Oltean Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/mii.h | 33 --------------------------------- 1 file changed, 33 deletions(-) (limited to 'include') diff --git a/include/linux/mii.h b/include/linux/mii.h index b8a1a17a87dd..5ee13083cec7 100644 --- a/include/linux/mii.h +++ b/include/linux/mii.h @@ -354,39 +354,6 @@ static inline u32 mii_adv_to_ethtool_adv_x(u32 adv) return result; } -/** - * mii_lpa_mod_linkmode_adv_sgmii - * @lp_advertising: pointer to destination link mode. - * @lpa: value of the MII_LPA register - * - * A small helper function that translates MII_LPA bits to - * linkmode advertisement settings for SGMII. - * Leaves other bits unchanged. - */ -static inline void -mii_lpa_mod_linkmode_lpa_sgmii(unsigned long *lp_advertising, u32 lpa) -{ - u32 speed_duplex = lpa & LPA_SGMII_DPX_SPD_MASK; - - linkmode_mod_bit(ETHTOOL_LINK_MODE_1000baseT_Half_BIT, lp_advertising, - speed_duplex == LPA_SGMII_1000HALF); - - linkmode_mod_bit(ETHTOOL_LINK_MODE_1000baseT_Full_BIT, lp_advertising, - speed_duplex == LPA_SGMII_1000FULL); - - linkmode_mod_bit(ETHTOOL_LINK_MODE_100baseT_Half_BIT, lp_advertising, - speed_duplex == LPA_SGMII_100HALF); - - linkmode_mod_bit(ETHTOOL_LINK_MODE_100baseT_Full_BIT, lp_advertising, - speed_duplex == LPA_SGMII_100FULL); - - linkmode_mod_bit(ETHTOOL_LINK_MODE_10baseT_Half_BIT, lp_advertising, - speed_duplex == LPA_SGMII_10HALF); - - linkmode_mod_bit(ETHTOOL_LINK_MODE_10baseT_Full_BIT, lp_advertising, - speed_duplex == LPA_SGMII_10FULL); -} - /** * mii_adv_mod_linkmode_adv_t * @advertising:pointer to destination link mode. -- cgit v1.2.3 From 9690ae60429020f38e4aa2540c306f27eb021bc0 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 27 Jan 2022 10:42:59 -0800 Subject: ethtool: add header/data split indication For applications running on a mix of platforms it's useful to have a clear indication whether host's NIC supports the geometry requirements of TCP zero-copy. TCP zero-copy Rx requires data to be neatly placed into memory pages. Most NICs can't do that. This patch is adding GET support only, since the NICs I work with either always have the feature enabled or enable it whenever MTU is set to jumbo. In other words I don't need SET. But adding set should be trivial. (The only note on SET is that we will likely want the setting to be "sticky" and use 0 / `unknown` to reset it back to driver default.) Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/ethtool.h | 2 ++ include/uapi/linux/ethtool_netlink.h | 7 +++++++ 2 files changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 11efc45de66a..e0853f48b75e 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -70,9 +70,11 @@ enum { /** * struct kernel_ethtool_ringparam - RX/TX ring configuration * @rx_buf_len: Current length of buffers on the rx ring. + * @tcp_data_split: Scatter packet headers and data to separate buffers */ struct kernel_ethtool_ringparam { u32 rx_buf_len; + u8 tcp_data_split; }; /** diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index cca6e474a085..417d4280d7b5 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -318,6 +318,12 @@ enum { /* RINGS */ +enum { + ETHTOOL_TCP_DATA_SPLIT_UNKNOWN = 0, + ETHTOOL_TCP_DATA_SPLIT_DISABLED, + ETHTOOL_TCP_DATA_SPLIT_ENABLED, +}; + enum { ETHTOOL_A_RINGS_UNSPEC, ETHTOOL_A_RINGS_HEADER, /* nest - _A_HEADER_* */ @@ -330,6 +336,7 @@ enum { ETHTOOL_A_RINGS_RX_JUMBO, /* u32 */ ETHTOOL_A_RINGS_TX, /* u32 */ ETHTOOL_A_RINGS_RX_BUF_LEN, /* u32 */ + ETHTOOL_A_RINGS_TCP_DATA_SPLIT, /* u8 */ /* add new constants above here */ __ETHTOOL_A_RINGS_CNT, -- cgit v1.2.3 From 6cdef8a6ee748ec522aabee049633b16b4115f73 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 27 Jan 2022 12:09:35 -0800 Subject: SUNRPC: add netns refcount tracker to struct svc_xprt struct svc_xprt holds a long lived reference to a netns, it is worth tracking it. Signed-off-by: Eric Dumazet Acked-by: Chuck Lever Signed-off-by: David S. Miller --- include/linux/sunrpc/svc_xprt.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 571f605bc91e..382af90320ac 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -88,6 +88,7 @@ struct svc_xprt { struct list_head xpt_users; /* callbacks on free */ struct net *xpt_net; + netns_tracker ns_tracker; const struct cred *xpt_cred; struct rpc_xprt *xpt_bc_xprt; /* NFSv4.1 backchannel */ struct rpc_xprt_switch *xpt_bc_xps; /* NFSv4.1 backchannel */ -- cgit v1.2.3 From b9a0d6d143ec63132beff04802578c499083f87c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 27 Jan 2022 12:09:37 -0800 Subject: SUNRPC: add netns refcount tracker to struct rpc_xprt Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/sunrpc/xprt.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 955ea4d7af0b..3cdc8d878d81 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -284,6 +284,7 @@ struct rpc_xprt { } stat; struct net *xprt_net; + netns_tracker ns_tracker; const char *servername; const char *address_strings[RPC_DISPLAY_MAX]; #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -- cgit v1.2.3 From 4f0e30407ef6f2075b8e86d54be42e787087cd61 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 28 Jan 2022 08:06:54 -0800 Subject: ipv4: drop fragmentation code from ip_options_build() Since v2.5.44 and addition of ip_options_fragment() ip_options_build() does not render headers for fragments directly. @is_frag is always 0. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/ip.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/ip.h b/include/net/ip.h index b51bae43b0dd..fdbab0c00fca 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -712,7 +712,7 @@ int ip_forward(struct sk_buff *skb); */ void ip_options_build(struct sk_buff *skb, struct ip_options *opt, - __be32 daddr, struct rtable *rt, int is_frag); + __be32 daddr, struct rtable *rt); int __ip_options_echo(struct net *net, struct ip_options *dopt, struct sk_buff *skb, const struct ip_options *sopt); -- cgit v1.2.3 From 73c105ad2a3e142d81fc59761ce8353d0b211b8f Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Fri, 28 Jan 2022 21:32:40 +0300 Subject: phy: make phy_set_max_speed() *void* After following the call tree of phy_set_max_speed(), it became clear that this function never returns anything but 0, so we can change its result type to *void* and drop the result checks from the three drivers that actually bothered to do it... Found by Linux Verification Center (linuxtesting.org) with the SVACE static analysis tool. Signed-off-by: Sergey Shtylyov Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/phy.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 6de8d7a90d78..cd08cf1a8b0d 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1661,7 +1661,7 @@ int phy_disable_interrupts(struct phy_device *phydev); void phy_request_interrupt(struct phy_device *phydev); void phy_free_interrupt(struct phy_device *phydev); void phy_print_status(struct phy_device *phydev); -int phy_set_max_speed(struct phy_device *phydev, u32 max_speed); +void phy_set_max_speed(struct phy_device *phydev, u32 max_speed); void phy_remove_link_mode(struct phy_device *phydev, u32 link_mode); void phy_advertise_supported(struct phy_device *phydev); void phy_support_sym_pause(struct phy_device *phydev); -- cgit v1.2.3 From 47ed9442b2ecfcdc72889667236d6c59b6a3337e Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 28 Jan 2022 16:53:47 -0700 Subject: ipv4: Make ip_idents_reserve static ip_idents_reserve is only used in net/ipv4/route.c. Make it static and remove the export. Signed-off-by: David Ahern Cc: Eric Dumazet Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/ip.h b/include/net/ip.h index fdbab0c00fca..3984f2c39c4b 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -517,7 +517,6 @@ void ip_dst_metrics_put(struct dst_entry *dst) kfree(p); } -u32 ip_idents_reserve(u32 hash, int segs); void __ip_select_ident(struct net *net, struct iphdr *iph, int segs); static inline void ip_select_ident_segs(struct net *net, struct sk_buff *skb, -- cgit v1.2.3 From e187013abeb4c2a7ec8a4bb978844c7e92a1a6ec Mon Sep 17 00:00:00 2001 From: Akhmat Karakotov Date: Mon, 31 Jan 2022 16:31:21 +0300 Subject: txhash: Make rethinking txhash behavior configurable via sysctl Add a per ns sysctl that controls the txhash rethink behavior: net.core.txrehash. When enabled, the same behavior is retained, when disabled, rethink is not performed. Sysctl is enabled by default. Signed-off-by: Akhmat Karakotov Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/core.h | 1 + include/net/sock.h | 34 +++++++++++++++++++++------------- include/uapi/linux/socket.h | 3 +++ 3 files changed, 25 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/net/netns/core.h b/include/net/netns/core.h index 552bc25b1933..388244e315e7 100644 --- a/include/net/netns/core.h +++ b/include/net/netns/core.h @@ -10,6 +10,7 @@ struct netns_core { struct ctl_table_header *sysctl_hdr; int sysctl_somaxconn; + u8 sysctl_txrehash; #ifdef CONFIG_PROC_FS struct prot_inuse __percpu *prot_inuse; diff --git a/include/net/sock.h b/include/net/sock.h index ff9b508d9c5f..0540e1b2aeb0 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -587,6 +587,18 @@ static inline bool sk_user_data_is_nocopy(const struct sock *sk) __tmp | SK_USER_DATA_NOCOPY); \ }) +static inline +struct net *sock_net(const struct sock *sk) +{ + return read_pnet(&sk->sk_net); +} + +static inline +void sock_net_set(struct sock *sk, struct net *net) +{ + write_pnet(&sk->sk_net, net); +} + /* * SK_CAN_REUSE and SK_NO_REUSE on a socket mean that the socket is OK * or not whether his port will be reused by someone else. SK_FORCE_REUSE @@ -2054,10 +2066,18 @@ static inline void sk_set_txhash(struct sock *sk) static inline bool sk_rethink_txhash(struct sock *sk) { - if (sk->sk_txhash) { + u8 rehash; + + if (!sk->sk_txhash) + return false; + + rehash = READ_ONCE(sock_net(sk)->core.sysctl_txrehash); + + if (rehash) { sk_set_txhash(sk); return true; } + return false; } @@ -2704,18 +2724,6 @@ static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb) __kfree_skb(skb); } -static inline -struct net *sock_net(const struct sock *sk) -{ - return read_pnet(&sk->sk_net); -} - -static inline -void sock_net_set(struct sock *sk, struct net *net) -{ - write_pnet(&sk->sk_net, net); -} - static inline bool skb_sk_is_prefetched(struct sk_buff *skb) { diff --git a/include/uapi/linux/socket.h b/include/uapi/linux/socket.h index eb0a9a5b6e71..0accd6102ece 100644 --- a/include/uapi/linux/socket.h +++ b/include/uapi/linux/socket.h @@ -31,4 +31,7 @@ struct __kernel_sockaddr_storage { #define SOCK_BUF_LOCK_MASK (SOCK_SNDBUF_LOCK | SOCK_RCVBUF_LOCK) +#define SOCK_TXREHASH_DISABLED 0 +#define SOCK_TXREHASH_ENABLED 1 + #endif /* _UAPI_LINUX_SOCKET_H */ -- cgit v1.2.3 From 26859240e4ee701e0379f08634957adaff67e43a Mon Sep 17 00:00:00 2001 From: Akhmat Karakotov Date: Mon, 31 Jan 2022 16:31:22 +0300 Subject: txhash: Add socket option to control TX hash rethink behavior Add the SO_TXREHASH socket option to control hash rethink behavior per socket. When default mode is set, sockets disable rehash at initialization and use sysctl option when entering listen state. setsockopt() overrides default behavior. Signed-off-by: Akhmat Karakotov Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 12 +++--------- include/uapi/asm-generic/socket.h | 2 ++ include/uapi/linux/socket.h | 1 + 3 files changed, 6 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 0540e1b2aeb0..d6c13f0fba40 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -316,6 +316,7 @@ struct sk_filter; * @sk_rcvtimeo: %SO_RCVTIMEO setting * @sk_sndtimeo: %SO_SNDTIMEO setting * @sk_txhash: computed flow hash for use on transmit + * @sk_txrehash: enable TX hash rethink * @sk_filter: socket filtering instructions * @sk_timer: sock cleanup timer * @sk_stamp: time stamp of last packet received @@ -491,6 +492,7 @@ struct sock { u32 sk_ack_backlog; u32 sk_max_ack_backlog; kuid_t sk_uid; + u8 sk_txrehash; #ifdef CONFIG_NET_RX_BUSY_POLL u8 sk_prefer_busy_poll; u16 sk_busy_poll_budget; @@ -2066,18 +2068,10 @@ static inline void sk_set_txhash(struct sock *sk) static inline bool sk_rethink_txhash(struct sock *sk) { - u8 rehash; - - if (!sk->sk_txhash) - return false; - - rehash = READ_ONCE(sock_net(sk)->core.sysctl_txrehash); - - if (rehash) { + if (sk->sk_txhash && sk->sk_txrehash == SOCK_TXREHASH_ENABLED) { sk_set_txhash(sk); return true; } - return false; } diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index c77a1313b3b0..467ca2f28760 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -128,6 +128,8 @@ #define SO_RESERVE_MEM 73 +#define SO_TXREHASH 74 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__)) diff --git a/include/uapi/linux/socket.h b/include/uapi/linux/socket.h index 0accd6102ece..51d6bb2f6765 100644 --- a/include/uapi/linux/socket.h +++ b/include/uapi/linux/socket.h @@ -31,6 +31,7 @@ struct __kernel_sockaddr_storage { #define SOCK_BUF_LOCK_MASK (SOCK_SNDBUF_LOCK | SOCK_RCVBUF_LOCK) +#define SOCK_TXREHASH_DEFAULT ((u8)-1) #define SOCK_TXREHASH_DISABLED 0 #define SOCK_TXREHASH_ENABLED 1 -- cgit v1.2.3 From 4421a582718ab81608d8486734c18083b822390d Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 30 Jan 2022 12:55:17 +0100 Subject: bpf: Make dst_port field in struct bpf_sock 16-bit wide Menglong Dong reports that the documentation for the dst_port field in struct bpf_sock is inaccurate and confusing. From the BPF program PoV, the field is a zero-padded 16-bit integer in network byte order. The value appears to the BPF user as if laid out in memory as so: offsetof(struct bpf_sock, dst_port) + 0 + 8 +16 0x00 +24 0x00 32-, 16-, and 8-bit wide loads from the field are all allowed, but only if the offset into the field is 0. 32-bit wide loads from dst_port are especially confusing. The loaded value, after converting to host byte order with bpf_ntohl(dst_port), contains the port number in the upper 16-bits. Remove the confusion by splitting the field into two 16-bit fields. For backward compatibility, allow 32-bit wide loads from offsetof(struct bpf_sock, dst_port). While at it, allow loads 8-bit loads at offset [0] and [1] from dst_port. Reported-by: Menglong Dong Signed-off-by: Jakub Sitnicki Link: https://lore.kernel.org/r/20220130115518.213259-2-jakub@cloudflare.com Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4a2f7041ebae..a7f0ddedac1f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5574,7 +5574,8 @@ struct bpf_sock { __u32 src_ip4; __u32 src_ip6[4]; __u32 src_port; /* host byte order */ - __u32 dst_port; /* network byte order */ + __be16 dst_port; /* network byte order */ + __u16 :16; /* zero padding */ __u32 dst_ip4; __u32 dst_ip6[4]; __u32 state; -- cgit v1.2.3 From 6d3ac94bae21de6b6a1d81596752428960012816 Mon Sep 17 00:00:00 2001 From: Yang Guang Date: Fri, 14 Jan 2022 08:11:02 +0800 Subject: ssb: fix boolreturn.cocci warning The coccinelle report ./include/linux/ssb/ssb_driver_gige.h:98:8-9: WARNING: return of 0/1 in function 'ssb_gige_must_flush_posted_writes' with return type bool Return statements in functions returning bool should use true/false instead of 1/0. Reported-by: Zeal Robot Signed-off-by: Yang Guang Signed-off-by: David Yang Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/fa4f1fa737e715eb62a85229ac5f12bae21145cf.1642065490.git.davidcomponentone@gmail.com --- include/linux/ssb/ssb_driver_gige.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ssb/ssb_driver_gige.h b/include/linux/ssb/ssb_driver_gige.h index 15ba0df1ee0d..28c145a51e57 100644 --- a/include/linux/ssb/ssb_driver_gige.h +++ b/include/linux/ssb/ssb_driver_gige.h @@ -95,7 +95,7 @@ static inline bool ssb_gige_must_flush_posted_writes(struct pci_dev *pdev) struct ssb_gige *dev = pdev_to_ssb_gige(pdev); if (dev) return (dev->dev->bus->chip_id == 0x4785); - return 0; + return false; } /* Get the device MAC address */ -- cgit v1.2.3 From 02b2a91c6f0d57df687e666b475849a54f295a12 Mon Sep 17 00:00:00 2001 From: David Girault Date: Tue, 1 Feb 2022 19:09:56 +0100 Subject: net: ieee802154: Provide a kdoc to the address structure Give this structure a header to better explain its content. Signed-off-by: David Girault [miquel.raynal@bootlin.com: Isolate this change from a bigger commit and reword the comment] Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/r/20220201180956.93581-1-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- include/net/cfg802154.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h index 6ed07844eb24..833672d6fbe4 100644 --- a/include/net/cfg802154.h +++ b/include/net/cfg802154.h @@ -227,6 +227,16 @@ static inline void wpan_phy_net_set(struct wpan_phy *wpan_phy, struct net *net) write_pnet(&wpan_phy->_net, net); } +/** + * struct ieee802154_addr - IEEE802.15.4 device address + * @mode: Address mode from frame header. Can be one of: + * - @IEEE802154_ADDR_NONE + * - @IEEE802154_ADDR_SHORT + * - @IEEE802154_ADDR_LONG + * @pan_id: The PAN ID this address belongs to + * @short_addr: address if @mode is @IEEE802154_ADDR_SHORT + * @extended_addr: address if @mode is @IEEE802154_ADDR_LONG + */ struct ieee802154_addr { u8 mode; __le16 pan_id; -- cgit v1.2.3 From 295ab96f478d0fa56393e85406f19a867e26ce22 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 2 Feb 2022 01:03:20 +0100 Subject: net: dsa: provide switch operations for tracking the master state Certain drivers may need to send management traffic to the switch for things like register access, FDB dump, etc, to accelerate what their slow bus (SPI, I2C, MDIO) can already do. Ethernet is faster (especially in bulk transactions) but is also more unreliable, since the user may decide to bring the DSA master down (or not bring it up), therefore severing the link between the host and the attached switch. Drivers needing Ethernet-based register access already should have fallback logic to the slow bus if the Ethernet method fails, but that fallback may be based on a timeout, and the I/O to the switch may slow down to a halt if the master is down, because every Ethernet packet will have to time out. The driver also doesn't have the option to turn off Ethernet-based I/O momentarily, because it wouldn't know when to turn it back on. Which is where this change comes in. By tracking NETDEV_CHANGE, NETDEV_UP and NETDEV_GOING_DOWN events on the DSA master, we should know the exact interval of time during which this interface is reliably available for traffic. Provide this information to switches so they can use it as they wish. An helper is added dsa_port_master_is_operational() to check if a master port is operational. Signed-off-by: Vladimir Oltean Signed-off-by: Ansuel Smith Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/net/dsa.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 57b3e4e7413b..43c4153ef53a 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -278,6 +278,10 @@ struct dsa_port { u8 devlink_port_setup:1; + /* Master state bits, valid only on CPU ports */ + u8 master_admin_up:1; + u8 master_oper_up:1; + u8 setup:1; struct device_node *dn; @@ -478,6 +482,12 @@ static inline bool dsa_port_is_unused(struct dsa_port *dp) return dp->type == DSA_PORT_TYPE_UNUSED; } +static inline bool dsa_port_master_is_operational(struct dsa_port *dp) +{ + return dsa_port_is_cpu(dp) && dp->master_admin_up && + dp->master_oper_up; +} + static inline bool dsa_is_unused_port(struct dsa_switch *ds, int p) { return dsa_to_port(ds, p)->type == DSA_PORT_TYPE_UNUSED; @@ -1036,6 +1046,13 @@ struct dsa_switch_ops { int (*tag_8021q_vlan_add)(struct dsa_switch *ds, int port, u16 vid, u16 flags); int (*tag_8021q_vlan_del)(struct dsa_switch *ds, int port, u16 vid); + + /* + * DSA master tracking operations + */ + void (*master_state_change)(struct dsa_switch *ds, + const struct net_device *master, + bool operational); }; #define DSA_DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes) \ -- cgit v1.2.3 From 3ec762fb13c7e7273800b94c80db1c2cc37590d1 Mon Sep 17 00:00:00 2001 From: Ansuel Smith Date: Wed, 2 Feb 2022 01:03:23 +0100 Subject: net: dsa: tag_qca: move define to include linux/dsa Move tag_qca define to include dir linux/dsa as the qca8k require access to the tagger define to support in-band mdio read/write using ethernet packet. Signed-off-by: Ansuel Smith Reviewed-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/dsa/tag_qca.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 include/linux/dsa/tag_qca.h (limited to 'include') diff --git a/include/linux/dsa/tag_qca.h b/include/linux/dsa/tag_qca.h new file mode 100644 index 000000000000..c02d2d39ff4a --- /dev/null +++ b/include/linux/dsa/tag_qca.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __TAG_QCA_H +#define __TAG_QCA_H + +#define QCA_HDR_LEN 2 +#define QCA_HDR_VERSION 0x2 + +#define QCA_HDR_RECV_VERSION GENMASK(15, 14) +#define QCA_HDR_RECV_PRIORITY GENMASK(13, 11) +#define QCA_HDR_RECV_TYPE GENMASK(10, 6) +#define QCA_HDR_RECV_FRAME_IS_TAGGED BIT(3) +#define QCA_HDR_RECV_SOURCE_PORT GENMASK(2, 0) + +#define QCA_HDR_XMIT_VERSION GENMASK(15, 14) +#define QCA_HDR_XMIT_PRIORITY GENMASK(13, 11) +#define QCA_HDR_XMIT_CONTROL GENMASK(10, 8) +#define QCA_HDR_XMIT_FROM_CPU BIT(7) +#define QCA_HDR_XMIT_DP_BIT GENMASK(6, 0) + +#endif /* __TAG_QCA_H */ -- cgit v1.2.3 From c2ee8181fddb293d296477f60b3eb4fa3ce4e1a6 Mon Sep 17 00:00:00 2001 From: Ansuel Smith Date: Wed, 2 Feb 2022 01:03:25 +0100 Subject: net: dsa: tag_qca: add define for handling mgmt Ethernet packet Add all the required define to prepare support for mgmt read/write in Ethernet packet. Any packet of this type has to be dropped as the only use of these special packet is receive ack for an mgmt write request or receive data for an mgmt read request. A struct is used that emulates the Ethernet header but is used for a different purpose. Signed-off-by: Ansuel Smith Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/dsa/tag_qca.h | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'include') diff --git a/include/linux/dsa/tag_qca.h b/include/linux/dsa/tag_qca.h index c02d2d39ff4a..f366422ab7a0 100644 --- a/include/linux/dsa/tag_qca.h +++ b/include/linux/dsa/tag_qca.h @@ -12,10 +12,54 @@ #define QCA_HDR_RECV_FRAME_IS_TAGGED BIT(3) #define QCA_HDR_RECV_SOURCE_PORT GENMASK(2, 0) +/* Packet type for recv */ +#define QCA_HDR_RECV_TYPE_NORMAL 0x0 +#define QCA_HDR_RECV_TYPE_MIB 0x1 +#define QCA_HDR_RECV_TYPE_RW_REG_ACK 0x2 + #define QCA_HDR_XMIT_VERSION GENMASK(15, 14) #define QCA_HDR_XMIT_PRIORITY GENMASK(13, 11) #define QCA_HDR_XMIT_CONTROL GENMASK(10, 8) #define QCA_HDR_XMIT_FROM_CPU BIT(7) #define QCA_HDR_XMIT_DP_BIT GENMASK(6, 0) +/* Packet type for xmit */ +#define QCA_HDR_XMIT_TYPE_NORMAL 0x0 +#define QCA_HDR_XMIT_TYPE_RW_REG 0x1 + +/* Check code for a valid mgmt packet. Switch will ignore the packet + * with this wrong. + */ +#define QCA_HDR_MGMT_CHECK_CODE_VAL 0x5 + +/* Specific define for in-band MDIO read/write with Ethernet packet */ +#define QCA_HDR_MGMT_SEQ_LEN 4 /* 4 byte for the seq */ +#define QCA_HDR_MGMT_COMMAND_LEN 4 /* 4 byte for the command */ +#define QCA_HDR_MGMT_DATA1_LEN 4 /* First 4 byte for the mdio data */ +#define QCA_HDR_MGMT_HEADER_LEN (QCA_HDR_MGMT_SEQ_LEN + \ + QCA_HDR_MGMT_COMMAND_LEN + \ + QCA_HDR_MGMT_DATA1_LEN) + +#define QCA_HDR_MGMT_DATA2_LEN 12 /* Other 12 byte for the mdio data */ +#define QCA_HDR_MGMT_PADDING_LEN 34 /* Padding to reach the min Ethernet packet */ + +#define QCA_HDR_MGMT_PKT_LEN (QCA_HDR_MGMT_HEADER_LEN + \ + QCA_HDR_LEN + \ + QCA_HDR_MGMT_DATA2_LEN + \ + QCA_HDR_MGMT_PADDING_LEN) + +#define QCA_HDR_MGMT_SEQ_NUM GENMASK(31, 0) /* 63, 32 */ +#define QCA_HDR_MGMT_CHECK_CODE GENMASK(31, 29) /* 31, 29 */ +#define QCA_HDR_MGMT_CMD BIT(28) /* 28 */ +#define QCA_HDR_MGMT_LENGTH GENMASK(23, 20) /* 23, 20 */ +#define QCA_HDR_MGMT_ADDR GENMASK(18, 0) /* 18, 0 */ + +/* Special struct emulating a Ethernet header */ +struct qca_mgmt_ethhdr { + u32 command; /* command bit 31:0 */ + u32 seq; /* seq 63:32 */ + u32 mdio_data; /* first 4byte mdio */ + __be16 hdr; /* qca hdr */ +} __packed; + #endif /* __TAG_QCA_H */ -- cgit v1.2.3 From 18be654a4345f7d937b4bfbad74bea8093e3a93c Mon Sep 17 00:00:00 2001 From: Ansuel Smith Date: Wed, 2 Feb 2022 01:03:26 +0100 Subject: net: dsa: tag_qca: add define for handling MIB packet Add struct to correctly parse a mib Ethernet packet. Signed-off-by: Ansuel Smith Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/dsa/tag_qca.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/dsa/tag_qca.h b/include/linux/dsa/tag_qca.h index f366422ab7a0..1fff57f2937b 100644 --- a/include/linux/dsa/tag_qca.h +++ b/include/linux/dsa/tag_qca.h @@ -62,4 +62,14 @@ struct qca_mgmt_ethhdr { __be16 hdr; /* qca hdr */ } __packed; +enum mdio_cmd { + MDIO_WRITE = 0x0, + MDIO_READ +}; + +struct mib_ethhdr { + u32 data[3]; /* first 3 mib counter */ + __be16 hdr; /* qca hdr */ +} __packed; + #endif /* __TAG_QCA_H */ -- cgit v1.2.3 From 31eb6b4386ad91930417e3f5c8157a4b5e31cbd5 Mon Sep 17 00:00:00 2001 From: Ansuel Smith Date: Wed, 2 Feb 2022 01:03:27 +0100 Subject: net: dsa: tag_qca: add support for handling mgmt and MIB Ethernet packet Add connect/disconnect helper to assign private struct to the DSA switch. Add support for Ethernet mgmt and MIB if the DSA driver provide an handler to correctly parse and elaborate the data. Signed-off-by: Ansuel Smith Reviewed-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/dsa/tag_qca.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/dsa/tag_qca.h b/include/linux/dsa/tag_qca.h index 1fff57f2937b..4359fb0221cf 100644 --- a/include/linux/dsa/tag_qca.h +++ b/include/linux/dsa/tag_qca.h @@ -72,4 +72,11 @@ struct mib_ethhdr { __be16 hdr; /* qca hdr */ } __packed; +struct qca_tagger_data { + void (*rw_reg_ack_handler)(struct dsa_switch *ds, + struct sk_buff *skb); + void (*mib_autocast_handler)(struct dsa_switch *ds, + struct sk_buff *skb); +}; + #endif /* __TAG_QCA_H */ -- cgit v1.2.3 From 5903123f662ed18483f05cac3f9e800a074c29ff Mon Sep 17 00:00:00 2001 From: Akhmat Karakotov Date: Fri, 28 Jan 2022 22:26:21 +0300 Subject: tcp: Use BPF timeout setting for SYN ACK RTO When setting RTO through BPF program, some SYN ACK packets were unaffected and continued to use TCP_TIMEOUT_INIT constant. This patch adds timeout option to struct request_sock. Option is initialized with TCP_TIMEOUT_INIT and is reassigned through BPF using tcp_timeout_init call. SYN ACK retransmits now use newly added timeout option. Signed-off-by: Akhmat Karakotov Acked-by: Martin KaFai Lau v2: - Add timeout option to struct request_sock. Do not call tcp_timeout_init on every syn ack retransmit. v3: - Use unsigned long for min. Bound tcp_timeout_init to TCP_RTO_MAX. v4: - Refactor duplicate code by adding reqsk_timeout function. Signed-off-by: David S. Miller --- include/net/inet_connection_sock.h | 8 ++++++++ include/net/request_sock.h | 2 ++ include/net/tcp.h | 2 +- 3 files changed, 11 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 4ad47d9f9d27..3908296d103f 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -285,6 +285,14 @@ static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk) bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req); void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req); +static inline unsigned long +reqsk_timeout(struct request_sock *req, unsigned long max_timeout) +{ + u64 timeout = (u64)req->timeout << req->num_timeout; + + return (unsigned long)min_t(u64, timeout, max_timeout); +} + static inline void inet_csk_prepare_for_destroy_sock(struct sock *sk) { /* The below has to be done to allow calling inet_csk_destroy_sock */ diff --git a/include/net/request_sock.h b/include/net/request_sock.h index 29e41ff3ec93..144c39db9898 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -70,6 +70,7 @@ struct request_sock { struct saved_syn *saved_syn; u32 secid; u32 peer_secid; + u32 timeout; }; static inline struct request_sock *inet_reqsk(const struct sock *sk) @@ -104,6 +105,7 @@ reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener, sk_node_init(&req_to_sk(req)->sk_node); sk_tx_queue_clear(req_to_sk(req)); req->saved_syn = NULL; + req->timeout = 0; req->num_timeout = 0; req->num_retrans = 0; req->sk = NULL; diff --git a/include/net/tcp.h b/include/net/tcp.h index b9fc978fb2ca..eff2487d972d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2358,7 +2358,7 @@ static inline u32 tcp_timeout_init(struct sock *sk) if (timeout <= 0) timeout = TCP_TIMEOUT_INIT; - return timeout; + return min_t(int, timeout, TCP_RTO_MAX); } static inline u32 tcp_rwnd_init_bpf(struct sock *sk) -- cgit v1.2.3 From 52cc6ffc0ab2c61a76127b9347567fc97c15582f Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 31 Jan 2022 08:40:01 -0800 Subject: page_pool: Refactor page_pool to enable fragmenting after allocation This change is meant to permit a driver to perform "fragmenting" of the page from within the driver instead of the current model which requires pre-partitioning the page. The main motivation behind this is to support use cases where the page will be split up by the driver after DMA instead of before. With this change it becomes possible to start using page pool to replace some of the existing use cases where multiple references were being used for a single page, but the number needed was unknown as the size could be dynamic. For example, with this code it would be possible to do something like the following to handle allocation: page = page_pool_alloc_pages(); if (!page) return NULL; page_pool_fragment_page(page, DRIVER_PAGECNT_BIAS_MAX); rx_buf->page = page; rx_buf->pagecnt_bias = DRIVER_PAGECNT_BIAS_MAX; Then we would process a received buffer by handling it with: rx_buf->pagecnt_bias--; Once the page has been fully consumed we could then flush the remaining instances with: if (page_pool_defrag_page(page, rx_buf->pagecnt_bias)) continue; page_pool_put_defragged_page(pool, page -1, !!budget); The general idea is that we want to have the ability to allocate a page with excess fragment count and then trim off the unneeded fragments. Signed-off-by: Alexander Duyck Reviewed-by: Ilias Apalodimas Signed-off-by: David S. Miller --- include/net/page_pool.h | 82 +++++++++++++++++++++++++++++++------------------ 1 file changed, 52 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/include/net/page_pool.h b/include/net/page_pool.h index 79a805542d0f..97c3c19872ff 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -201,21 +201,67 @@ static inline void page_pool_put_page_bulk(struct page_pool *pool, void **data, } #endif -void page_pool_put_page(struct page_pool *pool, struct page *page, - unsigned int dma_sync_size, bool allow_direct); +void page_pool_put_defragged_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, + bool allow_direct); -/* Same as above but will try to sync the entire area pool->max_len */ -static inline void page_pool_put_full_page(struct page_pool *pool, - struct page *page, bool allow_direct) +static inline void page_pool_fragment_page(struct page *page, long nr) +{ + atomic_long_set(&page->pp_frag_count, nr); +} + +static inline long page_pool_defrag_page(struct page *page, long nr) +{ + long ret; + + /* If nr == pp_frag_count then we have cleared all remaining + * references to the page. No need to actually overwrite it, instead + * we can leave this to be overwritten by the calling function. + * + * The main advantage to doing this is that an atomic_read is + * generally a much cheaper operation than an atomic update, + * especially when dealing with a page that may be partitioned + * into only 2 or 3 pieces. + */ + if (atomic_long_read(&page->pp_frag_count) == nr) + return 0; + + ret = atomic_long_sub_return(nr, &page->pp_frag_count); + WARN_ON(ret < 0); + return ret; +} + +static inline bool page_pool_is_last_frag(struct page_pool *pool, + struct page *page) +{ + /* If fragments aren't enabled or count is 0 we were the last user */ + return !(pool->p.flags & PP_FLAG_PAGE_FRAG) || + (page_pool_defrag_page(page, 1) == 0); +} + +static inline void page_pool_put_page(struct page_pool *pool, + struct page *page, + unsigned int dma_sync_size, + bool allow_direct) { /* When page_pool isn't compiled-in, net/core/xdp.c doesn't * allow registering MEM_TYPE_PAGE_POOL, but shield linker. */ #ifdef CONFIG_PAGE_POOL - page_pool_put_page(pool, page, -1, allow_direct); + if (!page_pool_is_last_frag(pool, page)) + return; + + page_pool_put_defragged_page(pool, page, dma_sync_size, allow_direct); #endif } +/* Same as above but will try to sync the entire area pool->max_len */ +static inline void page_pool_put_full_page(struct page_pool *pool, + struct page *page, bool allow_direct) +{ + page_pool_put_page(pool, page, -1, allow_direct); +} + /* Same as above but the caller must guarantee safe context. e.g NAPI */ static inline void page_pool_recycle_direct(struct page_pool *pool, struct page *page) @@ -243,30 +289,6 @@ static inline void page_pool_set_dma_addr(struct page *page, dma_addr_t addr) page->dma_addr_upper = upper_32_bits(addr); } -static inline void page_pool_set_frag_count(struct page *page, long nr) -{ - atomic_long_set(&page->pp_frag_count, nr); -} - -static inline long page_pool_atomic_sub_frag_count_return(struct page *page, - long nr) -{ - long ret; - - /* As suggested by Alexander, atomic_long_read() may cover up the - * reference count errors, so avoid calling atomic_long_read() in - * the cases of freeing or draining the page_frags, where we would - * not expect it to match or that are slowpath anyway. - */ - if (__builtin_constant_p(nr) && - atomic_long_read(&page->pp_frag_count) == nr) - return 0; - - ret = atomic_long_sub_return(nr, &page->pp_frag_count); - WARN_ON(ret < 0); - return ret; -} - static inline bool is_page_pool_compiled_in(void) { #ifdef CONFIG_PAGE_POOL -- cgit v1.2.3 From 7af4a361a62f59b44ec301f21090823365dd0244 Mon Sep 17 00:00:00 2001 From: Tobias Waldekranz Date: Thu, 3 Feb 2022 11:16:53 +0100 Subject: net: dsa: mv88e6xxx: Improve isolation of standalone ports Clear MapDA on standalone ports to bypass any ATU lookup that might point the packet in the wrong direction. This means that all packets are flooded using the PVT config. So make sure that standalone ports are only allowed to communicate with the local upstream port. Here is a scenario in which this is needed: CPU | .----. .---0---. | .--0--. | sw0 | | | sw1 | '-1-2-3-' | '-1-2-' '---' - sw0p1 and sw1p1 are bridged - sw0p2 and sw1p2 are in standalone mode - Learning must be enabled on sw0p3 in order for hardware forwarding to work properly between bridged ports 1. A packet with SA :aa comes in on sw1p2 1a. Egresses sw1p0 1b. Ingresses sw0p3, ATU adds an entry for :aa towards port 3 1c. Egresses sw0p0 2. A packet with DA :aa comes in on sw0p2 2a. If an ATU lookup is done at this point, the packet will be incorrectly forwarded towards sw0p3. With this change in place, the ATU is bypassed and the packet is forwarded in accordance with the PVT, which only contains the CPU port. Signed-off-by: Tobias Waldekranz Reviewed-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/net/dsa.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 43c4153ef53a..6e5ef62a7dce 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -591,6 +591,18 @@ static inline bool dsa_is_upstream_port(struct dsa_switch *ds, int port) return port == dsa_upstream_port(ds, port); } +/* Return the local port used to reach the CPU port */ +static inline unsigned int dsa_switch_upstream_port(struct dsa_switch *ds) +{ + struct dsa_port *dp; + + dsa_switch_for_each_available_port(dp, ds) { + return dsa_upstream_port(ds, dp->index); + } + + return ds->num_ports; +} + /* Return true if @upstream_ds is an upstream switch of @downstream_ds, meaning * that the routing port from @downstream_ds to @upstream_ds is also the port * which @downstream_ds uses to reach its dedicated CPU. -- cgit v1.2.3 From d352b20f4174a6bd998992329b773ab513232880 Mon Sep 17 00:00:00 2001 From: Tobias Waldekranz Date: Thu, 3 Feb 2022 11:16:56 +0100 Subject: net: dsa: mv88e6xxx: Improve multichip isolation of standalone ports Given that standalone ports are now configured to bypass the ATU and forward all frames towards the upstream port, extend the ATU bypass to multichip systems. Load VID 0 (standalone) into the VTU with the policy bit set. Since VID 4095 (bridged) is already loaded, we now know that all VIDs in use are always available in all VTUs. Therefore, we can safely enable 802.1Q on DSA ports. Setting the DSA ports' VTU policy to TRAP means that all incoming frames on VID 0 will be classified as MGMT - as a result, the ATU is bypassed on all subsequent switches. With this isolation in place, we are able to support configurations that are simultaneously very quirky and very useful. Quirky because it involves looping cables between local switchports like in this example: CPU | .------. .---0---. | .----0----. | sw0 | | | sw1 | '-1-2-3-' | '-1-2-3-4-' $ @ '---' $ @ % % We have three physically looped pairs ($, @, and %). This is very useful because it allows us to run the kernel's kselftests for the bridge on mv88e6xxx hardware. Signed-off-by: Tobias Waldekranz Reviewed-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/net/dsa.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 6e5ef62a7dce..ca8c14b547b4 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -591,6 +591,12 @@ static inline bool dsa_is_upstream_port(struct dsa_switch *ds, int port) return port == dsa_upstream_port(ds, port); } +/* Return true if this is a DSA port leading away from the CPU */ +static inline bool dsa_is_downstream_port(struct dsa_switch *ds, int port) +{ + return dsa_is_dsa_port(ds, port) && !dsa_is_upstream_port(ds, port); +} + /* Return the local port used to reach the CPU port */ static inline unsigned int dsa_switch_upstream_port(struct dsa_switch *ds) { -- cgit v1.2.3 From 00edb2bac29f2e9c1674e85479fa5e3aa8cc648f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 26 Jan 2022 10:55:44 -0800 Subject: i40e: remove enum i40e_client_state It's not used. Signed-off-by: Jakub Kicinski Reviewed-by: Jesse Brandeburg Tested-by: Gurucharan G Signed-off-by: Tony Nguyen --- include/linux/net/intel/i40e_client.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include') diff --git a/include/linux/net/intel/i40e_client.h b/include/linux/net/intel/i40e_client.h index 6b3267b49755..ed42bd5f639f 100644 --- a/include/linux/net/intel/i40e_client.h +++ b/include/linux/net/intel/i40e_client.h @@ -26,11 +26,6 @@ struct i40e_client_version { u8 rsvd; }; -enum i40e_client_state { - __I40E_CLIENT_NULL, - __I40E_CLIENT_REGISTERED -}; - enum i40e_client_instance_state { __I40E_CLIENT_INSTANCE_NONE, __I40E_CLIENT_INSTANCE_OPENED, @@ -190,11 +185,6 @@ struct i40e_client { const struct i40e_client_ops *ops; /* client ops provided by the client */ }; -static inline bool i40e_client_is_registered(struct i40e_client *client) -{ - return test_bit(__I40E_CLIENT_REGISTERED, &client->state); -} - void i40e_client_device_register(struct i40e_info *ldev, struct i40e_client *client); void i40e_client_device_unregister(struct i40e_info *ldev); -- cgit v1.2.3 From b794eecb2af77145d36b9c28f056e242add9c4b2 Mon Sep 17 00:00:00 2001 From: Dave Ertman Date: Tue, 23 Nov 2021 10:25:36 -0800 Subject: ice: add support for DSCP QoS for IDC The ice driver provides QoS information to auxiliary drivers through the exported function ice_get_qos_params. This function doesn't currently support L3 DSCP QoS. Add the necessary defines, structure elements and code to support DSCP QoS through the IIDC functions. Signed-off-by: Dave Ertman Signed-off-by: Shiraz Saleem Signed-off-by: Tony Nguyen --- include/linux/net/intel/iidc.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/net/intel/iidc.h b/include/linux/net/intel/iidc.h index 1289593411d3..1c1332e4df26 100644 --- a/include/linux/net/intel/iidc.h +++ b/include/linux/net/intel/iidc.h @@ -32,6 +32,8 @@ enum iidc_rdma_protocol { }; #define IIDC_MAX_USER_PRIORITY 8 +#define IIDC_MAX_DSCP_MAPPING 64 +#define IIDC_DSCP_PFC_MODE 0x1 /* Struct to hold per RDMA Qset info */ struct iidc_rdma_qset_params { @@ -60,6 +62,8 @@ struct iidc_qos_params { u8 vport_relative_bw; u8 vport_priority_type; u8 num_tc; + u8 pfc_mode; + u8 dscp_map[IIDC_MAX_DSCP_MAPPING]; }; struct iidc_event { -- cgit v1.2.3 From 8b5413647262dda8d8d0e07e14ea1de9ac7cf0b2 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 17 Jan 2022 21:56:13 +0100 Subject: netfilter: nfqueue: enable to get skb->priority This info could be useful to improve traffic analysis. Signed-off-by: Nicolas Dichtel Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nfnetlink_queue.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/netfilter/nfnetlink_queue.h b/include/uapi/linux/netfilter/nfnetlink_queue.h index aed90c4df0c8..ef7c97f21a15 100644 --- a/include/uapi/linux/netfilter/nfnetlink_queue.h +++ b/include/uapi/linux/netfilter/nfnetlink_queue.h @@ -61,6 +61,7 @@ enum nfqnl_attr_type { NFQA_SECCTX, /* security context string */ NFQA_VLAN, /* nested attribute: packet vlan info */ NFQA_L2HDR, /* full L2 header */ + NFQA_PRIORITY, /* skb->priority */ __NFQA_MAX }; -- cgit v1.2.3 From bb62a765b1b5597d32a426096aa78d2a8eb6b091 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 20 Jan 2022 13:06:59 +0100 Subject: netfilter: conntrack: make all extensions 8-byte alignned All extensions except one need 8 byte alignment, so just make that the default. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_extend.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h index c7515d82ab06..705a4487f023 100644 --- a/include/net/netfilter/nf_conntrack_extend.h +++ b/include/net/netfilter/nf_conntrack_extend.h @@ -49,7 +49,7 @@ enum nf_ct_ext_id { struct nf_ct_ext { u8 offset[NF_CT_EXT_NUM]; u8 len; - char data[]; + char data[] __aligned(8); }; static inline bool __nf_ct_ext_exist(const struct nf_ct_ext *ext, u8 id) @@ -83,10 +83,7 @@ struct nf_ct_ext_type { void (*destroy)(struct nf_conn *ct); enum nf_ct_ext_id id; - - /* Length and min alignment. */ u8 len; - u8 align; }; int nf_ct_extend_register(const struct nf_ct_ext_type *type); -- cgit v1.2.3 From 5f31edc0676b55cd6baf5ba119d1881f3fbadee1 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 20 Jan 2022 13:07:00 +0100 Subject: netfilter: conntrack: move extension sizes into core No need to specify this in the registration modules, we already collect all sizes for build-time checks on the maximum combined size. After this change, all extensions except nat have no meaningful content in their nf_ct_ext_type struct definition. Next patch handles nat, this will then allow to remove the dynamic register api completely. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_extend.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h index 705a4487f023..87d818414092 100644 --- a/include/net/netfilter/nf_conntrack_extend.h +++ b/include/net/netfilter/nf_conntrack_extend.h @@ -83,7 +83,6 @@ struct nf_ct_ext_type { void (*destroy)(struct nf_conn *ct); enum nf_ct_ext_id id; - u8 len; }; int nf_ct_extend_register(const struct nf_ct_ext_type *type); -- cgit v1.2.3 From 1bc91a5ddf3eaea0e0ea957cccf3abdcfcace00e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 20 Jan 2022 13:07:01 +0100 Subject: netfilter: conntrack: handle ->destroy hook via nat_ops instead The nat module already exposes a few functions to the conntrack core. Move the nat extension destroy hook to it. After this, no conntrack extension needs a destroy hook. 'struct nf_ct_ext_type' and the register/unregister api can be removed in a followup patch. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 1 + include/net/netfilter/nf_conntrack_extend.h | 3 --- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 15e71bfff726..c2c6f332fb90 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -379,6 +379,7 @@ struct nf_nat_hook { unsigned int (*manip_pkt)(struct sk_buff *skb, struct nf_conn *ct, enum nf_nat_manip_type mtype, enum ip_conntrack_dir dir); + void (*remove_nat_bysrc)(struct nf_conn *ct); }; extern const struct nf_nat_hook __rcu *nf_nat_hook; diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h index 87d818414092..343f9194423a 100644 --- a/include/net/netfilter/nf_conntrack_extend.h +++ b/include/net/netfilter/nf_conntrack_extend.h @@ -79,9 +79,6 @@ void nf_ct_ext_destroy(struct nf_conn *ct); void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp); struct nf_ct_ext_type { - /* Destroys relationships (can be NULL). */ - void (*destroy)(struct nf_conn *ct); - enum nf_ct_ext_id id; }; -- cgit v1.2.3 From 1015c3de23eedb8ac5a163165434484df44cfe00 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 20 Jan 2022 13:07:02 +0100 Subject: netfilter: conntrack: remove extension register api These no longer register/unregister a meaningful structure so remove it. Cc: Paul Blakey Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_acct.h | 1 - include/net/netfilter/nf_conntrack_ecache.h | 13 ------------- include/net/netfilter/nf_conntrack_extend.h | 9 --------- include/net/netfilter/nf_conntrack_labels.h | 3 --- include/net/netfilter/nf_conntrack_seqadj.h | 3 --- include/net/netfilter/nf_conntrack_timeout.h | 12 ------------ include/net/netfilter/nf_conntrack_timestamp.h | 13 ------------- 7 files changed, 54 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_acct.h b/include/net/netfilter/nf_conntrack_acct.h index 7f44a771530e..4b2b7f8914ea 100644 --- a/include/net/netfilter/nf_conntrack_acct.h +++ b/include/net/netfilter/nf_conntrack_acct.h @@ -78,7 +78,6 @@ static inline void nf_ct_acct_update(struct nf_conn *ct, u32 dir, void nf_conntrack_acct_pernet_init(struct net *net); -int nf_conntrack_acct_init(void); void nf_conntrack_acct_fini(void); #endif /* _NF_CONNTRACK_ACCT_H */ diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h index d932e22edcb4..16bcff809b18 100644 --- a/include/net/netfilter/nf_conntrack_ecache.h +++ b/include/net/netfilter/nf_conntrack_ecache.h @@ -166,9 +166,6 @@ void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state); void nf_conntrack_ecache_pernet_init(struct net *net); void nf_conntrack_ecache_pernet_fini(struct net *net); -int nf_conntrack_ecache_init(void); -void nf_conntrack_ecache_fini(void); - static inline bool nf_conntrack_ecache_dwork_pending(const struct net *net) { return net->ct.ecache_dwork_pending; @@ -194,16 +191,6 @@ static inline void nf_conntrack_ecache_pernet_init(struct net *net) static inline void nf_conntrack_ecache_pernet_fini(struct net *net) { } - -static inline int nf_conntrack_ecache_init(void) -{ - return 0; -} - -static inline void nf_conntrack_ecache_fini(void) -{ -} - static inline bool nf_conntrack_ecache_dwork_pending(const struct net *net) { return false; } #endif /* CONFIG_NF_CONNTRACK_EVENTS */ #endif /*_NF_CONNTRACK_ECACHE_H*/ diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h index 343f9194423a..96635ad2acc7 100644 --- a/include/net/netfilter/nf_conntrack_extend.h +++ b/include/net/netfilter/nf_conntrack_extend.h @@ -72,16 +72,7 @@ static inline void *__nf_ct_ext_find(const struct nf_conn *ct, u8 id) #define nf_ct_ext_find(ext, id) \ ((id##_TYPE *)__nf_ct_ext_find((ext), (id))) -/* Destroy all relationships */ -void nf_ct_ext_destroy(struct nf_conn *ct); - /* Add this type, returns pointer to data or NULL. */ void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp); -struct nf_ct_ext_type { - enum nf_ct_ext_id id; -}; - -int nf_ct_extend_register(const struct nf_ct_ext_type *type); -void nf_ct_extend_unregister(const struct nf_ct_ext_type *type); #endif /* _NF_CONNTRACK_EXTEND_H */ diff --git a/include/net/netfilter/nf_conntrack_labels.h b/include/net/netfilter/nf_conntrack_labels.h index ba916411c4e1..3c23298e68ca 100644 --- a/include/net/netfilter/nf_conntrack_labels.h +++ b/include/net/netfilter/nf_conntrack_labels.h @@ -45,12 +45,9 @@ int nf_connlabels_replace(struct nf_conn *ct, #ifdef CONFIG_NF_CONNTRACK_LABELS int nf_conntrack_labels_init(void); -void nf_conntrack_labels_fini(void); int nf_connlabels_get(struct net *net, unsigned int bit); void nf_connlabels_put(struct net *net); #else -static inline int nf_conntrack_labels_init(void) { return 0; } -static inline void nf_conntrack_labels_fini(void) {} static inline int nf_connlabels_get(struct net *net, unsigned int bit) { return 0; } static inline void nf_connlabels_put(struct net *net) {} #endif diff --git a/include/net/netfilter/nf_conntrack_seqadj.h b/include/net/netfilter/nf_conntrack_seqadj.h index 0a10b50537ae..883c414b768e 100644 --- a/include/net/netfilter/nf_conntrack_seqadj.h +++ b/include/net/netfilter/nf_conntrack_seqadj.h @@ -42,7 +42,4 @@ int nf_ct_seq_adjust(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int protoff); s32 nf_ct_seq_offset(const struct nf_conn *ct, enum ip_conntrack_dir, u32 seq); -int nf_conntrack_seqadj_init(void); -void nf_conntrack_seqadj_fini(void); - #endif /* _NF_CONNTRACK_SEQADJ_H */ diff --git a/include/net/netfilter/nf_conntrack_timeout.h b/include/net/netfilter/nf_conntrack_timeout.h index 659b0ea25b4d..db507e4a65bb 100644 --- a/include/net/netfilter/nf_conntrack_timeout.h +++ b/include/net/netfilter/nf_conntrack_timeout.h @@ -89,23 +89,11 @@ static inline unsigned int *nf_ct_timeout_lookup(const struct nf_conn *ct) } #ifdef CONFIG_NF_CONNTRACK_TIMEOUT -int nf_conntrack_timeout_init(void); -void nf_conntrack_timeout_fini(void); void nf_ct_untimeout(struct net *net, struct nf_ct_timeout *timeout); int nf_ct_set_timeout(struct net *net, struct nf_conn *ct, u8 l3num, u8 l4num, const char *timeout_name); void nf_ct_destroy_timeout(struct nf_conn *ct); #else -static inline int nf_conntrack_timeout_init(void) -{ - return 0; -} - -static inline void nf_conntrack_timeout_fini(void) -{ - return; -} - static inline int nf_ct_set_timeout(struct net *net, struct nf_conn *ct, u8 l3num, u8 l4num, const char *timeout_name) diff --git a/include/net/netfilter/nf_conntrack_timestamp.h b/include/net/netfilter/nf_conntrack_timestamp.h index 820ea34b6029..57138d974a9f 100644 --- a/include/net/netfilter/nf_conntrack_timestamp.h +++ b/include/net/netfilter/nf_conntrack_timestamp.h @@ -40,21 +40,8 @@ struct nf_conn_tstamp *nf_ct_tstamp_ext_add(struct nf_conn *ct, gfp_t gfp) #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP void nf_conntrack_tstamp_pernet_init(struct net *net); - -int nf_conntrack_tstamp_init(void); -void nf_conntrack_tstamp_fini(void); #else static inline void nf_conntrack_tstamp_pernet_init(struct net *net) {} - -static inline int nf_conntrack_tstamp_init(void) -{ - return 0; -} - -static inline void nf_conntrack_tstamp_fini(void) -{ - return; -} #endif /* CONFIG_NF_CONNTRACK_TIMESTAMP */ #endif /* _NF_CONNTRACK_TSTAMP_H */ -- cgit v1.2.3 From 20ff32024624102596f2b4083a17a97ca71d6cd8 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 20 Jan 2022 16:09:13 +0100 Subject: netfilter: conntrack: pptp: use single option structure Instead of exposing the four hooks individually use a sinle hook ops structure. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_conntrack_pptp.h | 38 +++++++++++++---------------- 1 file changed, 17 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/nf_conntrack_pptp.h b/include/linux/netfilter/nf_conntrack_pptp.h index a28aa289afdc..c3bdb4370938 100644 --- a/include/linux/netfilter/nf_conntrack_pptp.h +++ b/include/linux/netfilter/nf_conntrack_pptp.h @@ -300,26 +300,22 @@ union pptp_ctrl_union { struct PptpSetLinkInfo setlink; }; -extern int -(*nf_nat_pptp_hook_outbound)(struct sk_buff *skb, - struct nf_conn *ct, enum ip_conntrack_info ctinfo, - unsigned int protoff, - struct PptpControlHeader *ctlh, - union pptp_ctrl_union *pptpReq); - -extern int -(*nf_nat_pptp_hook_inbound)(struct sk_buff *skb, - struct nf_conn *ct, enum ip_conntrack_info ctinfo, - unsigned int protoff, - struct PptpControlHeader *ctlh, - union pptp_ctrl_union *pptpReq); - -extern void -(*nf_nat_pptp_hook_exp_gre)(struct nf_conntrack_expect *exp_orig, - struct nf_conntrack_expect *exp_reply); - -extern void -(*nf_nat_pptp_hook_expectfn)(struct nf_conn *ct, - struct nf_conntrack_expect *exp); +struct nf_nat_pptp_hook { + int (*outbound)(struct sk_buff *skb, + struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, + struct PptpControlHeader *ctlh, + union pptp_ctrl_union *pptpReq); + int (*inbound)(struct sk_buff *skb, + struct nf_conn *ct, enum ip_conntrack_info ctinfo, + unsigned int protoff, + struct PptpControlHeader *ctlh, + union pptp_ctrl_union *pptpReq); + void (*exp_gre)(struct nf_conntrack_expect *exp_orig, + struct nf_conntrack_expect *exp_reply); + void (*expectfn)(struct nf_conn *ct, + struct nf_conntrack_expect *exp); +}; +extern const struct nf_nat_pptp_hook __rcu *nf_nat_pptp_hook; #endif /* _NF_CONNTRACK_PPTP_H */ -- cgit v1.2.3 From b93235e68921b9acd38ee309953a3a9808105289 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 2 Feb 2022 14:20:31 -0800 Subject: tls: cap the output scatter list to something reasonable TLS recvmsg() passes user pages as destination for decrypt. The decrypt operation is repeated record by record, each record being 16kB, max. TLS allocates an sg_table and uses iov_iter_get_pages() to populate it with enough pages to fit the decrypted record. Even though we decrypt a single message at a time we size the sg_table based on the entire length of the iovec. This leads to unnecessarily large allocations, risking triggering OOM conditions. Use iov_iter_truncate() / iov_iter_reexpand() to construct a "capped" version of iov_iter_npages(). Alternatively we could parametrize iov_iter_npages() to take the size as arg instead of using i->count, or do something else.. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/uio.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include') diff --git a/include/linux/uio.h b/include/linux/uio.h index 1198a2bfc9bf..739285fe5a2f 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -273,6 +273,23 @@ static inline void iov_iter_reexpand(struct iov_iter *i, size_t count) i->count = count; } +static inline int +iov_iter_npages_cap(struct iov_iter *i, int maxpages, size_t max_bytes) +{ + size_t shorted = 0; + int npages; + + if (iov_iter_count(i) > max_bytes) { + shorted = iov_iter_count(i) - max_bytes; + iov_iter_truncate(i, max_bytes); + } + npages = iov_iter_npages(i, INT_MAX); + if (shorted) + iov_iter_reexpand(i, iov_iter_count(i) + shorted); + + return npages; +} + struct csum_state { __wsum csum; size_t off; -- cgit v1.2.3 From bed89478934a9803d1a4d97574dcc30e509aa972 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 2 Feb 2022 10:49:38 +0200 Subject: ieee80211: fix -Wcast-qual warnings When enabling -Wcast-qual e.g. via W=3, we get a lot of warnings from this file, whenever it's included. Since the fixes are simple, just do that. Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20220202104617.79ec4a4bab29.I8177a0c79d656c552e22c88931d8da06f2977896@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 559b6c644938..60ee7b3f58e7 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2427,7 +2427,7 @@ struct ieee80211_tx_pwr_env { static inline u8 ieee80211_he_oper_size(const u8 *he_oper_ie) { - struct ieee80211_he_operation *he_oper = (void *)he_oper_ie; + const struct ieee80211_he_operation *he_oper = (const void *)he_oper_ie; u8 oper_len = sizeof(struct ieee80211_he_operation); u32 he_oper_params; @@ -2460,7 +2460,7 @@ ieee80211_he_oper_size(const u8 *he_oper_ie) static inline const struct ieee80211_he_6ghz_oper * ieee80211_he_6ghz_oper(const struct ieee80211_he_operation *he_oper) { - const u8 *ret = (void *)&he_oper->optional; + const u8 *ret = (const void *)&he_oper->optional; u32 he_oper_params; if (!he_oper) @@ -2475,7 +2475,7 @@ ieee80211_he_6ghz_oper(const struct ieee80211_he_operation *he_oper) if (he_oper_params & IEEE80211_HE_OPERATION_CO_HOSTED_BSS) ret++; - return (void *)ret; + return (const void *)ret; } /* HE Spatial Reuse defines */ @@ -2496,7 +2496,7 @@ ieee80211_he_6ghz_oper(const struct ieee80211_he_operation *he_oper) static inline u8 ieee80211_he_spr_size(const u8 *he_spr_ie) { - struct ieee80211_he_spr *he_spr = (void *)he_spr_ie; + const struct ieee80211_he_spr *he_spr = (const void *)he_spr_ie; u8 spr_len = sizeof(struct ieee80211_he_spr); u8 he_spr_params; -- cgit v1.2.3 From 7e367b06f16b3d22e884af55117601d2a73fca03 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 2 Feb 2022 10:49:39 +0200 Subject: cfg80211: fix -Wcast-qual warnings When enabling -Wcast-qual e.g. via W=3, we get a lot of warnings from this file, whenever it's included. Since the fixes are simple, just do that. Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20220202104617.6a1d52213019.I92d82e7251cf712faa43fd09db3142327a3bce3d@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index d19e48f9fdc6..f6db085ff3df 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2604,7 +2604,7 @@ const struct element *ieee80211_bss_get_elem(struct cfg80211_bss *bss, u8 id); */ static inline const u8 *ieee80211_bss_get_ie(struct cfg80211_bss *bss, u8 id) { - return (void *)ieee80211_bss_get_elem(bss, id); + return (const void *)ieee80211_bss_get_elem(bss, id); } @@ -5970,9 +5970,9 @@ cfg80211_find_ie_match(u8 eid, const u8 *ies, unsigned int len, (!match_len && match_offset))) return NULL; - return (void *)cfg80211_find_elem_match(eid, ies, len, - match, match_len, - match_offset ? + return (const void *)cfg80211_find_elem_match(eid, ies, len, + match, match_len, + match_offset ? match_offset - 2 : 0); } @@ -6099,7 +6099,7 @@ static inline const u8 * cfg80211_find_vendor_ie(unsigned int oui, int oui_type, const u8 *ies, unsigned int len) { - return (void *)cfg80211_find_vendor_elem(oui, oui_type, ies, len); + return (const void *)cfg80211_find_vendor_elem(oui, oui_type, ies, len); } /** -- cgit v1.2.3 From 5beb53d6ba4f39743d057e756db22bd8c079fc21 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 2 Feb 2022 10:49:40 +0200 Subject: ieee80211: radiotap: fix -Wcast-qual warnings When enabling -Wcast-qual e.g. via W=3, we get a lot of warnings from this file, whenever it's included. Since the fixes are simple, just do that. Signed-off-by: Johannes Berg Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20220202104617.cc733aeb1a18.I03396e1bf7a1af364cbd0916037f65d800035039@changeid Signed-off-by: Johannes Berg --- include/net/ieee80211_radiotap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/ieee80211_radiotap.h b/include/net/ieee80211_radiotap.h index 11630351c978..598f53d2a3a0 100644 --- a/include/net/ieee80211_radiotap.h +++ b/include/net/ieee80211_radiotap.h @@ -1,6 +1,6 @@ /* * Copyright (c) 2017 Intel Deutschland GmbH - * Copyright (c) 2018-2019 Intel Corporation + * Copyright (c) 2018-2019, 2021 Intel Corporation * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -365,7 +365,7 @@ enum ieee80211_radiotap_zero_len_psdu_type { */ static inline u16 ieee80211_get_radiotap_len(const char *data) { - struct ieee80211_radiotap_header *hdr = (void *)data; + const struct ieee80211_radiotap_header *hdr = (const void *)data; return get_unaligned_le16(&hdr->it_len); } -- cgit v1.2.3 From ea5907db2a9ccf37fdb6d1e67bcb620c1fea10f8 Mon Sep 17 00:00:00 2001 From: Avraham Stern Date: Wed, 2 Feb 2022 10:49:47 +0200 Subject: mac80211: fix struct ieee80211_tx_info size The size of the status_driver_data field was not adjusted when the is_valid_ack_signal field was added. Since the size of struct ieee80211_tx_info is limited, replace the is_valid_ack_signal field with a flags field, and adjust the struct size accordingly. Signed-off-by: Avraham Stern Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20220202104617.0ff363d4fa56.I45792c0187034a6d0e1c99a7db741996ef7caba3@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index c50221d7e82c..bd6912d0292b 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -7,7 +7,7 @@ * Copyright 2007-2010 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH - * Copyright (C) 2018 - 2021 Intel Corporation + * Copyright (C) 2018 - 2022 Intel Corporation */ #ifndef MAC80211_H @@ -883,6 +883,17 @@ enum mac80211_tx_control_flags { IEEE80211_TX_CTRL_DONT_REORDER = BIT(8), }; +/** + * enum mac80211_tx_status_flags - flags to describe transmit status + * + * @IEEE80211_TX_STATUS_ACK_SIGNAL_VALID: ACK signal is valid + * + * These flags are used in tx_info->status.flags. + */ +enum mac80211_tx_status_flags { + IEEE80211_TX_STATUS_ACK_SIGNAL_VALID = BIT(0), +}; + /* * This definition is used as a mask to clear all temporary flags, which are * set by the tx handlers for each transmission attempt by the mac80211 stack. @@ -1046,7 +1057,7 @@ ieee80211_rate_get_vht_nss(const struct ieee80211_tx_rate *rate) * @status.antenna: (legacy, kept only for iwlegacy) * @status.tx_time: airtime consumed for transmission; note this is only * used for WMM AC, not for airtime fairness - * @status.is_valid_ack_signal: ACK signal is valid + * @status.flags: status flags, see &enum mac80211_tx_status_flags * @status.status_driver_data: driver use area * @ack: union part for pure ACK data * @ack.cookie: cookie for the ACK @@ -1099,8 +1110,8 @@ struct ieee80211_tx_info { u8 ampdu_len; u8 antenna; u16 tx_time; - bool is_valid_ack_signal; - void *status_driver_data[19 / sizeof(void *)]; + u8 flags; + void *status_driver_data[18 / sizeof(void *)]; } status; struct { struct ieee80211_tx_rate driver_rates[ -- cgit v1.2.3 From e70e13e7d4ab8f932f49db1c9500b30a34a6d420 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Fri, 4 Feb 2022 01:55:18 +0100 Subject: bpf: Implement bpf_core_types_are_compat(). Adopt libbpf's bpf_core_types_are_compat() for kernel duty by adding explicit recursion limit of 2 which is enough to handle 2 levels of function prototypes. Signed-off-by: Matteo Croce Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220204005519.60361-2-mcroce@linux.microsoft.com --- include/linux/btf.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/btf.h b/include/linux/btf.h index f6c43dd513fa..36bc09b8e890 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -327,6 +327,11 @@ static inline const struct btf_var_secinfo *btf_type_var_secinfo( return (const struct btf_var_secinfo *)(t + 1); } +static inline struct btf_param *btf_params(const struct btf_type *t) +{ + return (struct btf_param *)(t + 1); +} + #ifdef CONFIG_BPF_SYSCALL struct bpf_prog; -- cgit v1.2.3 From c78b8b20e34920231eda02fb40c7aca7d88be837 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 3 Feb 2022 15:12:40 -0800 Subject: net: don't include ndisc.h from ipv6.h Nothing in ipv6.h needs ndisc.h, drop it. Link: https://lore.kernel.org/r/20220203043457.2222388-1-kuba@kernel.org Acked-by: Jeremy Kerr Acked-by: Stefan Schmidt Link: https://lore.kernel.org/r/20220203231240.2297588-1-kuba@kernel.org Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- include/net/ipv6.h | 1 - include/net/ipv6_frag.h | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 082f30256f59..cda1f205f391 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include diff --git a/include/net/ipv6_frag.h b/include/net/ipv6_frag.h index 0a4779175a52..5052c66e22d2 100644 --- a/include/net/ipv6_frag.h +++ b/include/net/ipv6_frag.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _IPV6_FRAG_H #define _IPV6_FRAG_H +#include #include #include #include -- cgit v1.2.3 From be847673cfffce8bb6e9ed6ae186081280c58831 Mon Sep 17 00:00:00 2001 From: Justin Iurman Date: Wed, 2 Feb 2022 15:25:53 +0100 Subject: uapi: ioam: Insertion frequency Add the insertion frequency uapi for IOAM lwtunnels. Signed-off-by: Justin Iurman Signed-off-by: Jakub Kicinski --- include/uapi/linux/ioam6_iptunnel.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/ioam6_iptunnel.h b/include/uapi/linux/ioam6_iptunnel.h index 829ffdfcacca..38f6a8fdfd34 100644 --- a/include/uapi/linux/ioam6_iptunnel.h +++ b/include/uapi/linux/ioam6_iptunnel.h @@ -41,6 +41,15 @@ enum { /* IOAM Trace Header */ IOAM6_IPTUNNEL_TRACE, /* struct ioam6_trace_hdr */ + /* Insertion frequency: + * "k over n" packets (0 < k <= n) + * [0.0001% ... 100%] + */ +#define IOAM6_IPTUNNEL_FREQ_MIN 1 +#define IOAM6_IPTUNNEL_FREQ_MAX 1000000 + IOAM6_IPTUNNEL_FREQ_K, /* u32 */ + IOAM6_IPTUNNEL_FREQ_N, /* u32 */ + __IOAM6_IPTUNNEL_MAX, }; -- cgit v1.2.3 From 35d39fecbc242150af5587506e58ec1f8541fb68 Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Thu, 3 Feb 2022 10:44:30 +0200 Subject: net/sched: Enable tc skb ext allocation on chain miss only when needed Currently tc skb extension is used to send miss info from tc to ovs datapath module, and driver to tc. For the tc to ovs miss it is currently always allocated even if it will not be used by ovs datapath (as it depends on a requested feature). Export the static key which is used by openvswitch module to guard this code path as well, so it will be skipped if ovs datapath doesn't need it. Enable this code path once ovs datapath needs it. Signed-off-by: Paul Blakey Reviewed-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 676cb8ea9e15..a3b57a93228a 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -1028,4 +1028,15 @@ struct tc_fifo_qopt_offload { }; }; +#ifdef CONFIG_NET_CLS_ACT +DECLARE_STATIC_KEY_FALSE(tc_skb_ext_tc); +void tc_skb_ext_tc_enable(void); +void tc_skb_ext_tc_disable(void); +#define tc_skb_ext_tc_enabled() static_branch_unlikely(&tc_skb_ext_tc) +#else /* CONFIG_NET_CLS_ACT */ +static inline void tc_skb_ext_tc_enable(void) { } +static inline void tc_skb_ext_tc_disable(void) { } +#define tc_skb_ext_tc_enabled() false +#endif + #endif -- cgit v1.2.3 From de5a1f3ce4c862150dc442530dba19e1d1dc6bc2 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 4 Feb 2022 12:28:37 +0100 Subject: net: gro: minor optimization for dev_gro_receive() While inspecting some perf report, I noticed that the compiler emits suboptimal code for the napi CB initialization, fetching and storing multiple times the memory for flags bitfield. This is with gcc 10.3.1, but I observed the same with older compiler versions. We can help the compiler to do a nicer work clearing several fields at once using an u32 alias. The generated code is quite smaller, with the same number of conditional. Before: objdump -t net/core/gro.o | grep " F .text" 0000000000000bb0 l F .text 0000000000000357 dev_gro_receive After: 0000000000000bb0 l F .text 000000000000033c dev_gro_receive v1 -> v2: - use struct_group (Alexander and Alex) RFC -> v1: - use __struct_group to delimit the zeroed area (Alexander) Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/gro.h | 52 ++++++++++++++++++++++++++++------------------------ 1 file changed, 28 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/net/gro.h b/include/net/gro.h index 8f75802d50fd..a765fedda5c4 100644 --- a/include/net/gro.h +++ b/include/net/gro.h @@ -29,46 +29,50 @@ struct napi_gro_cb { /* Number of segments aggregated. */ u16 count; - /* Start offset for remote checksum offload */ - u16 gro_remcsum_start; + /* Used in ipv6_gro_receive() and foo-over-udp */ + u16 proto; /* jiffies when first packet was created/queued */ unsigned long age; - /* Used in ipv6_gro_receive() and foo-over-udp */ - u16 proto; + /* portion of the cb set to zero at every gro iteration */ + struct_group(zeroed, + + /* Start offset for remote checksum offload */ + u16 gro_remcsum_start; - /* This is non-zero if the packet may be of the same flow. */ - u8 same_flow:1; + /* This is non-zero if the packet may be of the same flow. */ + u8 same_flow:1; - /* Used in tunnel GRO receive */ - u8 encap_mark:1; + /* Used in tunnel GRO receive */ + u8 encap_mark:1; - /* GRO checksum is valid */ - u8 csum_valid:1; + /* GRO checksum is valid */ + u8 csum_valid:1; - /* Number of checksums via CHECKSUM_UNNECESSARY */ - u8 csum_cnt:3; + /* Number of checksums via CHECKSUM_UNNECESSARY */ + u8 csum_cnt:3; - /* Free the skb? */ - u8 free:2; + /* Free the skb? */ + u8 free:2; #define NAPI_GRO_FREE 1 #define NAPI_GRO_FREE_STOLEN_HEAD 2 - /* Used in foo-over-udp, set in udp[46]_gro_receive */ - u8 is_ipv6:1; + /* Used in foo-over-udp, set in udp[46]_gro_receive */ + u8 is_ipv6:1; - /* Used in GRE, set in fou/gue_gro_receive */ - u8 is_fou:1; + /* Used in GRE, set in fou/gue_gro_receive */ + u8 is_fou:1; - /* Used to determine if flush_id can be ignored */ - u8 is_atomic:1; + /* Used to determine if flush_id can be ignored */ + u8 is_atomic:1; - /* Number of gro_receive callbacks this packet already went through */ - u8 recursion_counter:4; + /* Number of gro_receive callbacks this packet already went through */ + u8 recursion_counter:4; - /* GRO is done by frag_list pointer chaining. */ - u8 is_flist:1; + /* GRO is done by frag_list pointer chaining. */ + u8 is_flist:1; + ); /* used to support CHECKSUM_COMPLETE for tunneling protocols */ __wsum csum; -- cgit v1.2.3 From 0463e320421b313db320bbcf2928ebf49f556b67 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 4 Feb 2022 11:42:10 +0000 Subject: net: phylink: remove phylink_set_10g_modes() phylink_set_10g_modes() is no longer used with the conversion of drivers to phylink_generic_validate(), so we can remove it. Signed-off-by: Russell King (Oracle) Signed-off-by: David S. Miller --- include/linux/phylink.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 713a0c928b7c..cca149f78d35 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -582,7 +582,6 @@ int phylink_speed_up(struct phylink *pl); #define phylink_test(bm, mode) __phylink_do_bit(test_bit, bm, mode) void phylink_set_port_modes(unsigned long *bits); -void phylink_set_10g_modes(unsigned long *mask); void phylink_helper_basex_speed(struct phylink_link_state *state); void phylink_mii_c22_pcs_decode_state(struct phylink_link_state *state, -- cgit v1.2.3 From 145c7a793838add5e004e7d49a67654dc7eba147 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 4 Feb 2022 12:15:45 -0800 Subject: ipv6: make mc_forwarding atomic This fixes minor data-races in ip6_mc_input() and batadv_mcast_mla_rtr_flags_softif_get_ipv6() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/ipv6.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 1e0f8a31f3de..16870f86c74d 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -51,7 +51,7 @@ struct ipv6_devconf { __s32 use_optimistic; #endif #ifdef CONFIG_IPV6_MROUTE - __s32 mc_forwarding; + atomic_t mc_forwarding; #endif __s32 disable_ipv6; __s32 drop_unicast_in_l2_multicast; -- cgit v1.2.3 From e3ececfe668facd87d920b608349a32607060e66 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 4 Feb 2022 14:42:35 -0800 Subject: ref_tracker: implement use-after-free detection Whenever ref_tracker_dir_init() is called, mark the struct ref_tracker_dir as dead. Test the dead status from ref_tracker_alloc() and ref_tracker_free() This should detect buggy dev_put()/dev_hold() happening too late in netdevice dismantle process. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/ref_tracker.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/ref_tracker.h b/include/linux/ref_tracker.h index 60f3453be23e..a443abda937d 100644 --- a/include/linux/ref_tracker.h +++ b/include/linux/ref_tracker.h @@ -13,6 +13,7 @@ struct ref_tracker_dir { spinlock_t lock; unsigned int quarantine_avail; refcount_t untracked; + bool dead; struct list_head list; /* List of active trackers */ struct list_head quarantine; /* List of dead trackers */ #endif @@ -26,6 +27,7 @@ static inline void ref_tracker_dir_init(struct ref_tracker_dir *dir, INIT_LIST_HEAD(&dir->quarantine); spin_lock_init(&dir->lock); dir->quarantine_avail = quarantine_count; + dir->dead = false; refcount_set(&dir->untracked, 1); stack_depot_init(); } -- cgit v1.2.3 From 8fd5522f44dcd7f05454ddc4f16d0f821b676cd9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 4 Feb 2022 14:42:36 -0800 Subject: ref_tracker: add a count of untracked references We are still chasing a netdev refcount imbalance, and we suspect we have one rogue dev_put() that is consuming a reference taken from a dev_hold_track() To detect this case, allow ref_tracker_alloc() and ref_tracker_free() to be called with a NULL @trackerp parameter, and use a dedicated refcount_t just for them. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/ref_tracker.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/ref_tracker.h b/include/linux/ref_tracker.h index a443abda937d..9ca353ab712b 100644 --- a/include/linux/ref_tracker.h +++ b/include/linux/ref_tracker.h @@ -13,6 +13,7 @@ struct ref_tracker_dir { spinlock_t lock; unsigned int quarantine_avail; refcount_t untracked; + refcount_t no_tracker; bool dead; struct list_head list; /* List of active trackers */ struct list_head quarantine; /* List of dead trackers */ @@ -29,6 +30,7 @@ static inline void ref_tracker_dir_init(struct ref_tracker_dir *dir, dir->quarantine_avail = quarantine_count; dir->dead = false; refcount_set(&dir->untracked, 1); + refcount_set(&dir->no_tracker, 1); stack_depot_init(); } -- cgit v1.2.3 From 4c6c11ea0f7b00a1894803efe980dfaf3b074886 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 4 Feb 2022 14:42:37 -0800 Subject: net: refine dev_put()/dev_hold() debugging We are still chasing some syzbot reports where we think a rogue dev_put() is called with no corresponding prior dev_hold(). Unfortunately it eats a reference on dev->dev_refcnt taken by innocent dev_hold_track(), meaning that the refcount saturation splat comes too late to be useful. Make sure that 'not tracked' dev_put() and dev_hold() better use CONFIG_NET_DEV_REFCNT_TRACKER=y debug infrastructure: Prior patch in the series allowed ref_tracker_alloc() and ref_tracker_free() to be called with a NULL @trackerp parameter, and to use a separate refcount only to detect too many put() even in the following case: dev_hold_track(dev, tracker_1, GFP_ATOMIC); dev_hold(dev); dev_put(dev); dev_put(dev); // Should complain loudly here. dev_put_track(dev, tracker_1); // instead of here Add clarification about netdev_tracker_alloc() role. v2: I replaced the dev_put() in linkwatch_do_dev() with __dev_put() because callers called netdev_tracker_free(). Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 69 +++++++++++++++++++++++++++++++---------------- 1 file changed, 46 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e490b84732d1..3fb6fb67ed77 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3817,14 +3817,7 @@ extern unsigned int netdev_budget_usecs; /* Called by rtnetlink.c:rtnl_unlock() */ void netdev_run_todo(void); -/** - * dev_put - release reference to device - * @dev: network device - * - * Release reference to device to allow it to be freed. - * Try using dev_put_track() instead. - */ -static inline void dev_put(struct net_device *dev) +static inline void __dev_put(struct net_device *dev) { if (dev) { #ifdef CONFIG_PCPU_DEV_REFCNT @@ -3835,14 +3828,7 @@ static inline void dev_put(struct net_device *dev) } } -/** - * dev_hold - get reference to device - * @dev: network device - * - * Hold reference to device to keep it from being freed. - * Try using dev_hold_track() instead. - */ -static inline void dev_hold(struct net_device *dev) +static inline void __dev_hold(struct net_device *dev) { if (dev) { #ifdef CONFIG_PCPU_DEV_REFCNT @@ -3853,11 +3839,24 @@ static inline void dev_hold(struct net_device *dev) } } +static inline void __netdev_tracker_alloc(struct net_device *dev, + netdevice_tracker *tracker, + gfp_t gfp) +{ +#ifdef CONFIG_NET_DEV_REFCNT_TRACKER + ref_tracker_alloc(&dev->refcnt_tracker, tracker, gfp); +#endif +} + +/* netdev_tracker_alloc() can upgrade a prior untracked reference + * taken by dev_get_by_name()/dev_get_by_index() to a tracked one. + */ static inline void netdev_tracker_alloc(struct net_device *dev, netdevice_tracker *tracker, gfp_t gfp) { #ifdef CONFIG_NET_DEV_REFCNT_TRACKER - ref_tracker_alloc(&dev->refcnt_tracker, tracker, gfp); + refcount_dec(&dev->refcnt_tracker.no_tracker); + __netdev_tracker_alloc(dev, tracker, gfp); #endif } @@ -3873,8 +3872,8 @@ static inline void dev_hold_track(struct net_device *dev, netdevice_tracker *tracker, gfp_t gfp) { if (dev) { - dev_hold(dev); - netdev_tracker_alloc(dev, tracker, gfp); + __dev_hold(dev); + __netdev_tracker_alloc(dev, tracker, gfp); } } @@ -3883,10 +3882,34 @@ static inline void dev_put_track(struct net_device *dev, { if (dev) { netdev_tracker_free(dev, tracker); - dev_put(dev); + __dev_put(dev); } } +/** + * dev_hold - get reference to device + * @dev: network device + * + * Hold reference to device to keep it from being freed. + * Try using dev_hold_track() instead. + */ +static inline void dev_hold(struct net_device *dev) +{ + dev_hold_track(dev, NULL, GFP_ATOMIC); +} + +/** + * dev_put - release reference to device + * @dev: network device + * + * Release reference to device to allow it to be freed. + * Try using dev_put_track() instead. + */ +static inline void dev_put(struct net_device *dev) +{ + dev_put_track(dev, NULL); +} + static inline void dev_replace_track(struct net_device *odev, struct net_device *ndev, netdevice_tracker *tracker, @@ -3895,11 +3918,11 @@ static inline void dev_replace_track(struct net_device *odev, if (odev) netdev_tracker_free(odev, tracker); - dev_hold(ndev); - dev_put(odev); + __dev_hold(ndev); + __dev_put(odev); if (ndev) - netdev_tracker_alloc(ndev, tracker, gfp); + __netdev_tracker_alloc(ndev, tracker, gfp); } /* Carrier loss detection, dial on demand. The functions netif_carrier_on -- cgit v1.2.3 From 5a8fb33e530512ee67a11b30f3451a4f030f4b01 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 4 Feb 2022 20:56:14 -0800 Subject: skmsg: convert struct sk_msg_sg::copy to a bitmap We have plans for increasing MAX_SKB_FRAGS, but sk_msg_sg::copy is currently an unsigned long, limiting MAX_SKB_FRAGS to 30 on 32bit arches. Convert it to a bitmap, as Jakub suggested. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skmsg.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 18a717fe62eb..1ff68a88c58d 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -29,7 +29,7 @@ struct sk_msg_sg { u32 end; u32 size; u32 copybreak; - unsigned long copy; + DECLARE_BITMAP(copy, MAX_MSG_FRAGS + 2); /* The extra two elements: * 1) used for chaining the front and sections when the list becomes * partitioned (e.g. end < start). The crypto APIs require the @@ -38,7 +38,6 @@ struct sk_msg_sg { */ struct scatterlist data[MAX_MSG_FRAGS + 2]; }; -static_assert(BITS_PER_LONG >= NR_MSG_FRAG_IDS); /* UAPI in filter.c depends on struct sk_msg_sg being first element. */ struct sk_msg { @@ -234,7 +233,7 @@ static inline void sk_msg_compute_data_pointers(struct sk_msg *msg) { struct scatterlist *sge = sk_msg_elem(msg, msg->sg.start); - if (test_bit(msg->sg.start, &msg->sg.copy)) { + if (test_bit(msg->sg.start, msg->sg.copy)) { msg->data = NULL; msg->data_end = NULL; } else { @@ -253,7 +252,7 @@ static inline void sk_msg_page_add(struct sk_msg *msg, struct page *page, sg_set_page(sge, page, len, offset); sg_unmark_end(sge); - __set_bit(msg->sg.end, &msg->sg.copy); + __set_bit(msg->sg.end, msg->sg.copy); msg->sg.size += len; sk_msg_iter_next(msg, end); } @@ -262,9 +261,9 @@ static inline void sk_msg_sg_copy(struct sk_msg *msg, u32 i, bool copy_state) { do { if (copy_state) - __set_bit(i, &msg->sg.copy); + __set_bit(i, msg->sg.copy); else - __clear_bit(i, &msg->sg.copy); + __clear_bit(i, msg->sg.copy); sk_msg_iter_var_next(i); if (i == msg->sg.end) break; -- cgit v1.2.3 From 9c1be1935fb68b2413796cdc03d019b8cf35ab51 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 5 Feb 2022 09:01:25 -0800 Subject: net: initialize init_net earlier While testing a patch that will follow later ("net: add netns refcount tracker to struct nsproxy") I found that devtmpfs_init() was called before init_net was initialized. This is a bug, because devtmpfs_setup() calls ksys_unshare(CLONE_NEWNS); This has the effect of increasing init_net refcount, which will be later overwritten to 1, as part of setup_net(&init_net) We had too many prior patches [1] trying to work around the root cause. Really, make sure init_net is in BSS section, and that net_ns_init() is called earlier at boot time. Note that another patch ("vfs: add netns refcount tracker to struct fs_context") also will need net_ns_init() being called before vfs_caches_init() As a bonus, this patch saves around 4KB in .data section. [1] f8c46cb39079 ("netns: do not call pernet ops for not yet set up init_net namespace") b5082df8019a ("net: Initialise init_net.count to 1") 734b65417b24 ("net: Statically initialize init_net.dev_base_head") v2: fixed a build error reported by kernel build bots (CONFIG_NET=n) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/net_namespace.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 5b61c462e534..374cc7b260fc 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -513,4 +513,10 @@ static inline void fnhe_genid_bump(struct net *net) atomic_inc(&net->fnhe_genid); } +#ifdef CONFIG_NET +void net_ns_init(void); +#else +static inline void net_ns_init(void) {} +#endif + #endif /* __NET_NET_NAMESPACE_H */ -- cgit v1.2.3 From 88590b369354092183bcba04e2368010c462557f Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 5 Feb 2022 15:47:33 +0800 Subject: net: skb_drop_reason: add document for drop reasons Add document for following existing drop reasons: SKB_DROP_REASON_NOT_SPECIFIED SKB_DROP_REASON_NO_SOCKET SKB_DROP_REASON_PKT_TOO_SMALL SKB_DROP_REASON_TCP_CSUM SKB_DROP_REASON_SOCKET_FILTER SKB_DROP_REASON_UDP_CSUM Signed-off-by: Menglong Dong Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/skbuff.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a27bcc4f7e9a..f04e3a1f4455 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -314,12 +314,12 @@ struct sk_buff; * used to translate the reason to string. */ enum skb_drop_reason { - SKB_DROP_REASON_NOT_SPECIFIED, - SKB_DROP_REASON_NO_SOCKET, - SKB_DROP_REASON_PKT_TOO_SMALL, - SKB_DROP_REASON_TCP_CSUM, - SKB_DROP_REASON_SOCKET_FILTER, - SKB_DROP_REASON_UDP_CSUM, + SKB_DROP_REASON_NOT_SPECIFIED, /* drop reason is not specified */ + SKB_DROP_REASON_NO_SOCKET, /* socket not found */ + SKB_DROP_REASON_PKT_TOO_SMALL, /* packet size is too small */ + SKB_DROP_REASON_TCP_CSUM, /* TCP checksum error */ + SKB_DROP_REASON_SOCKET_FILTER, /* dropped by socket filter */ + SKB_DROP_REASON_UDP_CSUM, /* UDP checksum error */ SKB_DROP_REASON_MAX, }; -- cgit v1.2.3 From 2df3041ba3be950376e8c25a8f6da22f7fcc765c Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 5 Feb 2022 15:47:34 +0800 Subject: net: netfilter: use kfree_drop_reason() for NF_DROP Replace kfree_skb() with kfree_skb_reason() in nf_hook_slow() when skb is dropped by reason of NF_DROP. Following new drop reasons are introduced: SKB_DROP_REASON_NETFILTER_DROP Signed-off-by: Menglong Dong Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + include/trace/events/skb.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f04e3a1f4455..9060159b4375 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -320,6 +320,7 @@ enum skb_drop_reason { SKB_DROP_REASON_TCP_CSUM, /* TCP checksum error */ SKB_DROP_REASON_SOCKET_FILTER, /* dropped by socket filter */ SKB_DROP_REASON_UDP_CSUM, /* UDP checksum error */ + SKB_DROP_REASON_NETFILTER_DROP, /* dropped by netfilter */ SKB_DROP_REASON_MAX, }; diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index a8a64b97504d..3d89f7b09a43 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -16,6 +16,7 @@ EM(SKB_DROP_REASON_TCP_CSUM, TCP_CSUM) \ EM(SKB_DROP_REASON_SOCKET_FILTER, SOCKET_FILTER) \ EM(SKB_DROP_REASON_UDP_CSUM, UDP_CSUM) \ + EM(SKB_DROP_REASON_NETFILTER_DROP, NETFILTER_DROP) \ EMe(SKB_DROP_REASON_MAX, MAX) #undef EM -- cgit v1.2.3 From 33cba42985c8144eef78d618fc1e51aaa074b169 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 5 Feb 2022 15:47:35 +0800 Subject: net: ipv4: use kfree_skb_reason() in ip_rcv_core() Replace kfree_skb() with kfree_skb_reason() in ip_rcv_core(). Three new drop reasons are introduced: SKB_DROP_REASON_OTHERHOST SKB_DROP_REASON_IP_CSUM SKB_DROP_REASON_IP_INHDR Signed-off-by: Menglong Dong Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/skbuff.h | 9 +++++++++ include/trace/events/skb.h | 3 +++ 2 files changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9060159b4375..8e82130b3c52 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -321,6 +321,15 @@ enum skb_drop_reason { SKB_DROP_REASON_SOCKET_FILTER, /* dropped by socket filter */ SKB_DROP_REASON_UDP_CSUM, /* UDP checksum error */ SKB_DROP_REASON_NETFILTER_DROP, /* dropped by netfilter */ + SKB_DROP_REASON_OTHERHOST, /* packet don't belong to current + * host (interface is in promisc + * mode) + */ + SKB_DROP_REASON_IP_CSUM, /* IP checksum error */ + SKB_DROP_REASON_IP_INHDR, /* there is something wrong with + * IP header (see + * IPSTATS_MIB_INHDRERRORS) + */ SKB_DROP_REASON_MAX, }; diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index 3d89f7b09a43..f2b1778485f0 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -17,6 +17,9 @@ EM(SKB_DROP_REASON_SOCKET_FILTER, SOCKET_FILTER) \ EM(SKB_DROP_REASON_UDP_CSUM, UDP_CSUM) \ EM(SKB_DROP_REASON_NETFILTER_DROP, NETFILTER_DROP) \ + EM(SKB_DROP_REASON_OTHERHOST, OTHERHOST) \ + EM(SKB_DROP_REASON_IP_CSUM, IP_CSUM) \ + EM(SKB_DROP_REASON_IP_INHDR, IP_INHDR) \ EMe(SKB_DROP_REASON_MAX, MAX) #undef EM -- cgit v1.2.3 From c1f166d1f7eef212096a98b22f5acf92f9af353d Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 5 Feb 2022 15:47:36 +0800 Subject: net: ipv4: use kfree_skb_reason() in ip_rcv_finish_core() Replace kfree_skb() with kfree_skb_reason() in ip_rcv_finish_core(), following drop reasons are introduced: SKB_DROP_REASON_IP_RPFILTER SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST Signed-off-by: Menglong Dong Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/skbuff.h | 9 +++++++++ include/trace/events/skb.h | 3 +++ 2 files changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 8e82130b3c52..4baba45f223d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -330,6 +330,15 @@ enum skb_drop_reason { * IP header (see * IPSTATS_MIB_INHDRERRORS) */ + SKB_DROP_REASON_IP_RPFILTER, /* IP rpfilter validate failed. + * see the document for rp_filter + * in ip-sysctl.rst for more + * information + */ + SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST, /* destination address of L2 + * is multicast, but L3 is + * unicast. + */ SKB_DROP_REASON_MAX, }; diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index f2b1778485f0..485a1d3034a4 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -20,6 +20,9 @@ EM(SKB_DROP_REASON_OTHERHOST, OTHERHOST) \ EM(SKB_DROP_REASON_IP_CSUM, IP_CSUM) \ EM(SKB_DROP_REASON_IP_INHDR, IP_INHDR) \ + EM(SKB_DROP_REASON_IP_RPFILTER, IP_RPFILTER) \ + EM(SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST, \ + UNICAST_IN_L2_MULTICAST) \ EMe(SKB_DROP_REASON_MAX, MAX) #undef EM -- cgit v1.2.3 From 10580c4791902571777603cb980414892dd5f149 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 5 Feb 2022 15:47:37 +0800 Subject: net: ipv4: use kfree_skb_reason() in ip_protocol_deliver_rcu() Replace kfree_skb() with kfree_skb_reason() in ip_protocol_deliver_rcu(). Following new drop reasons are introduced: SKB_DROP_REASON_XFRM_POLICY SKB_DROP_REASON_IP_NOPROTO Signed-off-by: Menglong Dong Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 ++ include/trace/events/skb.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 4baba45f223d..2a64afa97910 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -339,6 +339,8 @@ enum skb_drop_reason { * is multicast, but L3 is * unicast. */ + SKB_DROP_REASON_XFRM_POLICY, /* xfrm policy check failed */ + SKB_DROP_REASON_IP_NOPROTO, /* no support for IP protocol */ SKB_DROP_REASON_MAX, }; diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index 485a1d3034a4..985e481c092d 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -23,6 +23,8 @@ EM(SKB_DROP_REASON_IP_RPFILTER, IP_RPFILTER) \ EM(SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST, \ UNICAST_IN_L2_MULTICAST) \ + EM(SKB_DROP_REASON_XFRM_POLICY, XFRM_POLICY) \ + EM(SKB_DROP_REASON_IP_NOPROTO, IP_NOPROTO) \ EMe(SKB_DROP_REASON_MAX, MAX) #undef EM -- cgit v1.2.3 From 08d4c0370c400fa6ef2194f9ee2e8dccc4a7ab39 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 5 Feb 2022 15:47:39 +0800 Subject: net: udp: use kfree_skb_reason() in __udp_queue_rcv_skb() Replace kfree_skb() with kfree_skb_reason() in __udp_queue_rcv_skb(). Following new drop reasons are introduced: SKB_DROP_REASON_SOCKET_RCVBUFF SKB_DROP_REASON_PROTO_MEM Signed-off-by: Menglong Dong Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/skbuff.h | 5 +++++ include/trace/events/skb.h | 2 ++ 2 files changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2a64afa97910..a5adbf6b51e8 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -341,6 +341,11 @@ enum skb_drop_reason { */ SKB_DROP_REASON_XFRM_POLICY, /* xfrm policy check failed */ SKB_DROP_REASON_IP_NOPROTO, /* no support for IP protocol */ + SKB_DROP_REASON_SOCKET_RCVBUFF, /* socket receive buff is full */ + SKB_DROP_REASON_PROTO_MEM, /* proto memory limition, such as + * udp packet drop out of + * udp_memory_allocated. + */ SKB_DROP_REASON_MAX, }; diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index 985e481c092d..cfcfd26399f7 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -25,6 +25,8 @@ UNICAST_IN_L2_MULTICAST) \ EM(SKB_DROP_REASON_XFRM_POLICY, XFRM_POLICY) \ EM(SKB_DROP_REASON_IP_NOPROTO, IP_NOPROTO) \ + EM(SKB_DROP_REASON_SOCKET_RCVBUFF, SOCKET_RCVBUFF) \ + EM(SKB_DROP_REASON_PROTO_MEM, PROTO_MEM) \ EMe(SKB_DROP_REASON_MAX, MAX) #undef EM -- cgit v1.2.3 From 3486bedd99196ecdfe99c0ab5b67ad3c47e8a8fa Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 4 Feb 2022 10:57:35 -0800 Subject: bpf: Use bytes instead of pages for bpf_jit_[charge|uncharge]_modmem This enables sub-page memory charge and allocation. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220204185742.271030-3-song@kernel.org --- include/linux/bpf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6eb0b180d33b..366f88afd56b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -846,8 +846,8 @@ void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym); void bpf_image_ksym_del(struct bpf_ksym *ksym); void bpf_ksym_add(struct bpf_ksym *ksym); void bpf_ksym_del(struct bpf_ksym *ksym); -int bpf_jit_charge_modmem(u32 pages); -void bpf_jit_uncharge_modmem(u32 pages); +int bpf_jit_charge_modmem(u32 size); +void bpf_jit_uncharge_modmem(u32 size); bool bpf_prog_has_trampoline(const struct bpf_prog *prog); #else static inline int bpf_trampoline_link_prog(struct bpf_prog *prog, -- cgit v1.2.3 From ed2d9e1a26cca963ff5ed3b76326d70f7d8201a9 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 4 Feb 2022 10:57:36 -0800 Subject: bpf: Use size instead of pages in bpf_binary_header This is necessary to charge sub page memory for the BPF program. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220204185742.271030-4-song@kernel.org --- include/linux/filter.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index d23e999dc032..5855eb474c62 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -548,7 +548,7 @@ struct sock_fprog_kern { #define BPF_IMAGE_ALIGNMENT 8 struct bpf_binary_header { - u32 pages; + u32 size; u8 image[] __aligned(BPF_IMAGE_ALIGNMENT); }; @@ -886,8 +886,8 @@ static inline void bpf_prog_lock_ro(struct bpf_prog *fp) static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr) { set_vm_flush_reset_perms(hdr); - set_memory_ro((unsigned long)hdr, hdr->pages); - set_memory_x((unsigned long)hdr, hdr->pages); + set_memory_ro((unsigned long)hdr, hdr->size >> PAGE_SHIFT); + set_memory_x((unsigned long)hdr, hdr->size >> PAGE_SHIFT); } static inline struct bpf_binary_header * -- cgit v1.2.3 From ebc1415d9b4f043cef5a1fb002ec316e32167e7a Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 4 Feb 2022 10:57:39 -0800 Subject: bpf: Introduce bpf_arch_text_copy This will be used to copy JITed text to RO protected module memory. On x86, bpf_arch_text_copy is implemented with text_poke_copy. bpf_arch_text_copy returns pointer to dst on success, and ERR_PTR(errno) on errors. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220204185742.271030-7-song@kernel.org --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 366f88afd56b..ea0d7fd4a410 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2362,6 +2362,8 @@ enum bpf_text_poke_type { int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, void *addr1, void *addr2); +void *bpf_arch_text_copy(void *dst, void *src, size_t len); + struct btf_id_set; bool btf_id_set_contains(const struct btf_id_set *set, u32 id); -- cgit v1.2.3 From 33c9805860e584b194199cab1a1e81f4e6395408 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 4 Feb 2022 10:57:41 -0800 Subject: bpf: Introduce bpf_jit_binary_pack_[alloc|finalize|free] This is the jit binary allocator built on top of bpf_prog_pack. bpf_prog_pack allocates RO memory, which cannot be used directly by the JIT engine. Therefore, a temporary rw buffer is allocated for the JIT engine. Once JIT is done, bpf_jit_binary_pack_finalize is used to copy the program to the RO memory. bpf_jit_binary_pack_alloc reserves 16 bytes of extra space for illegal instructions, which is small than the 128 bytes space reserved by bpf_jit_binary_alloc. This change is necessary for bpf_jit_binary_hdr to find the correct header. Also, flag use_bpf_prog_pack is added to differentiate a program allocated by bpf_jit_binary_pack_alloc. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220204185742.271030-9-song@kernel.org --- include/linux/bpf.h | 1 + include/linux/filter.h | 21 ++++++++++++--------- 2 files changed, 13 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ea0d7fd4a410..2fc7e5c5ef41 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -953,6 +953,7 @@ struct bpf_prog_aux { bool sleepable; bool tail_call_reachable; bool xdp_has_frags; + bool use_bpf_prog_pack; struct hlist_node tramp_hlist; /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */ const struct btf_type *attach_func_proto; diff --git a/include/linux/filter.h b/include/linux/filter.h index 5855eb474c62..1cb1af917617 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -890,15 +890,6 @@ static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr) set_memory_x((unsigned long)hdr, hdr->size >> PAGE_SHIFT); } -static inline struct bpf_binary_header * -bpf_jit_binary_hdr(const struct bpf_prog *fp) -{ - unsigned long real_start = (unsigned long)fp->bpf_func; - unsigned long addr = real_start & PAGE_MASK; - - return (void *)addr; -} - int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap); static inline int sk_filter(struct sock *sk, struct sk_buff *skb) { @@ -1068,6 +1059,18 @@ void *bpf_jit_alloc_exec(unsigned long size); void bpf_jit_free_exec(void *addr); void bpf_jit_free(struct bpf_prog *fp); +struct bpf_binary_header * +bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **ro_image, + unsigned int alignment, + struct bpf_binary_header **rw_hdr, + u8 **rw_image, + bpf_jit_fill_hole_t bpf_fill_ill_insns); +int bpf_jit_binary_pack_finalize(struct bpf_prog *prog, + struct bpf_binary_header *ro_header, + struct bpf_binary_header *rw_header); +void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header, + struct bpf_binary_header *rw_header); + int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, struct bpf_jit_poke_descriptor *poke); -- cgit v1.2.3 From a410a0cf98854a698a519bfbeb604145da384c0e Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 4 Feb 2022 14:58:11 +0100 Subject: ipv6: Define dscp_t and stop taking ECN bits into account in fib6-rules MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Define a dscp_t type and its appropriate helpers that ensure ECN bits are not taken into account when handling DSCP. Use this new type to replace the tclass field of struct fib6_rule, so that fib6-rules don't get influenced by ECN bits anymore. Before this patch, fib6-rules didn't make any distinction between the DSCP and ECN bits. Therefore, rules specifying a DSCP (tos or dsfield options in iproute2) stopped working as soon a packets had at least one of its ECN bits set (as a work around one could create four rules for each DSCP value to match, one for each possible ECN value). After this patch fib6-rules only compare the DSCP bits. ECN doesn't influence the result anymore. Also, fib6-rules now must have the ECN bits cleared or they will be rejected. Signed-off-by: Guillaume Nault Acked-by: David Ahern Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Jakub Kicinski --- include/net/inet_dscp.h | 57 +++++++++++++++++++++++++++++++++++++++++++++++++ include/net/ipv6.h | 6 ++++++ 2 files changed, 63 insertions(+) create mode 100644 include/net/inet_dscp.h (limited to 'include') diff --git a/include/net/inet_dscp.h b/include/net/inet_dscp.h new file mode 100644 index 000000000000..72f250dffada --- /dev/null +++ b/include/net/inet_dscp.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * inet_dscp.h: helpers for handling differentiated services codepoints (DSCP) + * + * DSCP is defined in RFC 2474: + * + * 0 1 2 3 4 5 6 7 + * +---+---+---+---+---+---+---+---+ + * | DSCP | CU | + * +---+---+---+---+---+---+---+---+ + * + * DSCP: differentiated services codepoint + * CU: currently unused + * + * The whole DSCP + CU bits form the DS field. + * The DS field is also commonly called TOS or Traffic Class (for IPv6). + * + * Note: the CU bits are now used for Explicit Congestion Notification + * (RFC 3168). + */ + +#ifndef _INET_DSCP_H +#define _INET_DSCP_H + +#include + +/* Special type for storing DSCP values. + * + * A dscp_t variable stores a DS field with the CU (ECN) bits cleared. + * Using dscp_t allows to strictly separate DSCP and ECN bits, thus avoiding + * bugs where ECN bits are erroneously taken into account during FIB lookups + * or policy routing. + * + * Note: to get the real DSCP value contained in a dscp_t variable one would + * have to do a bit shift after calling inet_dscp_to_dsfield(). We could have + * a helper for that, but there's currently no users. + */ +typedef u8 __bitwise dscp_t; + +#define INET_DSCP_MASK 0xfc + +static inline dscp_t inet_dsfield_to_dscp(__u8 dsfield) +{ + return (__force dscp_t)(dsfield & INET_DSCP_MASK); +} + +static inline __u8 inet_dscp_to_dsfield(dscp_t dscp) +{ + return (__force __u8)dscp; +} + +static inline bool inet_validate_dscp(__u8 val) +{ + return !(val & ~INET_DSCP_MASK); +} + +#endif /* _INET_DSCP_H */ diff --git a/include/net/ipv6.h b/include/net/ipv6.h index cda1f205f391..f693784e1419 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -974,6 +975,11 @@ static inline u8 ip6_tclass(__be32 flowinfo) return ntohl(flowinfo & IPV6_TCLASS_MASK) >> IPV6_TCLASS_SHIFT; } +static inline dscp_t ip6_dscp(__be32 flowinfo) +{ + return inet_dsfield_to_dscp(ip6_tclass(flowinfo)); +} + static inline __be32 ip6_make_flowinfo(unsigned int tclass, __be32 flowlabel) { return htonl(tclass << IPV6_TCLASS_SHIFT) | flowlabel; -- cgit v1.2.3 From f55fbb6afb8d701e3185e31e73f5ea9503a66744 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 4 Feb 2022 14:58:16 +0100 Subject: ipv4: Reject routes specifying ECN bits in rtm_tos MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the new dscp_t type to replace the fc_tos field of fib_config, to ensure IPv4 routes aren't influenced by ECN bits when configured with non-zero rtm_tos. Before this patch, IPv4 routes specifying an rtm_tos with some of the ECN bits set were accepted. However they wouldn't work (never match) as IPv4 normally clears the ECN bits with IPTOS_RT_MASK before doing a FIB lookup (although a few buggy code paths don't). After this patch, IPv4 routes specifying an rtm_tos with any ECN bit set is rejected. Note: IPv6 routes ignore rtm_tos altogether, any rtm_tos is accepted, but treated as if it were 0. Signed-off-by: Guillaume Nault Acked-by: David Ahern Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: Jakub Kicinski --- include/net/ip_fib.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index c4297704bbcb..6a82bcb8813b 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -24,7 +25,7 @@ struct fib_config { u8 fc_dst_len; - u8 fc_tos; + dscp_t fc_dscp; u8 fc_protocol; u8 fc_scope; u8 fc_type; -- cgit v1.2.3 From b2309a71c1f2fc841feb184195b2e46b2e139bf4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 7 Feb 2022 10:41:07 -0800 Subject: net: add dev->dev_registered_tracker Convert one dev_hold()/dev_put() pair in register_netdevice() and unregister_netdevice_many() to dev_hold_track() and dev_put_track(). This would allow to detect a rogue dev_put() a bit earlier. Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20220207184107.1401096-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3fb6fb67ed77..5f6e2c0b0c90 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1948,6 +1948,8 @@ enum netdev_ml_priv_type { * @dev_addr_shadow: Copy of @dev_addr to catch direct writes. * @linkwatch_dev_tracker: refcount tracker used by linkwatch. * @watchdog_dev_tracker: refcount tracker used by watchdog. + * @dev_registered_tracker: tracker for reference held while + * registered * * FIXME: cleanup struct net_device such that network protocol info * moves out. @@ -2282,6 +2284,7 @@ struct net_device { u8 dev_addr_shadow[MAX_ADDR_LEN]; netdevice_tracker linkwatch_dev_tracker; netdevice_tracker watchdog_dev_tracker; + netdevice_tracker dev_registered_tracker; }; #define to_net_dev(d) container_of(d, struct net_device, dev) -- cgit v1.2.3 From 21a216a8fc630161e69eaf02cbebdd1816ff1a13 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 7 Feb 2022 20:50:28 -0800 Subject: ipv6/addrconf: allocate a per netns hash table Add a per netns hash table and a dedicated spinlock, first step to get rid of the global inet6_addr_lst[] one. Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- include/net/netns/ipv6.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 30cdfc4e1615..755f12001c8b 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -92,6 +92,10 @@ struct netns_ipv6 { struct sock *tcp_sk; struct sock *igmp_sk; struct sock *mc_autojoin_sk; + + struct hlist_head *inet6_addr_lst; + spinlock_t addrconf_hash_lock; + #ifdef CONFIG_IPV6_MROUTE #ifndef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES struct mr_table *mrt6; -- cgit v1.2.3 From 8805d13ff1b2bef6a7bb8a005d2441763286dd7a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 7 Feb 2022 20:50:29 -0800 Subject: ipv6/addrconf: use one delayed work per netns Next step for using per netns inet6_addr_lst is to have per netns work item to ultimately call addrconf_verify_rtnl() and addrconf_verify() with a new 'struct net*' argument. Everything is still using the global inet6_addr_lst[] table. Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- include/net/netns/ipv6.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 755f12001c8b..d145f1966682 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -95,6 +95,7 @@ struct netns_ipv6 { struct hlist_head *inet6_addr_lst; spinlock_t addrconf_hash_lock; + struct delayed_work addr_chk_work; #ifdef CONFIG_IPV6_MROUTE #ifndef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES -- cgit v1.2.3 From c7d9a6751a5fcebc87feee9b999b40078ed9fb43 Mon Sep 17 00:00:00 2001 From: Luiz Angelo Daros de Luca Date: Tue, 8 Feb 2022 02:32:10 -0300 Subject: net: dsa: typo in comment Signed-off-by: Luiz Angelo Daros de Luca Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20220208053210.14831-1-luizluca@gmail.com Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index ca8c14b547b4..fd1f62a6e0a8 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -1281,7 +1281,7 @@ module_exit(dsa_tag_driver_module_exit) /** * module_dsa_tag_drivers() - Helper macro for registering DSA tag * drivers - * @__ops_array: Array of tag driver strucutres + * @__ops_array: Array of tag driver structures * * Helper macro for DSA tag drivers which do not do anything special * in module init/exit. Each module may only use this macro once, and -- cgit v1.2.3 From 8dd8678e42b5a8bbd3e2c49cda76c743318c56b0 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 5 Feb 2022 13:00:04 +0100 Subject: netfilter: ecache: don't use nf_conn spinlock For updating eache missed value we can use cmpxchg. This also avoids need to disable BH. kernel robot reported build failure on v1 because not all arches support cmpxchg for u16, so extend this to u32. This doesn't increase struct size, existing padding is used. Reported-by: kernel test robot Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_ecache.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h index 16bcff809b18..6c4c490a3e34 100644 --- a/include/net/netfilter/nf_conntrack_ecache.h +++ b/include/net/netfilter/nf_conntrack_ecache.h @@ -21,10 +21,10 @@ enum nf_ct_ecache_state { struct nf_conntrack_ecache { unsigned long cache; /* bitops want long */ - u16 missed; /* missed events */ u16 ctmask; /* bitmask of ct events to be delivered */ u16 expmask; /* bitmask of expect events to be delivered */ enum nf_ct_ecache_state state:8;/* ecache state */ + u32 missed; /* missed events */ u32 portid; /* netlink portid of destroyer */ }; -- cgit v1.2.3 From 7afa38831aee2ca2f41b22747ed8545e1887aaa9 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 8 Feb 2022 12:29:47 +0100 Subject: netfilter: cttimeout: use option structure Instead of two exported functions, export a single option structure. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_timeout.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_timeout.h b/include/net/netfilter/nf_conntrack_timeout.h index db507e4a65bb..3ea94f6f3844 100644 --- a/include/net/netfilter/nf_conntrack_timeout.h +++ b/include/net/netfilter/nf_conntrack_timeout.h @@ -108,8 +108,12 @@ static inline void nf_ct_destroy_timeout(struct nf_conn *ct) #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ #ifdef CONFIG_NF_CONNTRACK_TIMEOUT -extern struct nf_ct_timeout *(*nf_ct_timeout_find_get_hook)(struct net *net, const char *name); -extern void (*nf_ct_timeout_put_hook)(struct nf_ct_timeout *timeout); +struct nf_ct_timeout_hooks { + struct nf_ct_timeout *(*timeout_find_get)(struct net *net, const char *name); + void (*timeout_put)(struct nf_ct_timeout *timeout); +}; + +extern const struct nf_ct_timeout_hooks *nf_ct_timeout_hook; #endif #endif /* _NF_CONNTRACK_TIMEOUT_H */ -- cgit v1.2.3 From 23f68d462984bfda47c7bf663dca347e8e3df549 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 7 Feb 2022 19:25:08 +0100 Subject: netfilter: nft_cmp: optimize comparison for 16-bytes Allow up to 16-byte comparisons with a new cmp fast version. Use two 64-bit words and calculate the mask representing the bits to be compared. Make sure the comparison is 64-bit aligned and avoid out-of-bound memory access on registers. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_core.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index b6fb1fdff9b2..0ea7c55cea4d 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -42,6 +42,14 @@ struct nft_cmp_fast_expr { bool inv; }; +struct nft_cmp16_fast_expr { + struct nft_data data; + struct nft_data mask; + u8 sreg; + u8 len; + bool inv; +}; + struct nft_immediate_expr { struct nft_data data; u8 dreg; @@ -59,6 +67,7 @@ static inline u32 nft_cmp_fast_mask(unsigned int len) } extern const struct nft_expr_ops nft_cmp_fast_ops; +extern const struct nft_expr_ops nft_cmp16_fast_ops; struct nft_payload { enum nft_payload_bases base:8; -- cgit v1.2.3 From 8069b22d656f6e1922352bff90ab78e6fab73779 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Wed, 9 Feb 2022 12:05:55 +0800 Subject: mctp: Add helper for address match checking Currently, we have a couple of paths that check that an EID matches, or the match value is MCTP_ADDR_ANY. Rather than open coding this, add a little helper. Signed-off-by: Jeremy Kerr Signed-off-by: David S. Miller --- include/net/mctp.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/mctp.h b/include/net/mctp.h index 7e35ec79b909..706d329dd8e8 100644 --- a/include/net/mctp.h +++ b/include/net/mctp.h @@ -45,6 +45,11 @@ static inline bool mctp_address_ok(mctp_eid_t eid) return eid >= 8 && eid < 255; } +static inline bool mctp_address_matches(mctp_eid_t match, mctp_eid_t eid) +{ + return match == eid || match == MCTP_ADDR_ANY; +} + static inline struct mctp_hdr *mctp_hdr(struct sk_buff *skb) { return (struct mctp_hdr *)skb_network_header(skb); -- cgit v1.2.3 From 63ed1aab3d40aa61aaa66819bdce9377ac7f40fa Mon Sep 17 00:00:00 2001 From: Matt Johnston Date: Wed, 9 Feb 2022 12:05:57 +0800 Subject: mctp: Add SIOCMCTP{ALLOC,DROP}TAG ioctls for tag control This change adds a couple of new ioctls for mctp sockets: SIOCMCTPALLOCTAG and SIOCMCTPDROPTAG. These ioctls provide facilities for explicit allocation / release of tags, overriding the automatic allocate-on-send/release-on-reply and timeout behaviours. This allows userspace more control over messages that may not fit a simple request/response model. In order to indicate a pre-allocated tag to the sendmsg() syscall, we introduce a new flag to the struct sockaddr_mctp.smctp_tag value: MCTP_TAG_PREALLOC. Additional changes from Jeremy Kerr . Contains a fix that was: Reported-by: kernel test robot Signed-off-by: Matt Johnston Signed-off-by: Jeremy Kerr Signed-off-by: David S. Miller --- include/net/mctp.h | 11 ++++++++++- include/trace/events/mctp.h | 5 ++++- include/uapi/linux/mctp.h | 18 ++++++++++++++++++ 3 files changed, 32 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/mctp.h b/include/net/mctp.h index 706d329dd8e8..e80a4baf8379 100644 --- a/include/net/mctp.h +++ b/include/net/mctp.h @@ -126,7 +126,7 @@ struct mctp_sock { */ struct mctp_sk_key { mctp_eid_t peer_addr; - mctp_eid_t local_addr; + mctp_eid_t local_addr; /* MCTP_ADDR_ANY for local owned tags */ __u8 tag; /* incoming tag match; invert TO for local */ /* we hold a ref to sk when set */ @@ -163,6 +163,12 @@ struct mctp_sk_key { */ unsigned long dev_flow_state; struct mctp_dev *dev; + + /* a tag allocated with SIOCMCTPALLOCTAG ioctl will not expire + * automatically on timeout or response, instead SIOCMCTPDROPTAG + * is used. + */ + bool manual_alloc; }; struct mctp_skb_cb { @@ -239,6 +245,9 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt, struct sk_buff *skb, mctp_eid_t daddr, u8 req_tag); void mctp_key_unref(struct mctp_sk_key *key); +struct mctp_sk_key *mctp_alloc_local_tag(struct mctp_sock *msk, + mctp_eid_t daddr, mctp_eid_t saddr, + bool manual, u8 *tagp); /* routing <--> device interface */ unsigned int mctp_default_net(struct net *net); diff --git a/include/trace/events/mctp.h b/include/trace/events/mctp.h index 175b057c507f..165cf25f77a7 100644 --- a/include/trace/events/mctp.h +++ b/include/trace/events/mctp.h @@ -15,6 +15,7 @@ enum { MCTP_TRACE_KEY_REPLIED, MCTP_TRACE_KEY_INVALIDATED, MCTP_TRACE_KEY_CLOSED, + MCTP_TRACE_KEY_DROPPED, }; #endif /* __TRACE_MCTP_ENUMS */ @@ -22,6 +23,7 @@ TRACE_DEFINE_ENUM(MCTP_TRACE_KEY_TIMEOUT); TRACE_DEFINE_ENUM(MCTP_TRACE_KEY_REPLIED); TRACE_DEFINE_ENUM(MCTP_TRACE_KEY_INVALIDATED); TRACE_DEFINE_ENUM(MCTP_TRACE_KEY_CLOSED); +TRACE_DEFINE_ENUM(MCTP_TRACE_KEY_DROPPED); TRACE_EVENT(mctp_key_acquire, TP_PROTO(const struct mctp_sk_key *key), @@ -66,7 +68,8 @@ TRACE_EVENT(mctp_key_release, { MCTP_TRACE_KEY_TIMEOUT, "timeout" }, { MCTP_TRACE_KEY_REPLIED, "replied" }, { MCTP_TRACE_KEY_INVALIDATED, "invalidated" }, - { MCTP_TRACE_KEY_CLOSED, "closed" }) + { MCTP_TRACE_KEY_CLOSED, "closed" }, + { MCTP_TRACE_KEY_DROPPED, "dropped" }) ) ); diff --git a/include/uapi/linux/mctp.h b/include/uapi/linux/mctp.h index 07b0318716fc..154ab56651f1 100644 --- a/include/uapi/linux/mctp.h +++ b/include/uapi/linux/mctp.h @@ -44,7 +44,25 @@ struct sockaddr_mctp_ext { #define MCTP_TAG_MASK 0x07 #define MCTP_TAG_OWNER 0x08 +#define MCTP_TAG_PREALLOC 0x10 #define MCTP_OPT_ADDR_EXT 1 +#define SIOCMCTPALLOCTAG (SIOCPROTOPRIVATE + 0) +#define SIOCMCTPDROPTAG (SIOCPROTOPRIVATE + 1) + +struct mctp_ioc_tag_ctl { + mctp_eid_t peer_addr; + + /* For SIOCMCTPALLOCTAG: must be passed as zero, kernel will + * populate with the allocated tag value. Returned tag value will + * always have TO and PREALLOC set. + * + * For SIOCMCTPDROPTAG: userspace provides tag value to drop, from + * a prior SIOCMCTPALLOCTAG call (and so must have TO and PREALLOC set). + */ + __u8 tag; + __u16 flags; +}; + #endif /* __UAPI_MCTP_H */ -- cgit v1.2.3 From 9a69e2b385f443f244a7e8b8bcafe5ccfb0866b4 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Wed, 9 Feb 2022 19:43:32 +0100 Subject: bpf: Make remote_port field in struct bpf_sk_lookup 16-bit wide remote_port is another case of a BPF context field documented as a 32-bit value in network byte order for which the BPF context access converter generates a load of a zero-padded 16-bit integer in network byte order. First such case was dst_port in bpf_sock which got addressed in commit 4421a582718a ("bpf: Make dst_port field in struct bpf_sock 16-bit wide"). Loading 4-bytes from the remote_port offset and converting the value with bpf_ntohl() leads to surprising results, as the expected value is shifted by 16 bits. Reduce the confusion by splitting the field in two - a 16-bit field holding a big-endian integer, and a 16-bit zero-padding anonymous field that follows it. Suggested-by: Alexei Starovoitov Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220209184333.654927-2-jakub@cloudflare.com --- include/uapi/linux/bpf.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a7f0ddedac1f..afe3d0d7f5f2 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6453,7 +6453,8 @@ struct bpf_sk_lookup { __u32 protocol; /* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */ __u32 remote_ip4; /* Network byte order */ __u32 remote_ip6[4]; /* Network byte order */ - __u32 remote_port; /* Network byte order */ + __be16 remote_port; /* Network byte order */ + __u16 :16; /* Zero padding */ __u32 local_ip4; /* Network byte order */ __u32 local_ip6[4]; /* Network byte order */ __u32 local_port; /* Host byte order */ -- cgit v1.2.3 From 5cad527d5ffa9a1c4731bb9c97d2ee93f8960d50 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Wed, 9 Feb 2022 14:08:38 +0800 Subject: net: drop_monitor: support drop reason In the commit c504e5c2f964 ("net: skb: introduce kfree_skb_reason()") drop reason is introduced to the tracepoint of kfree_skb. Therefore, drop_monitor is able to report the drop reason to users by netlink. The drop reasons are reported as string to users, which is exactly the same as what we do when reporting it to ftrace. Signed-off-by: Menglong Dong Reviewed-by: Ido Schimmel Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20220209060838.55513-1-imagedong@tencent.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/net_dropmon.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/net_dropmon.h b/include/uapi/linux/net_dropmon.h index 66048cc5d7b3..1bbea8f0681e 100644 --- a/include/uapi/linux/net_dropmon.h +++ b/include/uapi/linux/net_dropmon.h @@ -93,6 +93,7 @@ enum net_dm_attr { NET_DM_ATTR_SW_DROPS, /* flag */ NET_DM_ATTR_HW_DROPS, /* flag */ NET_DM_ATTR_FLOW_ACTION_COOKIE, /* binary */ + NET_DM_ATTR_REASON, /* string */ __NET_DM_ATTR_MAX, NET_DM_ATTR_MAX = __NET_DM_ATTR_MAX - 1 -- cgit v1.2.3 From ede6c39c4f9068cbeb4036448c45fff5393e0432 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Feb 2022 18:59:32 -0800 Subject: net: make net->dev_unreg_count atomic Having to acquire rtnl from netdev_run_todo() for every dismantled device is not desirable when/if rtnl is under stress. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/net_namespace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 374cc7b260fc..c4f5601f6e32 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -63,7 +63,7 @@ struct net { */ spinlock_t rules_mod_lock; - unsigned int dev_unreg_count; + atomic_t dev_unreg_count; unsigned int dev_base_seq; /* protected by rtnl_mutex */ int ifindex; -- cgit v1.2.3 From 48b6190a00425a1bebac9f7ae4b338a1e20f50f3 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Thu, 10 Feb 2022 17:11:36 +0800 Subject: net/smc: Limit SMC visits when handshake workqueue congested This patch intends to provide a mechanism to put constraint on SMC connections visit according to the pressure of SMC handshake process. At present, frequent visits will cause the incoming connections to be backlogged in SMC handshake queue, raise the connections established time. Which is quite unacceptable for those applications who base on short lived connections. There are two ways to implement this mechanism: 1. Put limitation after TCP established. 2. Put limitation before TCP established. In the first way, we need to wait and receive CLC messages that the client will potentially send, and then actively reply with a decline message, in a sense, which is also a sort of SMC handshake, affect the connections established time on its way. In the second way, the only problem is that we need to inject SMC logic into TCP when it is about to reply the incoming SYN, since we already do that, it's seems not a problem anymore. And advantage is obvious, few additional processes are required to complete the constraint. This patch use the second way. After this patch, connections who beyond constraint will not informed any SMC indication, and SMC will not be involved in any of its subsequent processes. Link: https://lore.kernel.org/all/1641301961-59331-1-git-send-email-alibuda@linux.alibaba.com/ Signed-off-by: D. Wythe Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 78b91bb92f0d..1168302b7927 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -394,6 +394,7 @@ struct tcp_sock { bool is_mptcp; #endif #if IS_ENABLED(CONFIG_SMC) + bool (*smc_hs_congested)(const struct sock *sk); bool syn_smc; /* SYN includes SMC */ #endif -- cgit v1.2.3 From a6a6fe27bab48f0d09a64b051e7bde432fcae081 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Thu, 10 Feb 2022 17:11:37 +0800 Subject: net/smc: Dynamic control handshake limitation by socket options This patch aims to add dynamic control for SMC handshake limitation for every smc sockets, in production environment, it is possible for the same applications to handle different service types, and may have different opinion on SMC handshake limitation. This patch try socket options to complete it, since we don't have socket option level for SMC yet, which requires us to implement it at the same time. This patch does the following: - add new socket option level: SOL_SMC. - add new SMC socket option: SMC_LIMIT_HS. - provide getter/setter for SMC socket options. Link: https://lore.kernel.org/all/20f504f961e1a803f85d64229ad84260434203bd.1644323503.git.alibuda@linux.alibaba.com/ Signed-off-by: D. Wythe Signed-off-by: David S. Miller --- include/linux/socket.h | 1 + include/uapi/linux/smc.h | 4 ++++ 2 files changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/socket.h b/include/linux/socket.h index 8ef26d89ef49..6f85f5d957ef 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -366,6 +366,7 @@ struct ucred { #define SOL_XDP 283 #define SOL_MPTCP 284 #define SOL_MCTP 285 +#define SOL_SMC 286 /* IPX options */ #define IPX_TYPE 1 diff --git a/include/uapi/linux/smc.h b/include/uapi/linux/smc.h index 6c2874fd2c00..343e7450c3a3 100644 --- a/include/uapi/linux/smc.h +++ b/include/uapi/linux/smc.h @@ -284,4 +284,8 @@ enum { __SMC_NLA_SEID_TABLE_MAX, SMC_NLA_SEID_TABLE_MAX = __SMC_NLA_SEID_TABLE_MAX - 1 }; + +/* SMC socket options */ +#define SMC_LIMIT_HS 1 /* constraint on smc handshake */ + #endif /* _UAPI_LINUX_SMC_H */ -- cgit v1.2.3 From f9496b7c1b48ce02cd17a3ee88b1e049c689a222 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Thu, 10 Feb 2022 17:11:38 +0800 Subject: net/smc: Add global configure for handshake limitation by netlink Although we can control SMC handshake limitation through socket options, which means that applications who need it must modify their code. It's quite troublesome for many existing applications. This patch modifies the global default value of SMC handshake limitation through netlink, providing a way to put constraint on handshake without modifies any code for applications. Suggested-by: Tony Lu Signed-off-by: D. Wythe Reviewed-by: Tony Lu Signed-off-by: David S. Miller --- include/net/netns/smc.h | 2 ++ include/uapi/linux/smc.h | 11 +++++++++++ 2 files changed, 13 insertions(+) (limited to 'include') diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index ea8a9cf2619b..47b166684fd8 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -12,5 +12,7 @@ struct netns_smc { /* protect fback_rsn */ struct mutex mutex_fback_rsn; struct smc_stats_rsn *fback_rsn; + + bool limit_smc_hs; /* constraint on handshake */ }; #endif diff --git a/include/uapi/linux/smc.h b/include/uapi/linux/smc.h index 343e7450c3a3..693f549f6966 100644 --- a/include/uapi/linux/smc.h +++ b/include/uapi/linux/smc.h @@ -59,6 +59,9 @@ enum { SMC_NETLINK_DUMP_SEID, SMC_NETLINK_ENABLE_SEID, SMC_NETLINK_DISABLE_SEID, + SMC_NETLINK_DUMP_HS_LIMITATION, + SMC_NETLINK_ENABLE_HS_LIMITATION, + SMC_NETLINK_DISABLE_HS_LIMITATION, }; /* SMC_GENL_FAMILY top level attributes */ @@ -285,6 +288,14 @@ enum { SMC_NLA_SEID_TABLE_MAX = __SMC_NLA_SEID_TABLE_MAX - 1 }; +/* SMC_NETLINK_HS_LIMITATION attributes */ +enum { + SMC_NLA_HS_LIMITATION_UNSPEC, + SMC_NLA_HS_LIMITATION_ENABLED, /* u8 */ + __SMC_NLA_HS_LIMITATION_MAX, + SMC_NLA_HS_LIMITATION_MAX = __SMC_NLA_HS_LIMITATION_MAX - 1 +}; + /* SMC socket options */ #define SMC_LIMIT_HS 1 /* constraint on smc handshake */ -- cgit v1.2.3 From 2d4feb2c1ba728fe53d6f80705b2d7e5c5b666a5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Feb 2022 13:42:28 -0800 Subject: ipv6: get rid of net->ipv6.rt6_stats->fib_rt_uncache This counter has never been visible, there is little point trying to maintain it. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 40ae8f1b18e5..32c83bb68879 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -367,9 +367,8 @@ struct rt6_statistics { __u32 fib_rt_cache; /* cached rt entries in exception table */ __u32 fib_discarded_routes; /* total number of routes delete */ - /* The following stats are not protected by any lock */ + /* The following stat is not protected by any lock */ atomic_t fib_rt_alloc; /* total number of routes alloced */ - atomic_t fib_rt_uncache; /* rt entries in uncached list */ }; #define RTN_TL_ROOT 0x0001 -- cgit v1.2.3 From 2618a0dae09ef37728dab89ff60418cbe25ae6bd Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 12 Feb 2022 09:14:49 -0800 Subject: etherdevice: Adjust ether_addr* prototypes to silence -Wstringop-overead MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With GCC 12, -Wstringop-overread was warning about an implicit cast from char[6] to char[8]. However, the extra 2 bytes are always thrown away, alignment doesn't matter, and the risk of hitting the edge of unallocated memory has been accepted, so this prototype can just be converted to a regular char *. Silences: net/core/dev.c: In function ‘bpf_prog_run_generic_xdp’: net/core/dev.c:4618:21: warning: ‘ether_addr_equal_64bits’ reading 8 bytes from a region of size 6 [-Wstringop-overread] 4618 | orig_host = ether_addr_equal_64bits(eth->h_dest, > skb->dev->dev_addr); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ net/core/dev.c:4618:21: note: referencing argument 1 of type ‘const u8[8]’ {aka ‘const unsigned char[8]’} net/core/dev.c:4618:21: note: referencing argument 2 of type ‘const u8[8]’ {aka ‘const unsigned char[8]’} In file included from net/core/dev.c:91: include/linux/etherdevice.h:375:20: note: in a call to function ‘ether_addr_equal_64bits’ 375 | static inline bool ether_addr_equal_64bits(const u8 addr1[6+2], | ^~~~~~~~~~~~~~~~~~~~~~~ Reported-by: Marc Kleine-Budde Tested-by: Marc Kleine-Budde Link: https://lore.kernel.org/netdev/20220212090811.uuzk6d76agw2vv73@pengutronix.de Cc: Jakub Kicinski Cc: "David S. Miller" Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: David S. Miller --- include/linux/etherdevice.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 2ad71cc90b37..92b10e67d5f8 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -134,7 +134,7 @@ static inline bool is_multicast_ether_addr(const u8 *addr) #endif } -static inline bool is_multicast_ether_addr_64bits(const u8 addr[6+2]) +static inline bool is_multicast_ether_addr_64bits(const u8 *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 #ifdef __BIG_ENDIAN @@ -372,8 +372,7 @@ static inline bool ether_addr_equal(const u8 *addr1, const u8 *addr2) * Please note that alignment of addr1 & addr2 are only guaranteed to be 16 bits. */ -static inline bool ether_addr_equal_64bits(const u8 addr1[6+2], - const u8 addr2[6+2]) +static inline bool ether_addr_equal_64bits(const u8 *addr1, const u8 *addr2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 u64 fold = (*(const u64 *)addr1) ^ (*(const u64 *)addr2); -- cgit v1.2.3 From 65c53595bc2a14e77b455c8a12fc2631cbe43868 Mon Sep 17 00:00:00 2001 From: Colin Foster Date: Sun, 13 Feb 2022 11:12:52 -0800 Subject: net: ocelot: align macros for consistency In the ocelot.h file, several read / write macros were split across multiple lines, while others weren't. Split all macros that exceed the 80 character column width and match the style of the rest of the file. Signed-off-by: Colin Foster Reviewed-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/soc/mscc/ocelot.h | 44 +++++++++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 5c3a3597f1d2..1dbf8aff861c 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -742,25 +742,39 @@ struct ocelot_policer { u32 burst; /* bytes */ }; -#define ocelot_read_ix(ocelot, reg, gi, ri) __ocelot_read_ix(ocelot, reg, reg##_GSZ * (gi) + reg##_RSZ * (ri)) -#define ocelot_read_gix(ocelot, reg, gi) __ocelot_read_ix(ocelot, reg, reg##_GSZ * (gi)) -#define ocelot_read_rix(ocelot, reg, ri) __ocelot_read_ix(ocelot, reg, reg##_RSZ * (ri)) -#define ocelot_read(ocelot, reg) __ocelot_read_ix(ocelot, reg, 0) - -#define ocelot_write_ix(ocelot, val, reg, gi, ri) __ocelot_write_ix(ocelot, val, reg, reg##_GSZ * (gi) + reg##_RSZ * (ri)) -#define ocelot_write_gix(ocelot, val, reg, gi) __ocelot_write_ix(ocelot, val, reg, reg##_GSZ * (gi)) -#define ocelot_write_rix(ocelot, val, reg, ri) __ocelot_write_ix(ocelot, val, reg, reg##_RSZ * (ri)) +#define ocelot_read_ix(ocelot, reg, gi, ri) \ + __ocelot_read_ix(ocelot, reg, reg##_GSZ * (gi) + reg##_RSZ * (ri)) +#define ocelot_read_gix(ocelot, reg, gi) \ + __ocelot_read_ix(ocelot, reg, reg##_GSZ * (gi)) +#define ocelot_read_rix(ocelot, reg, ri) \ + __ocelot_read_ix(ocelot, reg, reg##_RSZ * (ri)) +#define ocelot_read(ocelot, reg) \ + __ocelot_read_ix(ocelot, reg, 0) + +#define ocelot_write_ix(ocelot, val, reg, gi, ri) \ + __ocelot_write_ix(ocelot, val, reg, reg##_GSZ * (gi) + reg##_RSZ * (ri)) +#define ocelot_write_gix(ocelot, val, reg, gi) \ + __ocelot_write_ix(ocelot, val, reg, reg##_GSZ * (gi)) +#define ocelot_write_rix(ocelot, val, reg, ri) \ + __ocelot_write_ix(ocelot, val, reg, reg##_RSZ * (ri)) #define ocelot_write(ocelot, val, reg) __ocelot_write_ix(ocelot, val, reg, 0) -#define ocelot_rmw_ix(ocelot, val, m, reg, gi, ri) __ocelot_rmw_ix(ocelot, val, m, reg, reg##_GSZ * (gi) + reg##_RSZ * (ri)) -#define ocelot_rmw_gix(ocelot, val, m, reg, gi) __ocelot_rmw_ix(ocelot, val, m, reg, reg##_GSZ * (gi)) -#define ocelot_rmw_rix(ocelot, val, m, reg, ri) __ocelot_rmw_ix(ocelot, val, m, reg, reg##_RSZ * (ri)) +#define ocelot_rmw_ix(ocelot, val, m, reg, gi, ri) \ + __ocelot_rmw_ix(ocelot, val, m, reg, reg##_GSZ * (gi) + reg##_RSZ * (ri)) +#define ocelot_rmw_gix(ocelot, val, m, reg, gi) \ + __ocelot_rmw_ix(ocelot, val, m, reg, reg##_GSZ * (gi)) +#define ocelot_rmw_rix(ocelot, val, m, reg, ri) \ + __ocelot_rmw_ix(ocelot, val, m, reg, reg##_RSZ * (ri)) #define ocelot_rmw(ocelot, val, m, reg) __ocelot_rmw_ix(ocelot, val, m, reg, 0) -#define ocelot_field_write(ocelot, reg, val) regmap_field_write((ocelot)->regfields[(reg)], (val)) -#define ocelot_field_read(ocelot, reg, val) regmap_field_read((ocelot)->regfields[(reg)], (val)) -#define ocelot_fields_write(ocelot, id, reg, val) regmap_fields_write((ocelot)->regfields[(reg)], (id), (val)) -#define ocelot_fields_read(ocelot, id, reg, val) regmap_fields_read((ocelot)->regfields[(reg)], (id), (val)) +#define ocelot_field_write(ocelot, reg, val) \ + regmap_field_write((ocelot)->regfields[(reg)], (val)) +#define ocelot_field_read(ocelot, reg, val) \ + regmap_field_read((ocelot)->regfields[(reg)], (val)) +#define ocelot_fields_write(ocelot, id, reg, val) \ + regmap_fields_write((ocelot)->regfields[(reg)], (id), (val)) +#define ocelot_fields_read(ocelot, id, reg, val) \ + regmap_fields_read((ocelot)->regfields[(reg)], (id), (val)) #define ocelot_target_read_ix(ocelot, target, reg, gi, ri) \ __ocelot_target_read_ix(ocelot, target, reg, reg##_GSZ * (gi) + reg##_RSZ * (ri)) -- cgit v1.2.3 From 40f3a5c81555880a437dfd3301826074dff18138 Mon Sep 17 00:00:00 2001 From: Colin Foster Date: Sun, 13 Feb 2022 11:12:53 -0800 Subject: net: mscc: ocelot: add ability to perform bulk reads Regmap supports bulk register reads. Ocelot does not. This patch adds support for Ocelot to invoke bulk regmap reads. That will allow any driver that performs consecutive reads over memory regions to optimize that access. Signed-off-by: Colin Foster Reviewed-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/soc/mscc/ocelot.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 1dbf8aff861c..6687ba35eaf3 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -742,6 +742,9 @@ struct ocelot_policer { u32 burst; /* bytes */ }; +#define ocelot_bulk_read_rix(ocelot, reg, ri, buf, count) \ + __ocelot_bulk_read_ix(ocelot, reg, reg##_RSZ * (ri), buf, count) + #define ocelot_read_ix(ocelot, reg, gi, ri) \ __ocelot_read_ix(ocelot, reg, reg##_GSZ * (gi) + reg##_RSZ * (ri)) #define ocelot_read_gix(ocelot, reg, gi) \ @@ -798,6 +801,8 @@ struct ocelot_policer { u32 ocelot_port_readl(struct ocelot_port *port, u32 reg); void ocelot_port_writel(struct ocelot_port *port, u32 val, u32 reg); void ocelot_port_rmwl(struct ocelot_port *port, u32 val, u32 mask, u32 reg); +int __ocelot_bulk_read_ix(struct ocelot *ocelot, u32 reg, u32 offset, void *buf, + int count); u32 __ocelot_read_ix(struct ocelot *ocelot, u32 reg, u32 offset); void __ocelot_write_ix(struct ocelot *ocelot, u32 val, u32 reg, u32 offset); void __ocelot_rmw_ix(struct ocelot *ocelot, u32 val, u32 mask, u32 reg, -- cgit v1.2.3 From d87b1c08f38a2ce40cf559df36c107a2e6c16a8f Mon Sep 17 00:00:00 2001 From: Colin Foster Date: Sun, 13 Feb 2022 11:12:54 -0800 Subject: net: mscc: ocelot: use bulk reads for stats Create and utilize bulk regmap reads instead of single access for gathering stats. The background reading of statistics happens frequently, and over a few contiguous memory regions. High speed PCIe buses and MMIO access will probably see negligible performance increase. Lower speed buses like SPI and I2C could see significant performance increase, since the bus configuration and register access times account for a large percentage of data transfer time. Signed-off-by: Colin Foster Reviewed-by: Vladimir Oltean Tested-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/soc/mscc/ocelot.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 6687ba35eaf3..cacb103e4bad 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -540,6 +540,13 @@ struct ocelot_stat_layout { char name[ETH_GSTRING_LEN]; }; +struct ocelot_stats_region { + struct list_head node; + u32 offset; + int count; + u32 *buf; +}; + enum ocelot_tag_prefix { OCELOT_TAG_PREFIX_DISABLED = 0, OCELOT_TAG_PREFIX_NONE, @@ -671,6 +678,7 @@ struct ocelot { struct regmap_field *regfields[REGFIELD_MAX]; const u32 *const *map; const struct ocelot_stat_layout *stats_layout; + struct list_head stats_regions; unsigned int num_stats; u32 pool_size[OCELOT_SB_NUM][OCELOT_SB_POOL_NUM]; -- cgit v1.2.3 From baebdf48c360080710f80699eea3affbb13d6c65 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Sat, 12 Feb 2022 00:38:38 +0100 Subject: net: dev: Makes sure netif_rx() can be invoked in any context. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Dave suggested a while ago (eleven years by now) "Let's make netif_rx() work in all contexts and get rid of netif_rx_ni()". Eric agreed and pointed out that modern devices should use netif_receive_skb() to avoid the overhead. In the meantime someone added another variant, netif_rx_any_context(), which behaves as suggested. netif_rx() must be invoked with disabled bottom halves to ensure that pending softirqs, which were raised within the function, are handled. netif_rx_ni() can be invoked only from process context (bottom halves must be enabled) because the function handles pending softirqs without checking if bottom halves were disabled or not. netif_rx_any_context() invokes on the former functions by checking in_interrupts(). netif_rx() could be taught to handle both cases (disabled and enabled bottom halves) by simply disabling bottom halves while invoking netif_rx_internal(). The local_bh_enable() invocation will then invoke pending softirqs only if the BH-disable counter drops to zero. Eric is concerned about the overhead of BH-disable+enable especially in regard to the loopback driver. As critical as this driver is, it will receive a shortcut to avoid the additional overhead which is not needed. Add a local_bh_disable() section in netif_rx() to ensure softirqs are handled if needed. Provide __netif_rx() which does not disable BH and has a lockdep assert to ensure that interrupts are disabled. Use this shortcut in the loopback driver and in drivers/net/*.c. Make netif_rx_ni() and netif_rx_any_context() invoke netif_rx() so they can be removed once they are no more users left. Link: https://lkml.kernel.org/r/20100415.020246.218622820.davem@davemloft.net Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Eric Dumazet Reviewed-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- include/linux/netdevice.h | 14 ++++++++++++-- include/trace/events/net.h | 14 -------------- 2 files changed, 12 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5f6e2c0b0c90..e99db8341da5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3672,8 +3672,18 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog); int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb); int netif_rx(struct sk_buff *skb); -int netif_rx_ni(struct sk_buff *skb); -int netif_rx_any_context(struct sk_buff *skb); +int __netif_rx(struct sk_buff *skb); + +static inline int netif_rx_ni(struct sk_buff *skb) +{ + return netif_rx(skb); +} + +static inline int netif_rx_any_context(struct sk_buff *skb) +{ + return netif_rx(skb); +} + int netif_receive_skb(struct sk_buff *skb); int netif_receive_skb_core(struct sk_buff *skb); void netif_receive_skb_list_internal(struct list_head *head); diff --git a/include/trace/events/net.h b/include/trace/events/net.h index 78c448c6ab4c..032b431b987b 100644 --- a/include/trace/events/net.h +++ b/include/trace/events/net.h @@ -260,13 +260,6 @@ DEFINE_EVENT(net_dev_rx_verbose_template, netif_rx_entry, TP_ARGS(skb) ); -DEFINE_EVENT(net_dev_rx_verbose_template, netif_rx_ni_entry, - - TP_PROTO(const struct sk_buff *skb), - - TP_ARGS(skb) -); - DECLARE_EVENT_CLASS(net_dev_rx_exit_template, TP_PROTO(int ret), @@ -312,13 +305,6 @@ DEFINE_EVENT(net_dev_rx_exit_template, netif_rx_exit, TP_ARGS(ret) ); -DEFINE_EVENT(net_dev_rx_exit_template, netif_rx_ni_exit, - - TP_PROTO(int ret), - - TP_ARGS(ret) -); - DEFINE_EVENT(net_dev_rx_exit_template, netif_receive_skb_list_exit, TP_PROTO(int ret), -- cgit v1.2.3 From 76f05d88623e559eb9fc41db9fb911e67fab0e7a Mon Sep 17 00:00:00 2001 From: M Chetan Kumar Date: Mon, 14 Feb 2022 12:46:52 +0530 Subject: net: wwan: debugfs obtained dev reference not dropped WWAN driver call's wwan_get_debugfs_dir() to obtain WWAN debugfs dir entry. As part of this procedure it returns a reference to a found device. Since there is no debugfs interface available at WWAN subsystem, it is not possible to drop dev reference post debugfs use. This leads to side effects like post wwan driver load and reload the wwan instance gets increment from wwanX to wwanX+1. A new debugfs interface is added in wwan subsystem so that wwan driver can drop the obtained dev reference post debugfs use. void wwan_put_debugfs_dir(struct dentry *dir) Signed-off-by: M Chetan Kumar Signed-off-by: David S. Miller --- include/linux/wwan.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/wwan.h b/include/linux/wwan.h index afb3334ec8c5..5ce2acf444fb 100644 --- a/include/linux/wwan.h +++ b/include/linux/wwan.h @@ -174,11 +174,13 @@ void wwan_unregister_ops(struct device *parent); #ifdef CONFIG_WWAN_DEBUGFS struct dentry *wwan_get_debugfs_dir(struct device *parent); +void wwan_put_debugfs_dir(struct dentry *dir); #else static inline struct dentry *wwan_get_debugfs_dir(struct device *parent) { return ERR_PTR(-ENODEV); } +static inline void wwan_put_debugfs_dir(struct dentry *dir) {} #endif #endif /* __WWAN_H */ -- cgit v1.2.3 From 8d23a54f5beea59b560855fb571e5d73d783e0b4 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 15 Feb 2022 19:02:12 +0200 Subject: net: bridge: switchdev: differentiate new VLANs from changed ones br_switchdev_port_vlan_add() currently emits a SWITCHDEV_PORT_OBJ_ADD event with a SWITCHDEV_OBJ_ID_PORT_VLAN for 2 distinct cases: - a struct net_bridge_vlan got created - an existing struct net_bridge_vlan was modified This makes it impossible for switchdev drivers to properly balance PORT_OBJ_ADD with PORT_OBJ_DEL events, so if we want to allow that to happen, we must provide a way for drivers to distinguish between a VLAN with changed flags and a new one. Annotate struct switchdev_obj_port_vlan with a "bool changed" that distinguishes the 2 cases above. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/net/switchdev.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index d353793dfeb5..92cc763991e9 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -81,6 +81,13 @@ struct switchdev_obj_port_vlan { struct switchdev_obj obj; u16 flags; u16 vid; + /* If set, the notifier signifies a change of one of the following + * flags for a VLAN that already exists: + * - BRIDGE_VLAN_INFO_PVID + * - BRIDGE_VLAN_INFO_UNTAGGED + * Entries with BRIDGE_VLAN_INFO_BRENTRY unset are not notified at all. + */ + bool changed; }; #define SWITCHDEV_OBJ_PORT_VLAN(OBJ) \ -- cgit v1.2.3 From c4076cdd21f8d68a96f1e7124bd8915c7e31a474 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 15 Feb 2022 19:02:16 +0200 Subject: net: switchdev: introduce switchdev_handle_port_obj_{add,del} for foreign interfaces The switchdev_handle_port_obj_add() helper is good for replicating a port object on the lower interfaces of @dev, if that object was emitted on a bridge, or on a bridge port that is a LAG. However, drivers that use this helper limit themselves to a box from which they can no longer intercept port objects notified on neighbor ports ("foreign interfaces"). One such driver is DSA, where software bridging with foreign interfaces such as standalone NICs or Wi-Fi APs is an important use case. There, a VLAN installed on a neighbor bridge port roughly corresponds to a forwarding VLAN installed on the DSA switch's CPU port. To support this use case while also making use of the benefits of the switchdev_handle_* replication helper for port objects, introduce a new variant of these functions that crawls through the neighbor ports of @dev, in search of potentially compatible switchdev ports that are interested in the event. The strategy is identical to switchdev_handle_fdb_event_to_device(): if @dev wasn't a switchdev interface, then go one step upper, and recursively call this function on the bridge that this port belongs to. At the next recursion step, __switchdev_handle_port_obj_add() will iterate through the bridge's lower interfaces. Among those, some will be switchdev interfaces, and one will be the original @dev that we came from. To prevent infinite recursion, we must suppress reentry into the original @dev, and just call the @add_cb for the switchdev_interfaces. It looks like this: br0 / | \ / | \ / | \ swp0 swp1 eth0 1. __switchdev_handle_port_obj_add(eth0) -> check_cb(eth0) returns false -> eth0 has no lower interfaces -> eth0's bridge is br0 -> switchdev_lower_dev_find(br0, check_cb, foreign_dev_check_cb)) finds br0 2. __switchdev_handle_port_obj_add(br0) -> check_cb(br0) returns false -> netdev_for_each_lower_dev -> check_cb(swp0) returns true, so we don't skip this interface 3. __switchdev_handle_port_obj_add(swp0) -> check_cb(swp0) returns true, so we call add_cb(swp0) (back to netdev_for_each_lower_dev from 2) -> check_cb(swp1) returns true, so we don't skip this interface 4. __switchdev_handle_port_obj_add(swp1) -> check_cb(swp1) returns true, so we call add_cb(swp1) (back to netdev_for_each_lower_dev from 2) -> check_cb(eth0) returns false, so we skip this interface to avoid infinite recursion Note: eth0 could have been a LAG, and we don't want to suppress the recursion through its lowers if those exist, so when check_cb() returns false, we still call switchdev_lower_dev_find() to estimate whether there's anything worth a recursion beneath that LAG. Using check_cb() and foreign_dev_check_cb(), switchdev_lower_dev_find() not only figures out whether the lowers of the LAG are switchdev, but also whether they actively offload the LAG or not (whether the LAG is "foreign" to the switchdev interface or not). The port_obj_info->orig_dev is preserved across recursive calls, so switchdev drivers still know on which device was this notification originally emitted. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/net/switchdev.h | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'include') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 92cc763991e9..c32e1c8f79ec 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -324,11 +324,26 @@ int switchdev_handle_port_obj_add(struct net_device *dev, int (*add_cb)(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj, struct netlink_ext_ack *extack)); +int switchdev_handle_port_obj_add_foreign(struct net_device *dev, + struct switchdev_notifier_port_obj_info *port_obj_info, + bool (*check_cb)(const struct net_device *dev), + bool (*foreign_dev_check_cb)(const struct net_device *dev, + const struct net_device *foreign_dev), + int (*add_cb)(struct net_device *dev, const void *ctx, + const struct switchdev_obj *obj, + struct netlink_ext_ack *extack)); int switchdev_handle_port_obj_del(struct net_device *dev, struct switchdev_notifier_port_obj_info *port_obj_info, bool (*check_cb)(const struct net_device *dev), int (*del_cb)(struct net_device *dev, const void *ctx, const struct switchdev_obj *obj)); +int switchdev_handle_port_obj_del_foreign(struct net_device *dev, + struct switchdev_notifier_port_obj_info *port_obj_info, + bool (*check_cb)(const struct net_device *dev), + bool (*foreign_dev_check_cb)(const struct net_device *dev, + const struct net_device *foreign_dev), + int (*del_cb)(struct net_device *dev, const void *ctx, + const struct switchdev_obj *obj)); int switchdev_handle_port_attr_set(struct net_device *dev, struct switchdev_notifier_port_attr_info *port_attr_info, @@ -447,6 +462,18 @@ switchdev_handle_port_obj_add(struct net_device *dev, return 0; } +static inline int switchdev_handle_port_obj_add_foreign(struct net_device *dev, + struct switchdev_notifier_port_obj_info *port_obj_info, + bool (*check_cb)(const struct net_device *dev), + bool (*foreign_dev_check_cb)(const struct net_device *dev, + const struct net_device *foreign_dev), + int (*add_cb)(struct net_device *dev, const void *ctx, + const struct switchdev_obj *obj, + struct netlink_ext_ack *extack)) +{ + return 0; +} + static inline int switchdev_handle_port_obj_del(struct net_device *dev, struct switchdev_notifier_port_obj_info *port_obj_info, @@ -457,6 +484,18 @@ switchdev_handle_port_obj_del(struct net_device *dev, return 0; } +static inline int +switchdev_handle_port_obj_del_foreign(struct net_device *dev, + struct switchdev_notifier_port_obj_info *port_obj_info, + bool (*check_cb)(const struct net_device *dev), + bool (*foreign_dev_check_cb)(const struct net_device *dev, + const struct net_device *foreign_dev), + int (*del_cb)(struct net_device *dev, const void *ctx, + const struct switchdev_obj *obj)) +{ + return 0; +} + static inline int switchdev_handle_port_attr_set(struct net_device *dev, struct switchdev_notifier_port_attr_info *port_attr_info, -- cgit v1.2.3 From 134ef2388e7f3271d13223decdb5e45b0f84489f Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 15 Feb 2022 19:02:17 +0200 Subject: net: dsa: add explicit support for host bridge VLANs Currently, DSA programs VLANs on shared (DSA and CPU) ports each time it does so on user ports. This is good for basic functionality but has several limitations: - the VLAN group which must reach the CPU may be radically different from the VLAN group that must be autonomously forwarded by the switch. In other words, the admin may want to isolate noisy stations and avoid traffic from them going to the control processor of the switch, where it would just waste useless cycles. The bridge already supports independent control of VLAN groups on bridge ports and on the bridge itself, and when VLAN-aware, it will drop packets in software anyway if their VID isn't added as a 'self' entry towards the bridge device. - Replaying host FDB entries may depend, for some drivers like mv88e6xxx, on replaying the host VLANs as well. The 2 VLAN groups are approximately the same in most regular cases, but there are corner cases when timing matters, and DSA's approximation of replicating VLANs on shared ports simply does not work. - If a user makes the bridge (implicitly the CPU port) join a VLAN by accident, there is no way for the CPU port to isolate itself from that noisy VLAN except by rebooting the system. This is because for each VLAN added on a user port, DSA will add it on shared ports too, but for each VLAN deletion on a user port, it will remain installed on shared ports, since DSA has no good indication of whether the VLAN is still in use or not. Now that the bridge driver emits well-balanced SWITCHDEV_OBJ_ID_PORT_VLAN addition and removal events, DSA has a simple and straightforward task of separating the bridge port VLANs (these have an orig_dev which is a DSA slave interface, or a LAG interface) from the host VLANs (these have an orig_dev which is a bridge interface), and to keep a simple reference count of each VID on each shared port. Forwarding VLANs must be installed on the bridge ports and on all DSA ports interconnecting them. We don't have a good view of the exact topology, so we simply install forwarding VLANs on all DSA ports, which is what has been done until now. Host VLANs must be installed primarily on the dedicated CPU port of each bridge port. More subtly, they must also be installed on upstream-facing and downstream-facing DSA ports that are connecting the bridge ports and the CPU. This ensures that the mv88e6xxx's problem (VID of host FDB entry may be absent from VTU) is still addressed even if that switch is in a cross-chip setup, and it has no local CPU port. Therefore: - user ports contain only bridge port (forwarding) VLANs, and no refcounting is necessary - DSA ports contain both forwarding and host VLANs. Refcounting is necessary among these 2 types. - CPU ports contain only host VLANs. Refcounting is also necessary. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/net/dsa.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index fd1f62a6e0a8..85cb9aed4c51 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -312,6 +312,10 @@ struct dsa_port { struct mutex addr_lists_lock; struct list_head fdbs; struct list_head mdbs; + + /* List of VLANs that CPU and DSA ports are members of. */ + struct mutex vlans_lock; + struct list_head vlans; }; /* TODO: ideally DSA ports would have a single dp->link_dp member, @@ -332,6 +336,12 @@ struct dsa_mac_addr { struct list_head list; }; +struct dsa_vlan { + u16 vid; + refcount_t refcount; + struct list_head list; +}; + struct dsa_switch { struct device *dev; -- cgit v1.2.3 From 08bc13d8efe30258850ecdc5d4f38c0bc07689c0 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 10 Feb 2022 20:12:43 +0100 Subject: ieee80211: use tab to indent struct ieee80211_neighbor_ap_info Somehow spaces were used here, use tab instead. Link: https://lore.kernel.org/r/20220210201242.da8fa2e5ae8d.Ia452db01876e52e815f6337fef437049df0d8bd9@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 60ee7b3f58e7..a2940a853783 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -4005,10 +4005,10 @@ static inline bool for_each_element_completed(const struct element *element, #define IEEE80211_RNR_TBTT_PARAMS_COLOC_AP 0x40 struct ieee80211_neighbor_ap_info { - u8 tbtt_info_hdr; - u8 tbtt_info_len; - u8 op_class; - u8 channel; + u8 tbtt_info_hdr; + u8 tbtt_info_len; + u8 op_class; + u8 channel; } __packed; enum ieee80211_range_params_max_total_ltf { -- cgit v1.2.3 From d61f4274daa41565b5f9ce589b4ba1239781c139 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 14 Feb 2022 17:29:21 +0100 Subject: ieee80211: add helper to check HE capability element size This element has a very dynamic structure, create a small helper function to validate its size. We're currently checking it in mac80211 in a conversion function, but that's actually slightly buggy. Link: https://lore.kernel.org/r/20220214172920.750bee9eaf37.Ie18359bd38143b7dc949078f10752413e6d36854@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index a2940a853783..dfc84680af82 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -9,7 +9,7 @@ * Copyright (c) 2006, Michael Wu * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH * Copyright (c) 2016 - 2017 Intel Deutschland GmbH - * Copyright (c) 2018 - 2021 Intel Corporation + * Copyright (c) 2018 - 2022 Intel Corporation */ #ifndef LINUX_IEEE80211_H @@ -2338,6 +2338,29 @@ ieee80211_he_ppe_size(u8 ppe_thres_hdr, const u8 *phy_cap_info) return n; } +static inline bool ieee80211_he_capa_size_ok(const u8 *data, u8 len) +{ + const struct ieee80211_he_cap_elem *he_cap_ie_elem = (const void *)data; + u8 needed = sizeof(*he_cap_ie_elem); + + if (len < needed) + return false; + + needed += ieee80211_he_mcs_nss_size(he_cap_ie_elem); + if (len < needed) + return false; + + if (he_cap_ie_elem->phy_cap_info[6] & + IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT) { + if (len < needed + 1) + return false; + needed += ieee80211_he_ppe_size(data[needed], + he_cap_ie_elem->phy_cap_info); + } + + return len >= needed; +} + /* HE Operation defines */ #define IEEE80211_HE_OPERATION_DFLT_PE_DURATION_MASK 0x00000007 #define IEEE80211_HE_OPERATION_TWT_REQUIRED 0x00000008 -- cgit v1.2.3 From cbc1ca0a9d0a54cb8aa7c54c16e71599551e3f74 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Mon, 14 Feb 2022 17:29:51 +0100 Subject: ieee80211: Add EHT (802.11be) definitions Based on Draft P802.11be_D1.4. Signed-off-by: Ilan Peer Link: https://lore.kernel.org/r/20220214173004.928e23cacb2b.Id30a3ef2844b296efbd5486fe1da9ca36a95c5cf@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 299 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 299 insertions(+) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index dfc84680af82..7204bc28ca31 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -1925,6 +1926,111 @@ struct ieee80211_mu_edca_param_set { struct ieee80211_he_mu_edca_param_ac_rec ac_vo; } __packed; +#define IEEE80211_EHT_MCS_NSS_RX 0x0f +#define IEEE80211_EHT_MCS_NSS_TX 0xf0 + +/** + * struct ieee80211_eht_mcs_nss_supp_20mhz_only - EHT 20MHz only station max + * supported NSS for per MCS. + * + * For each field below, bits 0 - 3 indicate the maximal number of spatial + * streams for Rx, and bits 4 - 7 indicate the maximal number of spatial streams + * for Tx. + * + * @rx_tx_mcs7_max_nss: indicates the maximum number of spatial streams + * supported for reception and the maximum number of spatial streams + * supported for transmission for MCS 0 - 7. + * @rx_tx_mcs9_max_nss: indicates the maximum number of spatial streams + * supported for reception and the maximum number of spatial streams + * supported for transmission for MCS 8 - 9. + * @rx_tx_mcs11_max_nss: indicates the maximum number of spatial streams + * supported for reception and the maximum number of spatial streams + * supported for transmission for MCS 10 - 11. + * @rx_tx_mcs13_max_nss: indicates the maximum number of spatial streams + * supported for reception and the maximum number of spatial streams + * supported for transmission for MCS 12 - 13. + */ +struct ieee80211_eht_mcs_nss_supp_20mhz_only { + u8 rx_tx_mcs7_max_nss; + u8 rx_tx_mcs9_max_nss; + u8 rx_tx_mcs11_max_nss; + u8 rx_tx_mcs13_max_nss; +}; + +/** + * struct ieee80211_eht_mcs_nss_supp_bw - EHT max supported NSS per MCS (except + * 20MHz only stations). + * + * For each field below, bits 0 - 3 indicate the maximal number of spatial + * streams for Rx, and bits 4 - 7 indicate the maximal number of spatial streams + * for Tx. + * + * @rx_tx_mcs9_max_nss: indicates the maximum number of spatial streams + * supported for reception and the maximum number of spatial streams + * supported for transmission for MCS 0 - 9. + * @rx_tx_mcs11_max_nss: indicates the maximum number of spatial streams + * supported for reception and the maximum number of spatial streams + * supported for transmission for MCS 10 - 11. + * @rx_tx_mcs13_max_nss: indicates the maximum number of spatial streams + * supported for reception and the maximum number of spatial streams + * supported for transmission for MCS 12 - 13. + */ +struct ieee80211_eht_mcs_nss_supp_bw { + u8 rx_tx_mcs9_max_nss; + u8 rx_tx_mcs11_max_nss; + u8 rx_tx_mcs13_max_nss; +}; + +/** + * struct ieee80211_eht_cap_elem_fixed - EHT capabilities fixed data + * + * This structure is the "EHT Capabilities element" fixed fields as + * described in P802.11be_D1.4 section 9.4.2.313. + * + * @mac_cap_info: MAC capabilities, see IEEE80211_EHT_MAC_CAP* + * @phy_cap_info: PHY capabilities, see IEEE80211_EHT_PHY_CAP* + */ +struct ieee80211_eht_cap_elem_fixed { + u8 mac_cap_info[2]; + u8 phy_cap_info[9]; +} __packed; + +/** + * struct ieee80211_eht_cap_elem - EHT capabilities element + * @fixed: fixed parts, see &ieee80211_eht_cap_elem_fixed + * @optional: optional parts + */ +struct ieee80211_eht_cap_elem { + struct ieee80211_eht_cap_elem_fixed fixed; + + /* + * Followed by: + * Supported EHT-MCS And NSS Set field: 4, 3, 6 or 9 octets. + * EHT PPE Thresholds field: variable length. + */ + u8 optional[]; +} __packed; + +/** + * struct ieee80211_eht_operation - eht operation element + * + * This structure is the "EHT Operation Element" fields as + * described in P802.11be_D1.4 section 9.4.2.311 + * + * FIXME: The spec is unclear how big the fields are, and doesn't + * indicate the "Disabled Subchannel Bitmap Present" in the + * structure (Figure 9-1002a) at all ... + */ +struct ieee80211_eht_operation { + u8 chan_width; + u8 ccfs; + u8 present_bm; + + u8 disable_subchannel_bitmap[]; +} __packed; + +#define IEEE80211_EHT_OPER_DISABLED_SUBCHANNEL_BITMAP_PRESENT 0x1 + /* 802.11ac VHT Capabilities */ #define IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_3895 0x00000000 #define IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_7991 0x00000001 @@ -2129,6 +2235,8 @@ enum ieee80211_client_reg_power { #define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G 0x04 #define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G 0x08 #define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G 0x10 +#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_MASK_ALL 0x1e + #define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_RU_MAPPING_IN_2G 0x20 #define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_RU_MAPPING_IN_5G 0x40 #define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_MASK 0xfe @@ -2622,6 +2730,194 @@ ieee80211_he_spr_size(const u8 *he_spr_ie) #define S1G_OPER_CH_WIDTH_PRIMARY_1MHZ BIT(0) #define S1G_OPER_CH_WIDTH_OPER GENMASK(4, 1) +/* EHT MAC capabilities as defined in P802.11be_D1.4 section 9.4.2.313.2 */ +#define IEEE80211_EHT_MAC_CAP0_NSEP_PRIO_ACCESS 0x01 +#define IEEE80211_EHT_MAC_CAP0_OM_CONTROL 0x02 +#define IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE1 0x04 +#define IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE2 0x08 +#define IEEE80211_EHT_MAC_CAP0_RESTRICTED_TWT 0x10 +#define IEEE80211_EHT_MAC_CAP0_SCS_TRAFFIC_DESC 0x20 +#define IEEE80211_EHT_MAC_CAP0_MAX_AMPDU_LEN_MASK 0xc0 +#define IEEE80211_EHT_MAC_CAP0_MAX_AMPDU_LEN_3895 0 +#define IEEE80211_EHT_MAC_CAP0_MAX_AMPDU_LEN_7991 1 +#define IEEE80211_EHT_MAC_CAP0_MAX_AMPDU_LEN_11454 2 + +/* EHT PHY capabilities as defined in P802.11be_D1.4 section 9.4.2.313.3 */ +#define IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ 0x02 +#define IEEE80211_EHT_PHY_CAP0_242_TONE_RU_GT20MHZ 0x04 +#define IEEE80211_EHT_PHY_CAP0_NDP_4_EHT_LFT_32_GI 0x08 +#define IEEE80211_EHT_PHY_CAP0_PARTIAL_BW_UL_MU_MIMO 0x10 +#define IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMER 0x20 +#define IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMEE 0x40 + +/* EHT beamformee number of spatial streams <= 80MHz is split */ +#define IEEE80211_EHT_PHY_CAP0_BEAMFORMEE_SS_80MHZ_MASK 0x80 +#define IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_80MHZ_MASK 0x03 + +#define IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_160MHZ_MASK 0x1c +#define IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_320MHZ_MASK 0xe0 + +#define IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_80MHZ_MASK 0x07 +#define IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_160MHZ_MASK 0x38 + +/* EHT number of sounding dimensions for 320MHz is split */ +#define IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_320MHZ_MASK 0xc0 +#define IEEE80211_EHT_PHY_CAP3_SOUNDING_DIM_320MHZ_MASK 0x01 +#define IEEE80211_EHT_PHY_CAP3_NG_16_SU_FEEDBACK 0x02 +#define IEEE80211_EHT_PHY_CAP3_NG_16_MU_FEEDBACK 0x04 +#define IEEE80211_EHT_PHY_CAP3_CODEBOOK_4_2_SU_FDBK 0x08 +#define IEEE80211_EHT_PHY_CAP3_CODEBOOK_7_5_MU_FDBK 0x10 +#define IEEE80211_EHT_PHY_CAP3_TRIG_SU_BF_FDBK 0x20 +#define IEEE80211_EHT_PHY_CAP3_TRIG_MU_BF_PART_BW_FDBK 0x40 +#define IEEE80211_EHT_PHY_CAP3_TRIG_CQI_FDBK 0x80 + +#define IEEE80211_EHT_PHY_CAP4_PART_BW_DL_MU_MIMO 0x01 +#define IEEE80211_EHT_PHY_CAP4_PSR_SR_SUPP 0x02 +#define IEEE80211_EHT_PHY_CAP4_POWER_BOOST_FACT_SUPP 0x04 +#define IEEE80211_EHT_PHY_CAP4_EHT_MU_PPDU_4_EHT_LTF_08_GI 0x08 +#define IEEE80211_EHT_PHY_CAP4_MAX_NC_MASK 0xf0 + +#define IEEE80211_EHT_PHY_CAP5_NON_TRIG_CQI_FEEDBACK 0x01 +#define IEEE80211_EHT_PHY_CAP5_TX_LESS_242_TONE_RU_SUPP 0x02 +#define IEEE80211_EHT_PHY_CAP5_RX_LESS_242_TONE_RU_SUPP 0x04 +#define IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT 0x08 +#define IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_MASK 0x30 +#define IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_0US 0 +#define IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_8US 1 +#define IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_16US 2 +#define IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_20US 3 + +/* Maximum number of supported EHT LTF is split */ +#define IEEE80211_EHT_PHY_CAP5_MAX_NUM_SUPP_EHT_LTF_MASK 0xc0 +#define IEEE80211_EHT_PHY_CAP6_MAX_NUM_SUPP_EHT_LTF_MASK 0x07 + +#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_MASK 0x78 +#define IEEE80211_EHT_PHY_CAP6_EHT_DUP_6GHZ_SUPP 0x80 + +#define IEEE80211_EHT_PHY_CAP7_20MHZ_STA_RX_NDP_WIDER_BW 0x01 +#define IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_80MHZ 0x02 +#define IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_160MHZ 0x04 +#define IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_320MHZ 0x08 +#define IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_80MHZ 0x10 +#define IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_160MHZ 0x20 +#define IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_320MHZ 0x40 +#define IEEE80211_EHT_PHY_CAP7_TB_SOUNDING_FDBK_RATE_LIMIT 0x80 + +#define IEEE80211_EHT_PHY_CAP8_RX_1024QAM_WIDER_BW_DL_OFDMA 0x01 +#define IEEE80211_EHT_PHY_CAP8_RX_4096QAM_WIDER_BW_DL_OFDMA 0x02 + +/* + * EHT operation channel width as defined in P802.11be_D1.4 section 9.4.2.311 + */ +#define IEEE80211_EHT_OPER_CHAN_WIDTH 0x7 +#define IEEE80211_EHT_OPER_CHAN_WIDTH_20MHZ 0 +#define IEEE80211_EHT_OPER_CHAN_WIDTH_40MHZ 1 +#define IEEE80211_EHT_OPER_CHAN_WIDTH_80MHZ 2 +#define IEEE80211_EHT_OPER_CHAN_WIDTH_160MHZ 3 +#define IEEE80211_EHT_OPER_CHAN_WIDTH_320MHZ 4 + +/* Calculate 802.11be EHT capabilities IE Tx/Rx EHT MCS NSS Support Field size */ +static inline u8 +ieee80211_eht_mcs_nss_size(const struct ieee80211_he_cap_elem *he_cap, + const struct ieee80211_eht_cap_elem_fixed *eht_cap) +{ + u8 count = 0; + + /* on 2.4 GHz, if it supports 40 MHz, the result is 3 */ + if (he_cap->phy_cap_info[0] & + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G) + return 3; + + /* on 2.4 GHz, these three bits are reserved, so should be 0 */ + if (he_cap->phy_cap_info[0] & + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G) + count += 3; + + if (he_cap->phy_cap_info[0] & + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G) + count += 3; + + if (eht_cap->phy_cap_info[0] & IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ) + count += 3; + + return count ? count : 4; +} + +/* 802.11be EHT PPE Thresholds */ +#define IEEE80211_EHT_PPE_THRES_NSS_POS 0 +#define IEEE80211_EHT_PPE_THRES_NSS_MASK 0xf +#define IEEE80211_EHT_PPE_THRES_RU_INDEX_BITMASK_MASK 0x1f0 +#define IEEE80211_EHT_PPE_THRES_INFO_PPET_SIZE 3 +#define IEEE80211_EHT_PPE_THRES_INFO_HEADER_SIZE 9 + +/* + * Calculate 802.11be EHT capabilities IE EHT field size + */ +static inline u8 +ieee80211_eht_ppe_size(u16 ppe_thres_hdr, const u8 *phy_cap_info) +{ + u32 n; + + if (!(phy_cap_info[5] & + IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT)) + return 0; + + n = hweight16(ppe_thres_hdr & + IEEE80211_EHT_PPE_THRES_RU_INDEX_BITMASK_MASK); + n *= 1 + u16_get_bits(ppe_thres_hdr, IEEE80211_EHT_PPE_THRES_NSS_MASK); + + /* + * Each pair is 6 bits, and we need to add the 9 "header" bits to the + * total size. + */ + n = n * IEEE80211_EHT_PPE_THRES_INFO_PPET_SIZE * 2 + + IEEE80211_EHT_PPE_THRES_INFO_HEADER_SIZE; + return DIV_ROUND_UP(n, 8); +} + +static inline bool +ieee80211_eht_capa_size_ok(const u8 *he_capa, const u8 *data, u8 len) +{ + const struct ieee80211_eht_cap_elem_fixed *elem = (const void *)data; + u8 needed = sizeof(struct ieee80211_eht_cap_elem_fixed); + + if (len < needed || !he_capa) + return false; + + needed += ieee80211_eht_mcs_nss_size((const void *)he_capa, + (const void *)data); + if (len < needed) + return false; + + if (elem->phy_cap_info[5] & + IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT) { + u16 ppe_thres_hdr; + + if (len < needed + sizeof(ppe_thres_hdr)) + return false; + + ppe_thres_hdr = get_unaligned_le16(data + needed); + needed += ieee80211_eht_ppe_size(ppe_thres_hdr, + elem->phy_cap_info); + } + + return len >= needed; +} + +static inline bool +ieee80211_eht_oper_size_ok(const u8 *data, u8 len) +{ + const struct ieee80211_eht_operation *elem = (const void *)data; + u8 needed = sizeof(*elem); + + if (len < needed) + return false; + + if (elem->present_bm & IEEE80211_EHT_OPER_DISABLED_SUBCHANNEL_BITMAP_PRESENT) + needed += 2; + + return len >= needed; +} #define LISTEN_INT_USF GENMASK(15, 14) #define LISTEN_INT_UI GENMASK(13, 0) @@ -3077,6 +3373,9 @@ enum ieee80211_eid_ext { WLAN_EID_EXT_SHORT_SSID_LIST = 58, WLAN_EID_EXT_HE_6GHZ_CAPA = 59, WLAN_EID_EXT_UL_MU_POWER_CAPA = 60, + WLAN_EID_EXT_EHT_OPERATION = 106, + WLAN_EID_EXT_EHT_MULTI_LINK = 107, + WLAN_EID_EXT_EHT_CAPABILITY = 108, }; /* Action category code */ -- cgit v1.2.3 From 2a2c86f15e17c5013b9897b67d895e64a25ae3cb Mon Sep 17 00:00:00 2001 From: Mordechay Goodstein Date: Mon, 14 Feb 2022 17:29:52 +0100 Subject: ieee80211: add EHT 1K aggregation definitions We add the fields for parsing extended ADDBA request/respond, and new max 1K aggregation for limit ADDBA request/respond. Adjust drivers to use the proper macro, IEEE80211_MAX_AMPDU_BUF -> IEEE80211_MAX_AMPDU_BUF_HE. Signed-off-by: Mordechay Goodstein Link: https://lore.kernel.org/r/20220214173004.b8b447ce95b7.I0ee2554c94e89abc7a752b0f7cc7fd79c273efea@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 7204bc28ca31..a4143466cb32 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1024,6 +1024,8 @@ struct ieee80211_tpc_report_ie { #define IEEE80211_ADDBA_EXT_FRAG_LEVEL_MASK GENMASK(2, 1) #define IEEE80211_ADDBA_EXT_FRAG_LEVEL_SHIFT 1 #define IEEE80211_ADDBA_EXT_NO_FRAG BIT(0) +#define IEEE80211_ADDBA_EXT_BUF_SIZE_MASK GENMASK(7, 5) +#define IEEE80211_ADDBA_EXT_BUF_SIZE_SHIFT 10 struct ieee80211_addba_ext_ie { u8 data; @@ -1698,10 +1700,12 @@ struct ieee80211_ht_operation { * A-MPDU buffer sizes * According to HT size varies from 8 to 64 frames * HE adds the ability to have up to 256 frames. + * EHT adds the ability to have up to 1K frames. */ #define IEEE80211_MIN_AMPDU_BUF 0x8 #define IEEE80211_MAX_AMPDU_BUF_HT 0x40 -#define IEEE80211_MAX_AMPDU_BUF 0x100 +#define IEEE80211_MAX_AMPDU_BUF_HE 0x100 +#define IEEE80211_MAX_AMPDU_BUF_EHT 0x400 /* Spatial Multiplexing Power Save Modes (for capability) */ -- cgit v1.2.3 From 5cd5a8a3e2fb11b1c8a09f062c44c1e228ef987a Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Mon, 14 Feb 2022 17:29:53 +0100 Subject: cfg80211: Add data structures to capture EHT capabilities And advertise EHT capabilities to user space when supported. Signed-off-by: Ilan Peer Link: https://lore.kernel.org/r/20220214173004.6fb70658529f.I2413a37c8f7d2d6d638038a3d95360a3fce0114d@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 63 ++++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/nl80211.h | 12 +++++++++ 2 files changed, 75 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index f6db085ff3df..7bec15f605be 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -360,6 +360,48 @@ struct ieee80211_sta_he_cap { u8 ppe_thres[IEEE80211_HE_PPE_THRES_MAX_LEN]; }; +/** + * struct ieee80211_eht_mcs_nss_supp - EHT max supported NSS per MCS + * + * See P802.11be_D1.3 Table 9-401k - "Subfields of the Supported EHT-MCS + * and NSS Set field" + * + * @only_20mhz: MCS/NSS support for 20 MHz-only STA. + * @bw._80: MCS/NSS support for BW <= 80 MHz + * @bw._160: MCS/NSS support for BW = 160 MHz + * @bw._320: MCS/NSS support for BW = 320 MHz + */ +struct ieee80211_eht_mcs_nss_supp { + union { + struct ieee80211_eht_mcs_nss_supp_20mhz_only only_20mhz; + struct { + struct ieee80211_eht_mcs_nss_supp_bw _80; + struct ieee80211_eht_mcs_nss_supp_bw _160; + struct ieee80211_eht_mcs_nss_supp_bw _320; + } __packed bw; + } __packed; +} __packed; + +#define IEEE80211_EHT_PPE_THRES_MAX_LEN 32 + +/** + * struct ieee80211_sta_eht_cap - STA's EHT capabilities + * + * This structure describes most essential parameters needed + * to describe 802.11be EHT capabilities for a STA. + * + * @has_eht: true iff EHT data is valid. + * @eht_cap_elem: Fixed portion of the eht capabilities element. + * @eht_mcs_nss_supp: The supported NSS/MCS combinations. + * @eht_ppe_thres: Holds the PPE Thresholds data. + */ +struct ieee80211_sta_eht_cap { + bool has_eht; + struct ieee80211_eht_cap_elem_fixed eht_cap_elem; + struct ieee80211_eht_mcs_nss_supp eht_mcs_nss_supp; + u8 eht_ppe_thres[IEEE80211_EHT_PPE_THRES_MAX_LEN]; +}; + /** * struct ieee80211_sband_iftype_data - sband data per interface type * @@ -379,6 +421,7 @@ struct ieee80211_sband_iftype_data { u16 types_mask; struct ieee80211_sta_he_cap he_cap; struct ieee80211_he_6ghz_capa he_6ghz_capa; + struct ieee80211_sta_eht_cap eht_cap; struct { const u8 *data; unsigned int len; @@ -561,6 +604,26 @@ ieee80211_get_he_6ghz_capa(const struct ieee80211_supported_band *sband, return data->he_6ghz_capa.capa; } +/** + * ieee80211_get_eht_iftype_cap - return ETH capabilities for an sband's iftype + * @sband: the sband to search for the iftype on + * @iftype: enum nl80211_iftype + * + * Return: pointer to the struct ieee80211_sta_eht_cap, or NULL is none found + */ +static inline const struct ieee80211_sta_eht_cap * +ieee80211_get_eht_iftype_cap(const struct ieee80211_supported_band *sband, + enum nl80211_iftype iftype) +{ + const struct ieee80211_sband_iftype_data *data = + ieee80211_get_sband_iftype_data(sband, iftype); + + if (data && data->eht_cap.has_eht) + return &data->eht_cap; + + return NULL; +} + /** * wiphy_read_of_freq_limits - read frequency limits from device tree * diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 195a238a322e..d305a8b8c536 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3766,6 +3766,14 @@ enum nl80211_mpath_info { * given for all 6 GHz band channels * @NL80211_BAND_IFTYPE_ATTR_VENDOR_ELEMS: vendor element capabilities that are * advertised on this band/for this iftype (binary) + * @NL80211_BAND_IFTYPE_ATTR_EHT_CAP_MAC: EHT MAC capabilities as in EHT + * capabilities element + * @NL80211_BAND_IFTYPE_ATTR_EHT_CAP_PHY: EHT PHY capabilities as in EHT + * capabilities element + * @NL80211_BAND_IFTYPE_ATTR_EHT_CAP_MCS_SET: EHT supported NSS/MCS as in EHT + * capabilities element + * @NL80211_BAND_IFTYPE_ATTR_EHT_CAP_PPE: EHT PPE thresholds information as + * defined in EHT capabilities element * @__NL80211_BAND_IFTYPE_ATTR_AFTER_LAST: internal use * @NL80211_BAND_IFTYPE_ATTR_MAX: highest band attribute currently defined */ @@ -3779,6 +3787,10 @@ enum nl80211_band_iftype_attr { NL80211_BAND_IFTYPE_ATTR_HE_CAP_PPE, NL80211_BAND_IFTYPE_ATTR_HE_6GHZ_CAPA, NL80211_BAND_IFTYPE_ATTR_VENDOR_ELEMS, + NL80211_BAND_IFTYPE_ATTR_EHT_CAP_MAC, + NL80211_BAND_IFTYPE_ATTR_EHT_CAP_PHY, + NL80211_BAND_IFTYPE_ATTR_EHT_CAP_MCS_SET, + NL80211_BAND_IFTYPE_ATTR_EHT_CAP_PPE, /* keep last */ __NL80211_BAND_IFTYPE_ATTR_AFTER_LAST, -- cgit v1.2.3 From 3743bec6120ae0748cb3fda6ff80a690117ef1f3 Mon Sep 17 00:00:00 2001 From: Jia Ding Date: Mon, 14 Feb 2022 17:29:54 +0100 Subject: cfg80211: Add support for EHT 320 MHz channel width Add 320 MHz support in the channel def and center frequency validation with compatible check. Signed-off-by: Jia Ding Co-authored-by: Karthikeyan Periyasamy Signed-off-by: Karthikeyan Periyasamy Co-authored-by: Muna Sinada Signed-off-by: Muna Sinada Co-authored-by: Veerendranath Jakkam Signed-off-by: Veerendranath Jakkam Link: https://lore.kernel.org/r/1640163883-12696-5-git-send-email-quic_vjakkam@quicinc.com Link: https://lore.kernel.org/r/20220214163009.175289-1-johannes@sipsolutions.net Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 7 ++++++- include/uapi/linux/nl80211.h | 3 +++ 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 7bec15f605be..a27f2d699dac 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -109,7 +109,11 @@ struct wiphy; * on this channel. * @IEEE80211_CHAN_16MHZ: 16 MHz bandwidth is permitted * on this channel. - * + * @IEEE80211_CHAN_NO_320MHZ: If the driver supports 320 MHz on the band, + * this flag indicates that a 320 MHz channel cannot use this + * channel as the control or any of the secondary channels. + * This may be due to the driver or due to regulatory bandwidth + * restrictions. */ enum ieee80211_channel_flags { IEEE80211_CHAN_DISABLED = 1<<0, @@ -131,6 +135,7 @@ enum ieee80211_channel_flags { IEEE80211_CHAN_4MHZ = 1<<16, IEEE80211_CHAN_8MHZ = 1<<17, IEEE80211_CHAN_16MHZ = 1<<18, + IEEE80211_CHAN_NO_320MHZ = 1<<19, }; #define IEEE80211_CHAN_NO_HT40 \ diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index d305a8b8c536..9e05973f3f56 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4684,6 +4684,8 @@ enum nl80211_key_mode { * @NL80211_CHAN_WIDTH_4: 4 MHz OFDM channel * @NL80211_CHAN_WIDTH_8: 8 MHz OFDM channel * @NL80211_CHAN_WIDTH_16: 16 MHz OFDM channel + * @NL80211_CHAN_WIDTH_320: 320 MHz channel, the %NL80211_ATTR_CENTER_FREQ1 + * attribute must be provided as well */ enum nl80211_chan_width { NL80211_CHAN_WIDTH_20_NOHT, @@ -4699,6 +4701,7 @@ enum nl80211_chan_width { NL80211_CHAN_WIDTH_4, NL80211_CHAN_WIDTH_8, NL80211_CHAN_WIDTH_16, + NL80211_CHAN_WIDTH_320, }; /** -- cgit v1.2.3 From cfb14110acf87b4db62e07ba08a80429f1749f40 Mon Sep 17 00:00:00 2001 From: Veerendranath Jakkam Date: Mon, 14 Feb 2022 17:29:55 +0100 Subject: nl80211: add EHT MCS support Add support for reporting and calculating EHT bitrates. Signed-off-by: Veerendranath Jakkam Link: https://lore.kernel.org/r/1640163883-12696-7-git-send-email-quic_vjakkam@quicinc.com Link: https://lore.kernel.org/r/20220214163009.175289-2-johannes@sipsolutions.net Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 11 ++++++++ include/uapi/linux/nl80211.h | 62 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index a27f2d699dac..f35ffd81d213 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1595,6 +1595,7 @@ int cfg80211_check_station_change(struct wiphy *wiphy, * @RATE_INFO_FLAGS_HE_MCS: HE MCS information * @RATE_INFO_FLAGS_EDMG: 60GHz MCS in EDMG mode * @RATE_INFO_FLAGS_EXTENDED_SC_DMG: 60GHz extended SC MCS + * @RATE_INFO_FLAGS_EHT_MCS: EHT MCS information */ enum rate_info_flags { RATE_INFO_FLAGS_MCS = BIT(0), @@ -1604,6 +1605,7 @@ enum rate_info_flags { RATE_INFO_FLAGS_HE_MCS = BIT(4), RATE_INFO_FLAGS_EDMG = BIT(5), RATE_INFO_FLAGS_EXTENDED_SC_DMG = BIT(6), + RATE_INFO_FLAGS_EHT_MCS = BIT(7), }; /** @@ -1618,6 +1620,8 @@ enum rate_info_flags { * @RATE_INFO_BW_80: 80 MHz bandwidth * @RATE_INFO_BW_160: 160 MHz bandwidth * @RATE_INFO_BW_HE_RU: bandwidth determined by HE RU allocation + * @RATE_INFO_BW_320: 320 MHz bandwidth + * @RATE_INFO_BW_EHT_RU: bandwidth determined by EHT RU allocation */ enum rate_info_bw { RATE_INFO_BW_20 = 0, @@ -1627,6 +1631,8 @@ enum rate_info_bw { RATE_INFO_BW_80, RATE_INFO_BW_160, RATE_INFO_BW_HE_RU, + RATE_INFO_BW_320, + RATE_INFO_BW_EHT_RU, }; /** @@ -1644,6 +1650,9 @@ enum rate_info_bw { * @he_ru_alloc: HE RU allocation (from &enum nl80211_he_ru_alloc, * only valid if bw is %RATE_INFO_BW_HE_RU) * @n_bonded_ch: In case of EDMG the number of bonded channels (1-4) + * @eht_gi: EHT guard interval (from &enum nl80211_eht_gi) + * @eht_ru_alloc: EHT RU allocation (from &enum nl80211_eht_ru_alloc, + * only valid if bw is %RATE_INFO_BW_EHT_RU) */ struct rate_info { u8 flags; @@ -1655,6 +1664,8 @@ struct rate_info { u8 he_dcm; u8 he_ru_alloc; u8 n_bonded_ch; + u8 eht_gi; + u8 eht_ru_alloc; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 9e05973f3f56..d0ba70ea5d04 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3392,6 +3392,56 @@ enum nl80211_he_ru_alloc { NL80211_RATE_INFO_HE_RU_ALLOC_2x996, }; +/** + * enum nl80211_eht_gi - EHT guard interval + * @NL80211_RATE_INFO_EHT_GI_0_8: 0.8 usec + * @NL80211_RATE_INFO_EHT_GI_1_6: 1.6 usec + * @NL80211_RATE_INFO_EHT_GI_3_2: 3.2 usec + */ +enum nl80211_eht_gi { + NL80211_RATE_INFO_EHT_GI_0_8, + NL80211_RATE_INFO_EHT_GI_1_6, + NL80211_RATE_INFO_EHT_GI_3_2, +}; + +/** + * enum nl80211_eht_ru_alloc - EHT RU allocation values + * @NL80211_RATE_INFO_EHT_RU_ALLOC_26: 26-tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_52: 52-tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_52P26: 52+26-tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_106: 106-tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_106P26: 106+26 tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_242: 242-tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_484: 484-tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_484P242: 484+242 tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_996: 996-tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_996P484: 996+484 tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_996P484P242: 996+484+242 tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_2x996: 2x996-tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_2x996P484: 2x996+484 tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_3x996: 3x996-tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_3x996P484: 3x996+484 tone RU allocation + * @NL80211_RATE_INFO_EHT_RU_ALLOC_4x996: 4x996-tone RU allocation + */ +enum nl80211_eht_ru_alloc { + NL80211_RATE_INFO_EHT_RU_ALLOC_26, + NL80211_RATE_INFO_EHT_RU_ALLOC_52, + NL80211_RATE_INFO_EHT_RU_ALLOC_52P26, + NL80211_RATE_INFO_EHT_RU_ALLOC_106, + NL80211_RATE_INFO_EHT_RU_ALLOC_106P26, + NL80211_RATE_INFO_EHT_RU_ALLOC_242, + NL80211_RATE_INFO_EHT_RU_ALLOC_484, + NL80211_RATE_INFO_EHT_RU_ALLOC_484P242, + NL80211_RATE_INFO_EHT_RU_ALLOC_996, + NL80211_RATE_INFO_EHT_RU_ALLOC_996P484, + NL80211_RATE_INFO_EHT_RU_ALLOC_996P484P242, + NL80211_RATE_INFO_EHT_RU_ALLOC_2x996, + NL80211_RATE_INFO_EHT_RU_ALLOC_2x996P484, + NL80211_RATE_INFO_EHT_RU_ALLOC_3x996, + NL80211_RATE_INFO_EHT_RU_ALLOC_3x996P484, + NL80211_RATE_INFO_EHT_RU_ALLOC_4x996, +}; + /** * enum nl80211_rate_info - bitrate information * @@ -3431,6 +3481,13 @@ enum nl80211_he_ru_alloc { * @NL80211_RATE_INFO_HE_DCM: HE DCM value (u8, 0/1) * @NL80211_RATE_INFO_RU_ALLOC: HE RU allocation, if not present then * non-OFDMA was used (u8, see &enum nl80211_he_ru_alloc) + * @NL80211_RATE_INFO_320_MHZ_WIDTH: 320 MHz bitrate + * @NL80211_RATE_INFO_EHT_MCS: EHT MCS index (u8, 0-15) + * @NL80211_RATE_INFO_EHT_NSS: EHT NSS value (u8, 1-8) + * @NL80211_RATE_INFO_EHT_GI: EHT guard interval identifier + * (u8, see &enum nl80211_eht_gi) + * @NL80211_RATE_INFO_EHT_RU_ALLOC: EHT RU allocation, if not present then + * non-OFDMA was used (u8, see &enum nl80211_eht_ru_alloc) * @__NL80211_RATE_INFO_AFTER_LAST: internal use */ enum nl80211_rate_info { @@ -3452,6 +3509,11 @@ enum nl80211_rate_info { NL80211_RATE_INFO_HE_GI, NL80211_RATE_INFO_HE_DCM, NL80211_RATE_INFO_HE_RU_ALLOC, + NL80211_RATE_INFO_320_MHZ_WIDTH, + NL80211_RATE_INFO_EHT_MCS, + NL80211_RATE_INFO_EHT_NSS, + NL80211_RATE_INFO_EHT_GI, + NL80211_RATE_INFO_EHT_RU_ALLOC, /* keep last */ __NL80211_RATE_INFO_AFTER_LAST, -- cgit v1.2.3 From c2b3d7699fb0ce66538b829af43970acc2f89060 Mon Sep 17 00:00:00 2001 From: Sriram R Date: Mon, 14 Feb 2022 17:29:56 +0100 Subject: nl80211: add support for 320MHz channel limitation Add support to advertise drivers or regulatory limitations on 320 MHz channels to userspace. Signed-off-by: Sriram R Co-authored-by: Karthikeyan Periyasamy Signed-off-by: Karthikeyan Periyasamy Co-authored-by: Veerendranath Jakkam Signed-off-by: Veerendranath Jakkam Link: https://lore.kernel.org/r/1640163883-12696-6-git-send-email-quic_vjakkam@quicinc.com Link: https://lore.kernel.org/r/20220214163009.175289-3-johannes@sipsolutions.net Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index d0ba70ea5d04..6a338dafcd07 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3997,6 +3997,8 @@ enum nl80211_wmm_rule { * on this channel in current regulatory domain. * @NL80211_FREQUENCY_ATTR_16MHZ: 16 MHz operation is allowed * on this channel in current regulatory domain. + * @NL80211_FREQUENCY_ATTR_NO_320MHZ: any 320 MHz channel using this channel + * as the primary or any of the secondary channels isn't possible * @NL80211_FREQUENCY_ATTR_MAX: highest frequency attribute number * currently defined * @__NL80211_FREQUENCY_ATTR_AFTER_LAST: internal use @@ -4033,6 +4035,7 @@ enum nl80211_frequency_attr { NL80211_FREQUENCY_ATTR_4MHZ, NL80211_FREQUENCY_ATTR_8MHZ, NL80211_FREQUENCY_ATTR_16MHZ, + NL80211_FREQUENCY_ATTR_NO_320MHZ, /* keep last */ __NL80211_FREQUENCY_ATTR_AFTER_LAST, @@ -4231,6 +4234,7 @@ enum nl80211_sched_scan_match_attr { * @NL80211_RRF_NO_80MHZ: 80MHz operation not allowed * @NL80211_RRF_NO_160MHZ: 160MHz operation not allowed * @NL80211_RRF_NO_HE: HE operation not allowed + * @NL80211_RRF_NO_320MHZ: 320MHz operation not allowed */ enum nl80211_reg_rule_flags { NL80211_RRF_NO_OFDM = 1<<0, @@ -4249,6 +4253,7 @@ enum nl80211_reg_rule_flags { NL80211_RRF_NO_80MHZ = 1<<15, NL80211_RRF_NO_160MHZ = 1<<16, NL80211_RRF_NO_HE = 1<<17, + NL80211_RRF_NO_320MHZ = 1<<18, }; #define NL80211_RRF_PASSIVE_SCAN NL80211_RRF_NO_IR -- cgit v1.2.3 From 31846b657857e6a73d982604f36a34710d98902c Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Mon, 14 Feb 2022 17:29:57 +0100 Subject: cfg80211: add NO-EHT flag to regulatory This may be necessary in some cases, add a flag and propagate it, just like the NO-HE that already exists. Signed-off-by: Ilan Peer [split off from a combined 320/no-EHT patch] Link: https://lore.kernel.org/r/20220214173004.dbb85a7b86bb.Ifc1e2daac51c1cc5f895ccfb79faf5eaec3950ec@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ include/uapi/linux/nl80211.h | 3 +++ 2 files changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index f35ffd81d213..5cfc483dece1 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -114,6 +114,7 @@ struct wiphy; * channel as the control or any of the secondary channels. * This may be due to the driver or due to regulatory bandwidth * restrictions. + * @IEEE80211_CHAN_NO_EHT: EHT operation is not permitted on this channel. */ enum ieee80211_channel_flags { IEEE80211_CHAN_DISABLED = 1<<0, @@ -136,6 +137,7 @@ enum ieee80211_channel_flags { IEEE80211_CHAN_8MHZ = 1<<17, IEEE80211_CHAN_16MHZ = 1<<18, IEEE80211_CHAN_NO_320MHZ = 1<<19, + IEEE80211_CHAN_NO_EHT = 1<<20, }; #define IEEE80211_CHAN_NO_HT40 \ diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 6a338dafcd07..baf6433c0119 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3999,6 +3999,8 @@ enum nl80211_wmm_rule { * on this channel in current regulatory domain. * @NL80211_FREQUENCY_ATTR_NO_320MHZ: any 320 MHz channel using this channel * as the primary or any of the secondary channels isn't possible + * @NL80211_FREQUENCY_ATTR_NO_EHT: EHT operation is not allowed on this channel + * in current regulatory domain. * @NL80211_FREQUENCY_ATTR_MAX: highest frequency attribute number * currently defined * @__NL80211_FREQUENCY_ATTR_AFTER_LAST: internal use @@ -4036,6 +4038,7 @@ enum nl80211_frequency_attr { NL80211_FREQUENCY_ATTR_8MHZ, NL80211_FREQUENCY_ATTR_16MHZ, NL80211_FREQUENCY_ATTR_NO_320MHZ, + NL80211_FREQUENCY_ATTR_NO_EHT, /* keep last */ __NL80211_FREQUENCY_ATTR_AFTER_LAST, -- cgit v1.2.3 From ea05fd3581d32a0f1098657005c7a9b763798fe8 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Mon, 14 Feb 2022 17:29:58 +0100 Subject: cfg80211: Support configuration of station EHT capabilities Add attributes and some code bits to support userspace passing in EHT capabilities of stations. Signed-off-by: Ilan Peer Link: https://lore.kernel.org/r/20220214173004.ecf0b3ff9627.Icb4a5f2ec7b41d9008ac4cfc16c59baeb84793d3@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 4 ++++ include/uapi/linux/nl80211.h | 10 +++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 5cfc483dece1..68713388b617 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1487,6 +1487,8 @@ struct sta_txpwr { * @airtime_weight: airtime scheduler weight for this station * @txpwr: transmit power for an associated station * @he_6ghz_capa: HE 6 GHz Band capabilities of station + * @eht_capa: EHT capabilities of station + * @eht_capa_len: the length of the EHT capabilities */ struct station_parameters { const u8 *supported_rates; @@ -1520,6 +1522,8 @@ struct station_parameters { u16 airtime_weight; struct sta_txpwr txpwr; const struct ieee80211_he_6ghz_capa *he_6ghz_capa; + const struct ieee80211_eht_cap_elem *eht_capa; + u8 eht_capa_len; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index baf6433c0119..98ed52663d6b 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -11,7 +11,7 @@ * Copyright 2008 Jouni Malinen * Copyright 2008 Colin McCabe * Copyright 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2021 Intel Corporation + * Copyright (C) 2018-2022 Intel Corporation * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -2659,6 +2659,10 @@ enum nl80211_commands { * enumerated in &enum nl80211_ap_settings_flags. This attribute shall be * used with %NL80211_CMD_START_AP request. * + * @NL80211_ATTR_EHT_CAPABILITY: EHT Capability information element (from + * association request when used with NL80211_CMD_NEW_STATION). Can be set + * only if %NL80211_STA_FLAG_WME is set. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3169,6 +3173,8 @@ enum nl80211_attrs { NL80211_ATTR_AP_SETTINGS_FLAGS, + NL80211_ATTR_EHT_CAPABILITY, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -3224,6 +3230,8 @@ enum nl80211_attrs { #define NL80211_HE_MAX_CAPABILITY_LEN 54 #define NL80211_MAX_NR_CIPHER_SUITES 5 #define NL80211_MAX_NR_AKM_SUITES 2 +#define NL80211_EHT_MIN_CAPABILITY_LEN 13 +#define NL80211_EHT_MAX_CAPABILITY_LEN 51 #define NL80211_MIN_REMAIN_ON_CHANNEL_TIME 10 -- cgit v1.2.3 From 5dca295dd76756c7918ad7113fc4a3cf8262ed43 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Mon, 14 Feb 2022 17:30:00 +0100 Subject: mac80211: Add initial support for EHT and 320 MHz channels Add initial support for EHT and 320 MHz bandwidth in mac80211. As a new IEEE80211_STA_RX_BW_320 is added to enum ieee80211_sta_rx_bandwidth, update the drivers to avoid compilation warnings. Signed-off-by: Ilan Peer Link: https://lore.kernel.org/r/20220214173004.0f144cc0bba6.Iad18111264da87eed5fd7b017f0cc6e58c604e07@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index bd6912d0292b..586d3c26c8ac 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2005,6 +2005,7 @@ enum ieee80211_sta_state { * @IEEE80211_STA_RX_BW_80: station can receive up to 80 MHz * @IEEE80211_STA_RX_BW_160: station can receive up to 160 MHz * (including 80+80 MHz) + * @IEEE80211_STA_RX_BW_320: station can receive up to 320 MHz * * Implementation note: 20 must be zero to be initialized * correctly, the values must be sorted. @@ -2014,6 +2015,7 @@ enum ieee80211_sta_rx_bandwidth { IEEE80211_STA_RX_BW_40, IEEE80211_STA_RX_BW_80, IEEE80211_STA_RX_BW_160, + IEEE80211_STA_RX_BW_320, }; /** -- cgit v1.2.3 From a1de64078bf7a3fed856fef5334132ba1963b683 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Mon, 14 Feb 2022 17:30:02 +0100 Subject: mac80211: Handle station association response with EHT When the association is an EHT association, parse the EHT element from the association response and update the station's EHT capabilities accordingly. Signed-off-by: Ilan Peer Link: https://lore.kernel.org/r/20220214173004.f33574718755.I21182234c5303d9423eabd5eb997e7cf75f8e0c8@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 586d3c26c8ac..f118b8fe667a 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -636,6 +636,7 @@ struct ieee80211_fils_discovery { * @tx_pwr_env: transmit power envelope array of BSS. * @tx_pwr_env_num: number of @tx_pwr_env. * @pwr_reduction: power constraint of BSS. + * @eht_support: does this BSS support EHT */ struct ieee80211_bss_conf { const u8 *bssid; @@ -710,6 +711,7 @@ struct ieee80211_bss_conf { struct ieee80211_tx_pwr_env tx_pwr_env[IEEE80211_TPE_MAX_IE_COUNT]; u8 tx_pwr_env_num; u8 pwr_reduction; + bool eht_support; }; /** @@ -2071,6 +2073,7 @@ struct ieee80211_sta_txpwr { * @vht_cap: VHT capabilities of this STA; restricted to our own capabilities * @he_cap: HE capabilities of this STA * @he_6ghz_capa: on 6 GHz, holds the HE 6 GHz band capabilities + * @eht_cap: EHT capabilities of this STA * @max_rx_aggregation_subframes: maximal amount of frames in a single AMPDU * that this station is allowed to transmit to us. * Can be modified by driver. @@ -2111,6 +2114,7 @@ struct ieee80211_sta { struct ieee80211_sta_vht_cap vht_cap; struct ieee80211_sta_he_cap he_cap; struct ieee80211_he_6ghz_capa he_6ghz_capa; + struct ieee80211_sta_eht_cap eht_cap; u16 max_rx_aggregation_subframes; bool wme; u8 uapsd_queues; -- cgit v1.2.3 From f8e9ce4a6e85067d7d7cfa89167f5ce5f0ec2a8a Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Tue, 15 Feb 2022 18:11:24 -0800 Subject: mptcp: mptcp_parse_option is no longer exported Options parsing in now done from mptcp_incoming_options(). mptcp_parse_option() has been removed from mptcp.h when CONFIG_MPTCP is defined but not when it is not. Fixes: cfde141ea3fa ("mptcp: move option parsing into mptcp_incoming_options()") Acked-by: Paolo Abeni Signed-off-by: Matthieu Baerts Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- include/net/mptcp.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/net/mptcp.h b/include/net/mptcp.h index a925349b4b89..0a3b0fb04a3b 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -217,12 +217,6 @@ static inline bool rsk_drop_req(const struct request_sock *req) return false; } -static inline void mptcp_parse_option(const struct sk_buff *skb, - const unsigned char *ptr, int opsize, - struct tcp_options_received *opt_rx) -{ -} - static inline bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, unsigned int *size, struct mptcp_out_options *opts) -- cgit v1.2.3 From c518afec288351347dbe05ea3d49d18fb9a9fff1 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 16 Feb 2022 16:30:05 +0200 Subject: net: mscc: ocelot: consolidate cookie allocation for private VCAP rules Every use case that needed VCAP filters (in order: DSA tag_8021q, MRP, PTP traps) has hardcoded filter identifiers that worked well enough for that use case alone. But when two or more of those use cases would be used together, some of those identifiers would overlap, leading to breakage. Add definitions for each cookie and centralize them in ocelot_vcap.h, such that the overlaps are more obvious. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/soc/mscc/ocelot_vcap.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/soc/mscc/ocelot_vcap.h b/include/soc/mscc/ocelot_vcap.h index 709cbc198fd2..562bcd972132 100644 --- a/include/soc/mscc/ocelot_vcap.h +++ b/include/soc/mscc/ocelot_vcap.h @@ -8,6 +8,22 @@ #include +/* Cookie definitions for private VCAP filters installed by the driver. + * Must be unique per VCAP block. + */ +#define OCELOT_VCAP_ES0_TAG_8021Q_RXVLAN(ocelot, port) (port) +#define OCELOT_VCAP_IS1_TAG_8021Q_TXVLAN(ocelot, port) (port) +#define OCELOT_VCAP_IS1_TAG_8021Q_PTP_MMIO(ocelot) ((ocelot)->num_phys_ports) +#define OCELOT_VCAP_IS2_TAG_8021Q_TXVLAN(ocelot, port) (port) +#define OCELOT_VCAP_IS2_TAG_8021Q_PTP_MMIO(ocelot) ((ocelot)->num_phys_ports) +#define OCELOT_VCAP_IS2_L2_PTP_TRAP(ocelot) ((ocelot)->num_phys_ports + 1) +#define OCELOT_VCAP_IS2_IPV4_GEN_PTP_TRAP(ocelot) ((ocelot)->num_phys_ports + 2) +#define OCELOT_VCAP_IS2_IPV4_EV_PTP_TRAP(ocelot) ((ocelot)->num_phys_ports + 3) +#define OCELOT_VCAP_IS2_IPV6_GEN_PTP_TRAP(ocelot) ((ocelot)->num_phys_ports + 4) +#define OCELOT_VCAP_IS2_IPV6_EV_PTP_TRAP(ocelot) ((ocelot)->num_phys_ports + 5) +#define OCELOT_VCAP_IS2_MRP_REDIRECT(ocelot, port) (port) +#define OCELOT_VCAP_IS2_MRP_TRAP(ocelot, port) ((ocelot)->num_phys_ports + (port)) + /* ================================================================= * VCAP Common * ================================================================= -- cgit v1.2.3 From 36fac35b29072e345d5fc485cf7841be265181b1 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 16 Feb 2022 16:30:06 +0200 Subject: net: mscc: ocelot: delete OCELOT_MRP_CPUQ MRP frames are configured to be trapped to the CPU queue 7, and this number is reflected in the extraction header. However, the information isn't used anywhere, so just leave MRP frames to go to CPU queue 0 unless needed otherwise. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/soc/mscc/ocelot.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index cacb103e4bad..2d7456c0e77d 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -105,8 +105,6 @@ #define REG_RESERVED_ADDR 0xffffffff #define REG_RESERVED(reg) REG(reg, REG_RESERVED_ADDR) -#define OCELOT_MRP_CPUQ 7 - enum ocelot_target { ANA = 1, QS, -- cgit v1.2.3 From b9bace6e534d431871a9d69cbd06d3a798f5086d Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 16 Feb 2022 16:30:07 +0200 Subject: net: mscc: ocelot: use a single VCAP filter for all MRP traps The MRP assist code installs a VCAP IS2 trapping rule for each port, but since the key and the action is the same, just the ingress port mask differs, there isn't any need to do this. We can save some space in the TCAM by using a single filter and adjusting the ingress port mask. Reuse the ocelot_trap_add() and ocelot_trap_del() functions for this purpose. Now that the cookies are no longer per port, we need to change the allocation scheme such that MRP traps use a fixed number. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/soc/mscc/ocelot_vcap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/soc/mscc/ocelot_vcap.h b/include/soc/mscc/ocelot_vcap.h index 562bcd972132..14ada097db0b 100644 --- a/include/soc/mscc/ocelot_vcap.h +++ b/include/soc/mscc/ocelot_vcap.h @@ -21,8 +21,8 @@ #define OCELOT_VCAP_IS2_IPV4_EV_PTP_TRAP(ocelot) ((ocelot)->num_phys_ports + 3) #define OCELOT_VCAP_IS2_IPV6_GEN_PTP_TRAP(ocelot) ((ocelot)->num_phys_ports + 4) #define OCELOT_VCAP_IS2_IPV6_EV_PTP_TRAP(ocelot) ((ocelot)->num_phys_ports + 5) +#define OCELOT_VCAP_IS2_MRP_TRAP(ocelot) ((ocelot)->num_phys_ports + 6) #define OCELOT_VCAP_IS2_MRP_REDIRECT(ocelot, port) (port) -#define OCELOT_VCAP_IS2_MRP_TRAP(ocelot, port) ((ocelot)->num_phys_ports + (port)) /* ================================================================= * VCAP Common -- cgit v1.2.3 From 85ea0daabe5abc6add440f86565b7b9f6aff5535 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 16 Feb 2022 16:30:08 +0200 Subject: net: mscc: ocelot: avoid overlap in VCAP IS2 between PTP and MRP traps OCELOT_VCAP_IS2_TAG_8021Q_TXVLAN overlaps with OCELOT_VCAP_IS2_MRP_REDIRECT. To avoid this, make OCELOT_VCAP_IS2_MRP_REDIRECT take the cookie region from N to 2 * N - 1 (where N is ocelot->num_phys_ports). To avoid any risk that the singleton (not per port) VCAP IS2 filters overlap with per-port VCAP IS2 filters, we must ensure that the number of singleton filters is smaller than the number of physical ports. This is true right now, but may change in the future as switches with less ports get supported, or more singleton filters get added. So to be future-proof, let's move the singleton filters at the end of the range, where they won't overlap with anything to their right. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/soc/mscc/ocelot_vcap.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/soc/mscc/ocelot_vcap.h b/include/soc/mscc/ocelot_vcap.h index 14ada097db0b..ae0eec7f5dd2 100644 --- a/include/soc/mscc/ocelot_vcap.h +++ b/include/soc/mscc/ocelot_vcap.h @@ -15,14 +15,14 @@ #define OCELOT_VCAP_IS1_TAG_8021Q_TXVLAN(ocelot, port) (port) #define OCELOT_VCAP_IS1_TAG_8021Q_PTP_MMIO(ocelot) ((ocelot)->num_phys_ports) #define OCELOT_VCAP_IS2_TAG_8021Q_TXVLAN(ocelot, port) (port) -#define OCELOT_VCAP_IS2_TAG_8021Q_PTP_MMIO(ocelot) ((ocelot)->num_phys_ports) -#define OCELOT_VCAP_IS2_L2_PTP_TRAP(ocelot) ((ocelot)->num_phys_ports + 1) -#define OCELOT_VCAP_IS2_IPV4_GEN_PTP_TRAP(ocelot) ((ocelot)->num_phys_ports + 2) -#define OCELOT_VCAP_IS2_IPV4_EV_PTP_TRAP(ocelot) ((ocelot)->num_phys_ports + 3) -#define OCELOT_VCAP_IS2_IPV6_GEN_PTP_TRAP(ocelot) ((ocelot)->num_phys_ports + 4) -#define OCELOT_VCAP_IS2_IPV6_EV_PTP_TRAP(ocelot) ((ocelot)->num_phys_ports + 5) -#define OCELOT_VCAP_IS2_MRP_TRAP(ocelot) ((ocelot)->num_phys_ports + 6) -#define OCELOT_VCAP_IS2_MRP_REDIRECT(ocelot, port) (port) +#define OCELOT_VCAP_IS2_MRP_REDIRECT(ocelot, port) ((ocelot)->num_phys_ports + (port)) +#define OCELOT_VCAP_IS2_TAG_8021Q_PTP_MMIO(ocelot) ((ocelot)->num_phys_ports * 2) +#define OCELOT_VCAP_IS2_L2_PTP_TRAP(ocelot) ((ocelot)->num_phys_ports * 2 + 1) +#define OCELOT_VCAP_IS2_IPV4_GEN_PTP_TRAP(ocelot) ((ocelot)->num_phys_ports * 2 + 2) +#define OCELOT_VCAP_IS2_IPV4_EV_PTP_TRAP(ocelot) ((ocelot)->num_phys_ports * 2 + 3) +#define OCELOT_VCAP_IS2_IPV6_GEN_PTP_TRAP(ocelot) ((ocelot)->num_phys_ports * 2 + 4) +#define OCELOT_VCAP_IS2_IPV6_EV_PTP_TRAP(ocelot) ((ocelot)->num_phys_ports * 2 + 5) +#define OCELOT_VCAP_IS2_MRP_TRAP(ocelot) ((ocelot)->num_phys_ports * 2 + 6) /* ================================================================= * VCAP Common -- cgit v1.2.3 From e42bd4ed09aaf57949f199ae02d20a7108ccf73b Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 16 Feb 2022 16:30:10 +0200 Subject: net: mscc: ocelot: keep traps in a list When using the ocelot-8021q tagging protocol, the CPU port isn't configured as an NPI port, but is a regular port. So a "trap to CPU" operation is actually a "redirect" operation. So DSA needs to set up the trapping action one way or another, depending on the tagging protocol in use. To ease DSA's work of modifying the action, keep all currently installed traps in a list, so that DSA can live-patch them when the tagging protocol changes. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/soc/mscc/ocelot.h | 1 + include/soc/mscc/ocelot_vcap.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 2d7456c0e77d..78f56502bc09 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -689,6 +689,7 @@ struct ocelot { u8 base_mac[ETH_ALEN]; struct list_head vlans; + struct list_head traps; /* Switches like VSC9959 have flooding per traffic class */ int num_flooding_pgids; diff --git a/include/soc/mscc/ocelot_vcap.h b/include/soc/mscc/ocelot_vcap.h index ae0eec7f5dd2..69b3d880302d 100644 --- a/include/soc/mscc/ocelot_vcap.h +++ b/include/soc/mscc/ocelot_vcap.h @@ -682,6 +682,7 @@ struct ocelot_vcap_id { struct ocelot_vcap_filter { struct list_head list; + struct list_head trap_list; enum ocelot_vcap_filter_type type; int block_id; -- cgit v1.2.3 From 9d75b8818537fe64e6eae033765b9dc1b3107c15 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 16 Feb 2022 16:30:11 +0200 Subject: net: mscc: ocelot: annotate which traps need PTP timestamping The ocelot switch library does not need this information, but the felix DSA driver does. As a reminder, the VSC9959 switch in LS1028A doesn't have an IRQ line for packet extraction, so to be notified that a PTP packet needs to be dequeued, it receives that packet also over Ethernet, by setting up a packet trap. The Felix driver needs to install special kinds of traps for packets in need of RX timestamps, such that the packets are replicated both over Ethernet and over the CPU port module. But the Ocelot switch library sets up more than one trap for PTP event messages; it also traps PTP general messages, MRP control messages etc. Those packets don't need PTP timestamps, so there's no reason for the Felix driver to send them to the CPU port module. By knowing which traps need PTP timestamps, the Felix driver can adjust the traps installed using ocelot_trap_add() such that only those will actually get delivered to the CPU port module. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/soc/mscc/ocelot_vcap.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/soc/mscc/ocelot_vcap.h b/include/soc/mscc/ocelot_vcap.h index 69b3d880302d..50af64e2ca3c 100644 --- a/include/soc/mscc/ocelot_vcap.h +++ b/include/soc/mscc/ocelot_vcap.h @@ -695,6 +695,7 @@ struct ocelot_vcap_filter { struct ocelot_vcap_action action; struct ocelot_vcap_stats stats; /* For VCAP IS1 and IS2 */ + bool take_ts; unsigned long ingress_port_mask; /* For VCAP ES0 */ struct ocelot_vcap_port ingress_port; -- cgit v1.2.3 From 9934800436552d2a4af4aaca62d779b33d1f6a63 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 16 Feb 2022 16:30:13 +0200 Subject: net: dsa: felix: update destinations of existing traps with ocelot-8021q Historically, the felix DSA driver has installed special traps such that PTP over L2 works with the ocelot-8021q tagging protocol; commit 0a6f17c6ae21 ("net: dsa: tag_ocelot_8021q: add support for PTP timestamping") has the details. Then the ocelot switch library also gained more comprehensive support for PTP traps through commit 96ca08c05838 ("net: mscc: ocelot: set up traps for PTP packets"). Right now, PTP over L2 works using ocelot-8021q via the traps it has set for itself, but nothing else does. Consolidating the two code blocks would make ocelot-8021q gain support for PTP over L4 and tc-flower traps, and at the same time avoid some code and TCAM duplication. The traps are similar in intent, but different in execution, so some explanation is required. The traps set up by felix_setup_mmio_filtering() are VCAP IS1 filters, which have a PAG that chains them to a VCAP IS2 filter, and the IS2 is where the 'trap' action resides. The traps set up by ocelot_trap_add(), on the other hand, have a single filter, in VCAP IS2. The reason for chaining VCAP IS1 and IS2 in Felix was to ensure that the hardcoded traps take precedence and cannot be overridden by the Ocelot switch library. So in principle, the PTP traps needed for ocelot-8021q in the Felix driver can rely on ocelot_trap_add(), but the filters need to be patched to account for a quirk that LS1028A has: the quirk_no_xtr_irq described in commit 0a6f17c6ae21 ("net: dsa: tag_ocelot_8021q: add support for PTP timestamping"). Live-patching is done by iterating through the trap list every time we know it has been updated, and transforming a trap into a redirect + CPU copy if ocelot-8021q is in use. Making the DSA ocelot-8021q tagger work with the Ocelot traps means we can eliminate the dedicated OCELOT_VCAP_IS1_TAG_8021Q_PTP_MMIO and OCELOT_VCAP_IS2_TAG_8021Q_PTP_MMIO cookies. To minimize the patch delta, OCELOT_VCAP_IS2_MRP_TRAP takes the place of OCELOT_VCAP_IS2_TAG_8021Q_PTP_MMIO (the alternative would have been to left-shift all cookie numbers by 1). Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/soc/mscc/ocelot_vcap.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/soc/mscc/ocelot_vcap.h b/include/soc/mscc/ocelot_vcap.h index 50af64e2ca3c..deb2ad9eb0a5 100644 --- a/include/soc/mscc/ocelot_vcap.h +++ b/include/soc/mscc/ocelot_vcap.h @@ -13,16 +13,14 @@ */ #define OCELOT_VCAP_ES0_TAG_8021Q_RXVLAN(ocelot, port) (port) #define OCELOT_VCAP_IS1_TAG_8021Q_TXVLAN(ocelot, port) (port) -#define OCELOT_VCAP_IS1_TAG_8021Q_PTP_MMIO(ocelot) ((ocelot)->num_phys_ports) #define OCELOT_VCAP_IS2_TAG_8021Q_TXVLAN(ocelot, port) (port) #define OCELOT_VCAP_IS2_MRP_REDIRECT(ocelot, port) ((ocelot)->num_phys_ports + (port)) -#define OCELOT_VCAP_IS2_TAG_8021Q_PTP_MMIO(ocelot) ((ocelot)->num_phys_ports * 2) +#define OCELOT_VCAP_IS2_MRP_TRAP(ocelot) ((ocelot)->num_phys_ports * 2) #define OCELOT_VCAP_IS2_L2_PTP_TRAP(ocelot) ((ocelot)->num_phys_ports * 2 + 1) #define OCELOT_VCAP_IS2_IPV4_GEN_PTP_TRAP(ocelot) ((ocelot)->num_phys_ports * 2 + 2) #define OCELOT_VCAP_IS2_IPV4_EV_PTP_TRAP(ocelot) ((ocelot)->num_phys_ports * 2 + 3) #define OCELOT_VCAP_IS2_IPV6_GEN_PTP_TRAP(ocelot) ((ocelot)->num_phys_ports * 2 + 4) #define OCELOT_VCAP_IS2_IPV6_EV_PTP_TRAP(ocelot) ((ocelot)->num_phys_ports * 2 + 5) -#define OCELOT_VCAP_IS2_MRP_TRAP(ocelot) ((ocelot)->num_phys_ports * 2 + 6) /* ================================================================= * VCAP Common -- cgit v1.2.3 From d2b1d186ce2eac6b15d31db3e2750ee8e02bbe81 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 16 Feb 2022 21:37:26 +0200 Subject: net: dsa: delete unused exported symbols for ethtool PHY stats Introduced in commit cf963573039a ("net: dsa: Allow providing PHY statistics from CPU port"), it appears these were never used. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Link: https://lore.kernel.org/r/20220216193726.2926320-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 1456313a1faa..c8626dec970c 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -1257,9 +1257,6 @@ static inline bool dsa_slave_dev_check(const struct net_device *dev) #endif netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev); -int dsa_port_get_phy_strings(struct dsa_port *dp, uint8_t *data); -int dsa_port_get_ethtool_phy_stats(struct dsa_port *dp, uint64_t *data); -int dsa_port_get_phy_sset_count(struct dsa_port *dp); void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up); struct dsa_tag_driver { -- cgit v1.2.3 From 091296d30917f6e63a9402974f546412dfb2a8e6 Mon Sep 17 00:00:00 2001 From: Miri Korenblit Date: Sat, 5 Feb 2022 11:21:36 +0200 Subject: iwlwifi: mvm: refactor setting PPE thresholds in STA_HE_CTXT_CMD We are setting the PPE Thresholds in STA_HE_CTXT_CMD according to HE PHY Capabilities IE. As EHT is introduced, we will have to set this thresholds according to EHT PHY Capabilities IE if we're in an EHT connection. Some parts of the code can be used for both HE and EHT. Put this parts in functions which will be used in the patch which adds support for EHT PPE. Signed-off-by: Miri Korenblit Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20220205112029.48a508dfffef.If392e44d88f96ebed7fadf827e327194d4bd97b1@changeid Signed-off-by: Luca Coelho --- include/linux/ieee80211.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index a4143466cb32..75d40acb60c1 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2421,6 +2421,7 @@ ieee80211_he_mcs_nss_size(const struct ieee80211_he_cap_elem *he_cap) #define IEEE80211_PPE_THRES_RU_INDEX_BITMASK_MASK 0x78 #define IEEE80211_PPE_THRES_RU_INDEX_BITMASK_POS (3) #define IEEE80211_PPE_THRES_INFO_PPET_SIZE (3) +#define IEEE80211_HE_PPE_THRES_INFO_HEADER_SIZE (7) /* * Calculate 802.11ax HE capabilities IE PPE field size -- cgit v1.2.3 From 8467fadc115cb08bb1cbc7885cb7b7ef1871cae4 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Thu, 17 Feb 2022 10:07:55 +0200 Subject: net: gro: Fix a 'directive in macro's argument list' sparse warning Following the cited commit, sparse started complaining about: ../include/net/gro.h:58:1: warning: directive in macro's argument list ../include/net/gro.h:59:1: warning: directive in macro's argument list Fix that by moving the defines out of the struct_group() macro. Fixes: de5a1f3ce4c8 ("net: gro: minor optimization for dev_gro_receive()") Reviewed-by: Maxim Mikityanskiy Signed-off-by: Gal Pressman Acked-by: Alexander Lobakin Acked-by: Paolo Abeni Signed-off-by: David S. Miller --- include/net/gro.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/gro.h b/include/net/gro.h index a765fedda5c4..867656b0739c 100644 --- a/include/net/gro.h +++ b/include/net/gro.h @@ -35,6 +35,9 @@ struct napi_gro_cb { /* jiffies when first packet was created/queued */ unsigned long age; +/* Used in napi_gro_cb::free */ +#define NAPI_GRO_FREE 1 +#define NAPI_GRO_FREE_STOLEN_HEAD 2 /* portion of the cb set to zero at every gro iteration */ struct_group(zeroed, @@ -55,8 +58,6 @@ struct napi_gro_cb { /* Free the skb? */ u8 free:2; -#define NAPI_GRO_FREE 1 -#define NAPI_GRO_FREE_STOLEN_HEAD 2 /* Used in foo-over-udp, set in udp[46]_gro_receive */ u8 is_ipv6:1; -- cgit v1.2.3 From bde018222c6b084ac32933a9f933581dd83da18e Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 17 Feb 2022 18:30:35 +0000 Subject: net: dsa: add support for phylink mac_select_pcs() Add DSA support for the phylink mac_select_pcs() method so DSA drivers can return provide phylink with the appropriate PCS for the PHY interface mode. Signed-off-by: Russell King (Oracle) Signed-off-by: David S. Miller --- include/net/dsa.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index c8626dec970c..bc6eef6af810 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -788,6 +788,9 @@ struct dsa_switch_ops { void (*phylink_validate)(struct dsa_switch *ds, int port, unsigned long *supported, struct phylink_link_state *state); + struct phylink_pcs *(*phylink_mac_select_pcs)(struct dsa_switch *ds, + int port, + phy_interface_t iface); int (*phylink_mac_link_state)(struct dsa_switch *ds, int port, struct phylink_link_state *state); void (*phylink_mac_config)(struct dsa_switch *ds, int port, -- cgit v1.2.3 From 47f0bd5032106469827cf56c8b45bb9101112105 Mon Sep 17 00:00:00 2001 From: Jacques de Laval Date: Thu, 17 Feb 2022 16:02:02 +0100 Subject: net: Add new protocol attribute to IP addresses This patch adds a new protocol attribute to IPv4 and IPv6 addresses. Inspiration was taken from the protocol attribute of routes. User space applications like iproute2 can set/get the protocol with the Netlink API. The attribute is stored as an 8-bit unsigned integer. The protocol attribute is set by kernel for these categories: - IPv4 and IPv6 loopback addresses - IPv6 addresses generated from router announcements - IPv6 link local addresses User space may pass custom protocols, not defined by the kernel. Grouping addresses on their origin is useful in scenarios where you want to distinguish between addresses based on who added them, e.g. kernel vs. user space. Tagging addresses with a string label is an existing feature that could be used as a solution. Unfortunately the max length of a label is 15 characters, and for compatibility reasons the label must be prefixed with the name of the device followed by a colon. Since device names also have a max length of 15 characters, only -1 characters is guaranteed to be available for any origin tag, which is not that much. A reference implementation of user space setting and getting protocols is available for iproute2: https://github.com/westermo/iproute2/commit/9a6ea18bd79f47f293e5edc7780f315ea42ff540 Signed-off-by: Jacques de Laval Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20220217150202.80802-1-Jacques.De.Laval@westermo.com Signed-off-by: Jakub Kicinski --- include/linux/inetdevice.h | 1 + include/net/addrconf.h | 2 ++ include/net/if_inet6.h | 2 ++ include/uapi/linux/if_addr.h | 9 ++++++++- 4 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index 674aeead6260..ead323243e7b 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -150,6 +150,7 @@ struct in_ifaddr { __be32 ifa_broadcast; unsigned char ifa_scope; unsigned char ifa_prefixlen; + unsigned char ifa_proto; __u32 ifa_flags; char ifa_label[IFNAMSIZ]; diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 59940e230b78..f7506f08e505 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -64,6 +64,8 @@ struct ifa6_config { const struct in6_addr *pfx; unsigned int plen; + u8 ifa_proto; + const struct in6_addr *peer_pfx; u32 rt_priority; diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h index f026cf08a8e8..4cfdef6ca4f6 100644 --- a/include/net/if_inet6.h +++ b/include/net/if_inet6.h @@ -71,6 +71,8 @@ struct inet6_ifaddr { bool tokenized; + u8 ifa_proto; + struct rcu_head rcu; struct in6_addr peer_addr; }; diff --git a/include/uapi/linux/if_addr.h b/include/uapi/linux/if_addr.h index dfcf3ce0097f..1c392dd95a5e 100644 --- a/include/uapi/linux/if_addr.h +++ b/include/uapi/linux/if_addr.h @@ -33,8 +33,9 @@ enum { IFA_CACHEINFO, IFA_MULTICAST, IFA_FLAGS, - IFA_RT_PRIORITY, /* u32, priority/metric for prefix route */ + IFA_RT_PRIORITY, /* u32, priority/metric for prefix route */ IFA_TARGET_NETNSID, + IFA_PROTO, /* u8, address protocol */ __IFA_MAX, }; @@ -69,4 +70,10 @@ struct ifa_cacheinfo { #define IFA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct ifaddrmsg)) #endif +/* ifa_proto */ +#define IFAPROT_UNSPEC 0 +#define IFAPROT_KERNEL_LO 1 /* loopback */ +#define IFAPROT_KERNEL_RA 2 /* set by kernel from router announcement */ +#define IFAPROT_KERNEL_LL 3 /* link-local set by kernel */ + #endif -- cgit v1.2.3 From cb196b725936f6b776ad1d073f66fbe92aa798fa Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Fri, 18 Feb 2022 12:25:53 +0800 Subject: mctp: replace mctp_address_ok with more fine-grained helpers Currently, we have mctp_address_ok(), which checks if an EID is in the "valid" range of 8-254 inclusive. However, 0 and 255 may also be valid addresses, depending on context. 0 is the NULL EID, which may be set when physical addressing is used. 255 is valid as a destination address for broadcasts. This change renames mctp_address_ok to mctp_address_unicast, and adds similar helpers for broadcast and null EIDs, which will be used in an upcoming commit. Signed-off-by: Jeremy Kerr Signed-off-by: Jakub Kicinski --- include/net/mctp.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/mctp.h b/include/net/mctp.h index e80a4baf8379..d37268fe6825 100644 --- a/include/net/mctp.h +++ b/include/net/mctp.h @@ -40,11 +40,21 @@ struct mctp_hdr { #define MCTP_INITIAL_DEFAULT_NET 1 -static inline bool mctp_address_ok(mctp_eid_t eid) +static inline bool mctp_address_unicast(mctp_eid_t eid) { return eid >= 8 && eid < 255; } +static inline bool mctp_address_broadcast(mctp_eid_t eid) +{ + return eid == 255; +} + +static inline bool mctp_address_null(mctp_eid_t eid) +{ + return eid == 0; +} + static inline bool mctp_address_matches(mctp_eid_t match, mctp_eid_t eid) { return match == eid || match == MCTP_ADDR_ANY; -- cgit v1.2.3 From 4b340a5a726dafba15b366c4009aa0a8f77631ac Mon Sep 17 00:00:00 2001 From: Mobashshera Rasool Date: Thu, 17 Feb 2022 07:46:40 +0000 Subject: net: ip6mr: add support for passing full packet on wrong mif This patch adds support for MRT6MSG_WRMIFWHOLE which is used to pass full packet and real vif id when the incoming interface is wrong. While the RP and FHR are setting up state we need to be sending the registers encapsulated with all the data inside otherwise we lose it. The RP then decapsulates it and forwards it to the interested parties. Currently with WRONGMIF we can only be sending empty register packets and will lose that data. This behaviour can be enabled by using MRT_PIM with val == MRT6MSG_WRMIFWHOLE. This doesn't prevent MRT6MSG_WRONGMIF from happening, it happens in addition to it, also it is controlled by the same throttling parameters as WRONGMIF (i.e. 1 packet per 3 seconds currently). Both messages are generated to keep backwards compatibily and avoid breaking someone who was enabling MRT_PIM with val == 4, since any positive val is accepted and treated the same. Signed-off-by: Mobashshera Rasool Signed-off-by: David S. Miller --- include/uapi/linux/mroute6.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/mroute6.h b/include/uapi/linux/mroute6.h index a1fd6173e2db..1d90c21a6251 100644 --- a/include/uapi/linux/mroute6.h +++ b/include/uapi/linux/mroute6.h @@ -134,6 +134,7 @@ struct mrt6msg { #define MRT6MSG_NOCACHE 1 #define MRT6MSG_WRONGMIF 2 #define MRT6MSG_WHOLEPKT 3 /* used for use level encap */ +#define MRT6MSG_WRMIFWHOLE 4 /* For PIM Register and assert processing */ __u8 im6_mbz; /* must be zero */ __u8 im6_msgtype; /* what type of message */ __u16 im6_mif; /* mif rec'd on */ -- cgit v1.2.3 From ccfbf44d4c7fb7c64cf79b3f2a5ae522e5165878 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Sat, 19 Feb 2022 11:47:17 +0000 Subject: net: dsa: remove pcs_poll With drivers converted over to using phylink PCS, there is no need for the struct dsa_switch member "pcs_poll" to exist anymore - there is a flag in the struct phylink_pcs which indicates whether this PCS needs to be polled which supersedes this. Signed-off-by: Russell King (Oracle) Signed-off-by: David S. Miller --- include/net/dsa.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index bc6eef6af810..f13de2d8aef3 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -391,11 +391,6 @@ struct dsa_switch { */ u32 vlan_filtering:1; - /* MAC PCS does not provide link state change interrupt, and requires - * polling. Flag passed on to PHYLINK. - */ - u32 pcs_poll:1; - /* For switches that only have the MRU configurable. To ensure the * configured MTU is not exceeded, normalization of MRU on all bridged * interfaces is needed. -- cgit v1.2.3 From 64b4a0f8b51b20e0c9dbff7748365994364d5f01 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Sat, 19 Feb 2022 11:47:22 +0000 Subject: net: phylink: remove phylink_config's pcs_poll phylink_config's pcs_poll is no longer used, let's get rid of it. Signed-off-by: Russell King (Oracle) Signed-off-by: David S. Miller --- include/linux/phylink.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index cca149f78d35..9ef9b7047f19 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -86,7 +86,6 @@ enum phylink_op_type { * @type: operation type of PHYLINK instance * @legacy_pre_march2020: driver has not been updated for March 2020 updates * (See commit 7cceb599d15d ("net: phylink: avoid mac_config calls") - * @pcs_poll: MAC PCS cannot provide link change interrupt * @poll_fixed_state: if true, starts link_poll, * if MAC link is at %MLO_AN_FIXED mode. * @ovr_an_inband: if true, override PCS to MLO_AN_INBAND @@ -100,7 +99,6 @@ struct phylink_config { struct device *dev; enum phylink_op_type type; bool legacy_pre_march2020; - bool pcs_poll; bool poll_fixed_state; bool ovr_an_inband; void (*get_fixed_state)(struct phylink_config *config, -- cgit v1.2.3 From 643b622b51f1f0015e0a80f90b4ef9032e6ddb1b Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sun, 20 Feb 2022 15:06:32 +0800 Subject: net: tcp: add skb drop reasons to tcp_v{4,6}_inbound_md5_hash() Pass the address of drop reason to tcp_v4_inbound_md5_hash() and tcp_v6_inbound_md5_hash() to store the reasons for skb drops when this function fails. Therefore, the drop reason can be passed to kfree_skb_reason() when the skb needs to be freed. Following drop reasons are added: SKB_DROP_REASON_TCP_MD5NOTFOUND SKB_DROP_REASON_TCP_MD5UNEXPECTED SKB_DROP_REASON_TCP_MD5FAILURE SKB_DROP_REASON_TCP_MD5* above correspond to LINUX_MIB_TCPMD5* Reviewed-by: Mengen Sun Reviewed-by: Hao Peng Signed-off-by: Menglong Dong Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/skbuff.h | 12 ++++++++++++ include/trace/events/skb.h | 4 ++++ 2 files changed, 16 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a5adbf6b51e8..46678eb587ff 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -346,6 +346,18 @@ enum skb_drop_reason { * udp packet drop out of * udp_memory_allocated. */ + SKB_DROP_REASON_TCP_MD5NOTFOUND, /* no MD5 hash and one + * expected, corresponding + * to LINUX_MIB_TCPMD5NOTFOUND + */ + SKB_DROP_REASON_TCP_MD5UNEXPECTED, /* MD5 hash and we're not + * expecting one, corresponding + * to LINUX_MIB_TCPMD5UNEXPECTED + */ + SKB_DROP_REASON_TCP_MD5FAILURE, /* MD5 hash and its wrong, + * corresponding to + * LINUX_MIB_TCPMD5FAILURE + */ SKB_DROP_REASON_MAX, }; diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index cfcfd26399f7..46c06b0be850 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -27,6 +27,10 @@ EM(SKB_DROP_REASON_IP_NOPROTO, IP_NOPROTO) \ EM(SKB_DROP_REASON_SOCKET_RCVBUFF, SOCKET_RCVBUFF) \ EM(SKB_DROP_REASON_PROTO_MEM, PROTO_MEM) \ + EM(SKB_DROP_REASON_TCP_MD5NOTFOUND, TCP_MD5NOTFOUND) \ + EM(SKB_DROP_REASON_TCP_MD5UNEXPECTED, \ + TCP_MD5UNEXPECTED) \ + EM(SKB_DROP_REASON_TCP_MD5FAILURE, TCP_MD5FAILURE) \ EMe(SKB_DROP_REASON_MAX, MAX) #undef EM -- cgit v1.2.3 From 7a26dc9e7b43f5a24c4b843713e728582adf1c38 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sun, 20 Feb 2022 15:06:33 +0800 Subject: net: tcp: add skb drop reasons to tcp_add_backlog() Pass the address of drop_reason to tcp_add_backlog() to store the reasons for skb drops when fails. Following drop reasons are introduced: SKB_DROP_REASON_SOCKET_BACKLOG Reviewed-by: Mengen Sun Reviewed-by: Hao Peng Signed-off-by: Menglong Dong Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/skbuff.h | 4 ++++ include/net/tcp.h | 3 ++- include/trace/events/skb.h | 1 + 3 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 46678eb587ff..f7f33c79945b 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -358,6 +358,10 @@ enum skb_drop_reason { * corresponding to * LINUX_MIB_TCPMD5FAILURE */ + SKB_DROP_REASON_SOCKET_BACKLOG, /* failed to add skb to socket + * backlog (see + * LINUX_MIB_TCPBACKLOGDROP) + */ SKB_DROP_REASON_MAX, }; diff --git a/include/net/tcp.h b/include/net/tcp.h index eff2487d972d..04f4650e0ff0 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1367,7 +1367,8 @@ static inline bool tcp_checksum_complete(struct sk_buff *skb) __skb_checksum_complete(skb); } -bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb); +bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, + enum skb_drop_reason *reason); #ifdef CONFIG_INET void __sk_defer_free_flush(struct sock *sk); diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index 46c06b0be850..bfccd77e9071 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -31,6 +31,7 @@ EM(SKB_DROP_REASON_TCP_MD5UNEXPECTED, \ TCP_MD5UNEXPECTED) \ EM(SKB_DROP_REASON_TCP_MD5FAILURE, TCP_MD5FAILURE) \ + EM(SKB_DROP_REASON_SOCKET_BACKLOG, SOCKET_BACKLOG) \ EMe(SKB_DROP_REASON_MAX, MAX) #undef EM -- cgit v1.2.3 From 2a968ef60e1fac4e694d9f60ce19a3b66b40e8c3 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sun, 20 Feb 2022 15:06:35 +0800 Subject: net: tcp: use tcp_drop_reason() for tcp_rcv_established() Replace tcp_drop() used in tcp_rcv_established() with tcp_drop_reason(). Following drop reasons are added: SKB_DROP_REASON_TCP_FLAGS Reviewed-by: Mengen Sun Reviewed-by: Hao Peng Signed-off-by: Menglong Dong Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + include/trace/events/skb.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f7f33c79945b..671db9f49efe 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -362,6 +362,7 @@ enum skb_drop_reason { * backlog (see * LINUX_MIB_TCPBACKLOGDROP) */ + SKB_DROP_REASON_TCP_FLAGS, /* TCP flags invalid */ SKB_DROP_REASON_MAX, }; diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index bfccd77e9071..d332e7313a61 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -32,6 +32,7 @@ TCP_MD5UNEXPECTED) \ EM(SKB_DROP_REASON_TCP_MD5FAILURE, TCP_MD5FAILURE) \ EM(SKB_DROP_REASON_SOCKET_BACKLOG, SOCKET_BACKLOG) \ + EM(SKB_DROP_REASON_TCP_FLAGS, TCP_FLAGS) \ EMe(SKB_DROP_REASON_MAX, MAX) #undef EM -- cgit v1.2.3 From a7ec381049c0d1f03e342063d75f5c3b314d0ec2 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sun, 20 Feb 2022 15:06:36 +0800 Subject: net: tcp: use tcp_drop_reason() for tcp_data_queue() Replace tcp_drop() used in tcp_data_queue() with tcp_drop_reason(). Following drop reasons are introduced: SKB_DROP_REASON_TCP_ZEROWINDOW SKB_DROP_REASON_TCP_OLD_DATA SKB_DROP_REASON_TCP_OVERWINDOW SKB_DROP_REASON_TCP_OLD_DATA is used for the case that end_seq of skb less than the left edges of receive window. (Maybe there is a better name?) Reviewed-by: Mengen Sun Reviewed-by: Hao Peng Signed-off-by: Menglong Dong Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/skbuff.h | 13 +++++++++++++ include/trace/events/skb.h | 3 +++ 2 files changed, 16 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 671db9f49efe..554ef2c848ee 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -363,6 +363,19 @@ enum skb_drop_reason { * LINUX_MIB_TCPBACKLOGDROP) */ SKB_DROP_REASON_TCP_FLAGS, /* TCP flags invalid */ + SKB_DROP_REASON_TCP_ZEROWINDOW, /* TCP receive window size is zero, + * see LINUX_MIB_TCPZEROWINDOWDROP + */ + SKB_DROP_REASON_TCP_OLD_DATA, /* the TCP data reveived is already + * received before (spurious retrans + * may happened), see + * LINUX_MIB_DELAYEDACKLOST + */ + SKB_DROP_REASON_TCP_OVERWINDOW, /* the TCP data is out of window, + * the seq of the first byte exceed + * the right edges of receive + * window + */ SKB_DROP_REASON_MAX, }; diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index d332e7313a61..cc1c8f7eaf72 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -33,6 +33,9 @@ EM(SKB_DROP_REASON_TCP_MD5FAILURE, TCP_MD5FAILURE) \ EM(SKB_DROP_REASON_SOCKET_BACKLOG, SOCKET_BACKLOG) \ EM(SKB_DROP_REASON_TCP_FLAGS, TCP_FLAGS) \ + EM(SKB_DROP_REASON_TCP_ZEROWINDOW, TCP_ZEROWINDOW) \ + EM(SKB_DROP_REASON_TCP_OLD_DATA, TCP_OLD_DATA) \ + EM(SKB_DROP_REASON_TCP_OVERWINDOW, TCP_OVERWINDOW) \ EMe(SKB_DROP_REASON_MAX, MAX) #undef EM -- cgit v1.2.3 From d25e481be0c519d1a458b14191dc8c2a8bb3e24a Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sun, 20 Feb 2022 15:06:37 +0800 Subject: net: tcp: use tcp_drop_reason() for tcp_data_queue_ofo() Replace tcp_drop() used in tcp_data_queue_ofo with tcp_drop_reason(). Following drop reasons are introduced: SKB_DROP_REASON_TCP_OFOMERGE Reviewed-by: Mengen Sun Reviewed-by: Hao Peng Signed-off-by: Menglong Dong Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/skbuff.h | 4 ++++ include/trace/events/skb.h | 1 + 2 files changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 554ef2c848ee..a3e90efe6586 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -376,6 +376,10 @@ enum skb_drop_reason { * the right edges of receive * window */ + SKB_DROP_REASON_TCP_OFOMERGE, /* the data of skb is already in + * the ofo queue, corresponding to + * LINUX_MIB_TCPOFOMERGE + */ SKB_DROP_REASON_MAX, }; diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index cc1c8f7eaf72..2ab7193313aa 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -36,6 +36,7 @@ EM(SKB_DROP_REASON_TCP_ZEROWINDOW, TCP_ZEROWINDOW) \ EM(SKB_DROP_REASON_TCP_OLD_DATA, TCP_OLD_DATA) \ EM(SKB_DROP_REASON_TCP_OVERWINDOW, TCP_OVERWINDOW) \ + EM(SKB_DROP_REASON_TCP_OFOMERGE, TCP_OFOMERGE) \ EMe(SKB_DROP_REASON_MAX, MAX) #undef EM -- cgit v1.2.3 From 0c51e12e218f20b7d976158fdc18019627326f7a Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sat, 19 Feb 2022 17:45:19 +0200 Subject: ipv4: Invalidate neighbour for broadcast address upon address addition In case user space sends a packet destined to a broadcast address when a matching broadcast route is not configured, the kernel will create a unicast neighbour entry that will never be resolved [1]. When the broadcast route is configured, the unicast neighbour entry will not be invalidated and continue to linger, resulting in packets being dropped. Solve this by invalidating unresolved neighbour entries for broadcast addresses after routes for these addresses are internally configured by the kernel. This allows the kernel to create a broadcast neighbour entry following the next route lookup. Another possible solution that is more generic but also more complex is to have the ARP code register a listener to the FIB notification chain and invalidate matching neighbour entries upon the addition of broadcast routes. It is also possible to wave off the issue as a user space problem, but it seems a bit excessive to expect user space to be that intimately familiar with the inner workings of the FIB/neighbour kernel code. [1] https://lore.kernel.org/netdev/55a04a8f-56f3-f73c-2aea-2195923f09d1@huawei.com/ Reported-by: Wang Hai Signed-off-by: Ido Schimmel Tested-by: Wang Hai Signed-off-by: David S. Miller --- include/net/arp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/arp.h b/include/net/arp.h index 031374ac2f22..d7ef4ec71dfe 100644 --- a/include/net/arp.h +++ b/include/net/arp.h @@ -65,6 +65,7 @@ void arp_send(int type, int ptype, __be32 dest_ip, const unsigned char *src_hw, const unsigned char *th); int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir); void arp_ifdown(struct net_device *dev); +int arp_invalidate(struct net_device *dev, __be32 ip, bool force); struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, -- cgit v1.2.3 From 696c65444120cfe98ae704e86a59df8666fabf1d Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Mon, 21 Feb 2022 13:54:53 +0800 Subject: ipv6: separate ndisc_ns_create() from ndisc_send_ns() This patch separate NS message allocation steps from ndisc_send_ns(), so it could be used in other places, like bonding, to allocate and send IPv6 NS message. Also export ndisc_send_skb() and ndisc_ns_create() for later bonding usage. Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- include/net/ndisc.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/ndisc.h b/include/net/ndisc.h index 53cb8de0e589..aac3a42de432 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -447,10 +447,15 @@ void ndisc_cleanup(void); int ndisc_rcv(struct sk_buff *skb); +struct sk_buff *ndisc_ns_create(struct net_device *dev, const struct in6_addr *solicit, + const struct in6_addr *saddr, u64 nonce); void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit, const struct in6_addr *daddr, const struct in6_addr *saddr, u64 nonce); +void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr, + const struct in6_addr *saddr); + void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr, const struct in6_addr *daddr); void ndisc_send_na(struct net_device *dev, const struct in6_addr *daddr, -- cgit v1.2.3 From 841e95641e4cb695586b036f330d6d272fcd7173 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Mon, 21 Feb 2022 13:54:55 +0800 Subject: bonding: add extra field for bond_opt_value Adding an extra storage field for bond_opt_value so we can set large bytes of data for bonding options in future, e.g. IPv6 address. Define a new call bond_opt_initextra(). Also change the checking order of __bond_opt_init() and check values first. Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- include/net/bond_options.h | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/net/bond_options.h b/include/net/bond_options.h index dd75c071f67e..286b29c6c451 100644 --- a/include/net/bond_options.h +++ b/include/net/bond_options.h @@ -71,14 +71,18 @@ enum { /* This structure is used for storing option values and for passing option * values when changing an option. The logic when used as an arg is as follows: - * - if string != NULL -> parse it, if the opt is RAW type then return it, else - * return the parse result - * - if string == NULL -> parse value + * - if value != ULLONG_MAX -> parse value + * - if string != NULL -> parse string + * - if the opt is RAW data and length less than maxlen, + * copy the data to extra storage */ + +#define BOND_OPT_EXTRA_MAXLEN 16 struct bond_opt_value { char *string; u64 value; u32 flags; + char extra[BOND_OPT_EXTRA_MAXLEN]; }; struct bonding; @@ -118,17 +122,22 @@ const struct bond_opt_value *bond_opt_get_val(unsigned int option, u64 val); * When value is ULLONG_MAX then string will be used. */ static inline void __bond_opt_init(struct bond_opt_value *optval, - char *string, u64 value) + char *string, u64 value, + void *extra, size_t extra_len) { memset(optval, 0, sizeof(*optval)); optval->value = ULLONG_MAX; - if (value == ULLONG_MAX) - optval->string = string; - else + if (value != ULLONG_MAX) optval->value = value; + else if (string) + optval->string = string; + else if (extra_len <= BOND_OPT_EXTRA_MAXLEN) + memcpy(optval->extra, extra, extra_len); } -#define bond_opt_initval(optval, value) __bond_opt_init(optval, NULL, value) -#define bond_opt_initstr(optval, str) __bond_opt_init(optval, str, ULLONG_MAX) +#define bond_opt_initval(optval, value) __bond_opt_init(optval, NULL, value, NULL, 0) +#define bond_opt_initstr(optval, str) __bond_opt_init(optval, str, ULLONG_MAX, NULL, 0) +#define bond_opt_initextra(optval, extra, extra_len) \ + __bond_opt_init(optval, NULL, ULLONG_MAX, extra, extra_len) void bond_option_arp_ip_targets_clear(struct bonding *bond); -- cgit v1.2.3 From 4e24be018eb9dbcefa4b01c07e298b147dc1a4d7 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Mon, 21 Feb 2022 13:54:56 +0800 Subject: bonding: add new parameter ns_targets Add a new bonding parameter ns_targets to store IPv6 address. Add required bond_ns_send/rcv functions first before adding IPv6 address option setting. Add two functions bond_send/rcv_validate so we can send/recv ARP and NS at the same time. Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- include/net/bonding.h | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/bonding.h b/include/net/bonding.h index 7dead855a72d..f3b986f6b6e4 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -29,8 +29,11 @@ #include #include #include +#include +#include #define BOND_MAX_ARP_TARGETS 16 +#define BOND_MAX_NS_TARGETS BOND_MAX_ARP_TARGETS #define BOND_DEFAULT_MIIMON 100 @@ -146,6 +149,7 @@ struct bond_params { struct reciprocal_value reciprocal_packets_per_slave; u16 ad_actor_sys_prio; u16 ad_user_port_key; + struct in6_addr ns_targets[BOND_MAX_NS_TARGETS]; /* 2 bytes of padding : see ether_addr_equal_64bits() */ u8 ad_actor_system[ETH_ALEN + 2]; @@ -628,7 +632,7 @@ struct bond_net { struct class_attribute class_attr_bonding_masters; }; -int bond_arp_rcv(const struct sk_buff *skb, struct bonding *bond, struct slave *slave); +int bond_rcv_validate(const struct sk_buff *skb, struct bonding *bond, struct slave *slave); netdev_tx_t bond_dev_queue_xmit(struct bonding *bond, struct sk_buff *skb, struct net_device *slave_dev); int bond_create(struct net *net, const char *name); int bond_create_sysfs(struct bond_net *net); @@ -735,6 +739,19 @@ static inline int bond_get_targets_ip(__be32 *targets, __be32 ip) return -1; } +static inline int bond_get_targets_ip6(struct in6_addr *targets, struct in6_addr *ip) +{ + int i; + + for (i = 0; i < BOND_MAX_NS_TARGETS; i++) + if (ipv6_addr_equal(&targets[i], ip)) + return i; + else if (ipv6_addr_any(&targets[i])) + break; + + return -1; +} + /* exported from bond_main.c */ extern unsigned int bond_net_id; -- cgit v1.2.3 From 129e3c1bab24d27d0fa6e505a472345a92d7a2b0 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Mon, 21 Feb 2022 13:54:57 +0800 Subject: bonding: add new option ns_ip6_target This patch add a new bonding option ns_ip6_target, which correspond to the arp_ip_target. With this we set IPv6 targets and send IPv6 NS request to determine the health of the link. For other related options like the validation, we still use arp_validate, and will change to ns_validate later. Note: the sysfs configuration support was removed based on https://lore.kernel.org/netdev/8863.1645071997@famine Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- include/net/bond_options.h | 4 ++++ include/net/bonding.h | 7 +++++++ include/uapi/linux/if_link.h | 1 + 3 files changed, 12 insertions(+) (limited to 'include') diff --git a/include/net/bond_options.h b/include/net/bond_options.h index 286b29c6c451..61b49063791c 100644 --- a/include/net/bond_options.h +++ b/include/net/bond_options.h @@ -66,6 +66,7 @@ enum { BOND_OPT_PEER_NOTIF_DELAY, BOND_OPT_LACP_ACTIVE, BOND_OPT_MISSED_MAX, + BOND_OPT_NS_TARGETS, BOND_OPT_LAST }; @@ -140,5 +141,8 @@ static inline void __bond_opt_init(struct bond_opt_value *optval, __bond_opt_init(optval, NULL, ULLONG_MAX, extra, extra_len) void bond_option_arp_ip_targets_clear(struct bonding *bond); +#if IS_ENABLED(CONFIG_IPV6) +void bond_option_ns_ip6_targets_clear(struct bonding *bond); +#endif #endif /* _NET_BOND_OPTIONS_H */ diff --git a/include/net/bonding.h b/include/net/bonding.h index f3b986f6b6e4..d0dfe727e0b1 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -503,6 +503,13 @@ static inline int bond_is_ip_target_ok(__be32 addr) return !ipv4_is_lbcast(addr) && !ipv4_is_zeronet(addr); } +static inline int bond_is_ip6_target_ok(struct in6_addr *addr) +{ + return !ipv6_addr_any(addr) && + !ipv6_addr_loopback(addr) && + !ipv6_addr_is_multicast(addr); +} + /* Get the oldest arp which we've received on this slave for bond's * arp_targets. */ diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 6218f93f5c1a..e1ba2d51b717 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -860,6 +860,7 @@ enum { IFLA_BOND_PEER_NOTIF_DELAY, IFLA_BOND_AD_LACP_ACTIVE, IFLA_BOND_MISSED_MAX, + IFLA_BOND_NS_IP6_TARGET, __IFLA_BOND_MAX, }; -- cgit v1.2.3 From 763087dab97547230a6807c865a6a5ae53a59247 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 21 Feb 2022 19:21:12 -0800 Subject: net: add skb_set_end_offset() helper We have multiple places where this helper is convenient, and plan using it in the following patch. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a3e90efe6586..115be7f73487 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1536,6 +1536,11 @@ static inline unsigned int skb_end_offset(const struct sk_buff *skb) { return skb->end; } + +static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset) +{ + skb->end = offset; +} #else static inline unsigned char *skb_end_pointer(const struct sk_buff *skb) { @@ -1546,6 +1551,11 @@ static inline unsigned int skb_end_offset(const struct sk_buff *skb) { return skb->end - skb->head; } + +static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset) +{ + skb->end = skb->head + offset; +} #endif /* Internal */ -- cgit v1.2.3 From 2b88cba55883eaafbc9b7cbff0b2c7cdba71ed01 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 21 Feb 2022 19:21:13 -0800 Subject: net: preserve skb_end_offset() in skb_unclone_keeptruesize() syzbot found another way to trigger the infamous WARN_ON_ONCE(delta < len) in skb_try_coalesce() [1] I was able to root cause the issue to kfence. When kfence is in action, the following assertion is no longer true: int size = xxxx; void *ptr1 = kmalloc(size, gfp); void *ptr2 = kmalloc(size, gfp); if (ptr1 && ptr2) ASSERT(ksize(ptr1) == ksize(ptr2)); We attempted to fix these issues in the blamed commits, but forgot that TCP was possibly shifting data after skb_unclone_keeptruesize() has been used, notably from tcp_retrans_try_collapse(). So we not only need to keep same skb->truesize value, we also need to make sure TCP wont fill new tailroom that pskb_expand_head() was able to get from a addr = kmalloc(...) followed by ksize(addr) Split skb_unclone_keeptruesize() into two parts: 1) Inline skb_unclone_keeptruesize() for the common case, when skb is not cloned. 2) Out of line __skb_unclone_keeptruesize() for the 'slow path'. WARNING: CPU: 1 PID: 6490 at net/core/skbuff.c:5295 skb_try_coalesce+0x1235/0x1560 net/core/skbuff.c:5295 Modules linked in: CPU: 1 PID: 6490 Comm: syz-executor161 Not tainted 5.17.0-rc4-syzkaller-00229-g4f12b742eb2b #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:skb_try_coalesce+0x1235/0x1560 net/core/skbuff.c:5295 Code: bf 01 00 00 00 0f b7 c0 89 c6 89 44 24 20 e8 62 24 4e fa 8b 44 24 20 83 e8 01 0f 85 e5 f0 ff ff e9 87 f4 ff ff e8 cb 20 4e fa <0f> 0b e9 06 f9 ff ff e8 af b2 95 fa e9 69 f0 ff ff e8 95 b2 95 fa RSP: 0018:ffffc900063af268 EFLAGS: 00010293 RAX: 0000000000000000 RBX: 00000000ffffffd5 RCX: 0000000000000000 RDX: ffff88806fc05700 RSI: ffffffff872abd55 RDI: 0000000000000003 RBP: ffff88806e675500 R08: 00000000ffffffd5 R09: 0000000000000000 R10: ffffffff872ab659 R11: 0000000000000000 R12: ffff88806dd554e8 R13: ffff88806dd9bac0 R14: ffff88806dd9a2c0 R15: 0000000000000155 FS: 00007f18014f9700(0000) GS:ffff8880b9c00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020002000 CR3: 000000006be7a000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: tcp_try_coalesce net/ipv4/tcp_input.c:4651 [inline] tcp_try_coalesce+0x393/0x920 net/ipv4/tcp_input.c:4630 tcp_queue_rcv+0x8a/0x6e0 net/ipv4/tcp_input.c:4914 tcp_data_queue+0x11fd/0x4bb0 net/ipv4/tcp_input.c:5025 tcp_rcv_established+0x81e/0x1ff0 net/ipv4/tcp_input.c:5947 tcp_v4_do_rcv+0x65e/0x980 net/ipv4/tcp_ipv4.c:1719 sk_backlog_rcv include/net/sock.h:1037 [inline] __release_sock+0x134/0x3b0 net/core/sock.c:2779 release_sock+0x54/0x1b0 net/core/sock.c:3311 sk_wait_data+0x177/0x450 net/core/sock.c:2821 tcp_recvmsg_locked+0xe28/0x1fd0 net/ipv4/tcp.c:2457 tcp_recvmsg+0x137/0x610 net/ipv4/tcp.c:2572 inet_recvmsg+0x11b/0x5e0 net/ipv4/af_inet.c:850 sock_recvmsg_nosec net/socket.c:948 [inline] sock_recvmsg net/socket.c:966 [inline] sock_recvmsg net/socket.c:962 [inline] ____sys_recvmsg+0x2c4/0x600 net/socket.c:2632 ___sys_recvmsg+0x127/0x200 net/socket.c:2674 __sys_recvmsg+0xe2/0x1a0 net/socket.c:2704 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae Fixes: c4777efa751d ("net: add and use skb_unclone_keeptruesize() helper") Fixes: 097b9146c0e2 ("net: fix up truesize of cloned skb in skb_prepare_for_shift()") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Marco Elver Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 115be7f73487..31be38078918 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1795,19 +1795,19 @@ static inline int skb_unclone(struct sk_buff *skb, gfp_t pri) return 0; } -/* This variant of skb_unclone() makes sure skb->truesize is not changed */ +/* This variant of skb_unclone() makes sure skb->truesize + * and skb_end_offset() are not changed, whenever a new skb->head is needed. + * + * Indeed there is no guarantee that ksize(kmalloc(X)) == ksize(kmalloc(X)) + * when various debugging features are in place. + */ +int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri); static inline int skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri) { might_sleep_if(gfpflags_allow_blocking(pri)); - if (skb_cloned(skb)) { - unsigned int save = skb->truesize; - int res; - - res = pskb_expand_head(skb, 0, 0, pri); - skb->truesize = save; - return res; - } + if (skb_cloned(skb)) + return __skb_unclone_keeptruesize(skb, pri); return 0; } -- cgit v1.2.3 From b26ef81c46ed15d11ddddba9ba1cd52c749385ad Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 22 Feb 2022 14:04:50 -0800 Subject: drop_monitor: remove quadratic behavior drop_monitor is using an unique list on which all netdevices in the host have an element, regardless of their netns. This scales poorly, not only at device unregister time (what I caught during my netns dismantle stress tests), but also at packet processing time whenever trace_napi_poll_hit() is called. If the intent was to avoid adding one pointer in 'struct net_device' then surely we prefer O(1) behavior. Signed-off-by: Eric Dumazet Cc: Neil Horman Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 93fc680b658f..c79ee2296296 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2236,7 +2236,9 @@ struct net_device { #if IS_ENABLED(CONFIG_MRP) struct mrp_port __rcu *mrp_port; #endif - +#if IS_ENABLED(CONFIG_NET_DROP_MONITOR) + struct dm_hw_stat_delta __rcu *dm_private; +#endif struct device dev; const struct attribute_group *sysfs_groups[4]; const struct attribute_group *sysfs_rx_queue_group; -- cgit v1.2.3 From a21d9a670d81103db7f788de1a4a4a6e4b891a0b Mon Sep 17 00:00:00 2001 From: Hans Schultz Date: Wed, 23 Feb 2022 11:16:46 +0100 Subject: net: bridge: Add support for bridge port in locked mode In a 802.1X scenario, clients connected to a bridge port shall not be allowed to have traffic forwarded until fully authenticated. A static fdb entry of the clients MAC address for the bridge port unlocks the client and allows bidirectional communication. This scenario is facilitated with setting the bridge port in locked mode, which is also supported by various switchcore chipsets. Signed-off-by: Hans Schultz Acked-by: Nikolay Aleksandrov Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/linux/if_bridge.h | 1 + include/uapi/linux/if_link.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 509e18c7e740..3aae023a9353 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -58,6 +58,7 @@ struct br_ip_list { #define BR_MRP_LOST_CONT BIT(18) #define BR_MRP_LOST_IN_CONT BIT(19) #define BR_TX_FWD_OFFLOAD BIT(20) +#define BR_PORT_LOCKED BIT(21) #define BR_DEFAULT_AGEING_TIME (300 * HZ) diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index e1ba2d51b717..be09d2ad4b5d 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -537,6 +537,7 @@ enum { IFLA_BRPORT_MRP_IN_OPEN, IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT, IFLA_BRPORT_MCAST_EHT_HOSTS_CNT, + IFLA_BRPORT_LOCKED, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) -- cgit v1.2.3 From 43c075959de3c45608636d9d80ff9e61d166fb21 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 26 Jan 2022 10:57:07 -0800 Subject: mlx5: remove unused static inlines mlx5 has some unused static inline helpers in include/ while at it also clean static inlines in the driver itself. Signed-off-by: Jakub Kicinski Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 78655d8d13a7..1b398c9e17b9 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -863,20 +863,10 @@ struct mlx5_hca_vport_context { bool grh_required; }; -static inline void *mlx5_buf_offset(struct mlx5_frag_buf *buf, int offset) -{ - return buf->frags->buf + offset; -} - #define STRUCT_FIELD(header, field) \ .struct_offset_bytes = offsetof(struct ib_unpacked_ ## header, field), \ .struct_size_bytes = sizeof((struct ib_unpacked_ ## header *)0)->field -static inline struct mlx5_core_dev *pci2mlx5_core_dev(struct pci_dev *pdev) -{ - return pci_get_drvdata(pdev); -} - extern struct dentry *mlx5_debugfs_root; static inline u16 fw_rev_maj(struct mlx5_core_dev *dev) -- cgit v1.2.3 From c2c922dae77f36e24d246c6e310cee0c61afc6fb Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Mon, 29 Nov 2021 16:24:28 +0200 Subject: net/mlx5: Add ability to insert to specific flow group If the flow table isn't an autogroup the upper driver has to create the flow groups explicitly. This information can't later be used when creating rules to insert into a specific flow group. Allow such use case. Signed-off-by: Mark Bloch Reviewed-by: Maor Gottlieb Signed-off-by: Saeed Mahameed --- include/linux/mlx5/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index b1aad14689e3..e3bfed68b08a 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -224,6 +224,7 @@ struct mlx5_flow_act { u32 flags; struct mlx5_fs_vlan vlan[MLX5_FS_VLAN_DEPTH]; struct ib_counters *counters; + struct mlx5_flow_group *fg; }; #define MLX5_DECLARE_FLOW_ACT(name) \ -- cgit v1.2.3 From 605bef0015b163867127202b821dce79804d603d Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Sun, 5 Apr 2020 17:50:25 -0700 Subject: net/mlx5: cmdif, cmd_check refactoring Do not mangle the command outbox in the internal low level cmd_exec and cmd_invoke functions. Instead return a proper unique error code and move the driver error checking to be at a higher level in mlx5_cmd_exec(). Signed-off-by: Saeed Mahameed Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 1b398c9e17b9..8a8408708e6c 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -981,7 +981,6 @@ int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out, int mlx5_cmd_exec_polling(struct mlx5_core_dev *dev, void *in, int in_size, void *out, int out_size); -void mlx5_cmd_mbox_status(void *out, u8 *status, u32 *syndrome); bool mlx5_cmd_is_down(struct mlx5_core_dev *dev); int mlx5_core_get_caps(struct mlx5_core_dev *dev, enum mlx5_cap_type cap_type); -- cgit v1.2.3 From f23519e542e51c19ab3081deb089bb3f8fec7bb9 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Sat, 17 Aug 2019 03:05:10 -0700 Subject: net/mlx5: cmdif, Add new api for command execution Add mlx5_cmd_do. Unlike mlx5_cmd_exec, this function will not modify or translate outbox.status. The function will return: return = 0: Command was executed, outbox.status == MLX5_CMD_STAT_OK. return = -EREMOTEIO: Executed, outbox.status != MLX5_CMD_STAT_OK. return < 0: Command execution couldn't be performed by FW or driver. And document other mlx5_cmd_exec functions. Signed-off-by: Saeed Mahameed Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 8a8408708e6c..1b9bec8fa870 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -964,6 +964,8 @@ int mlx5_cmd_exec_cb(struct mlx5_async_ctx *ctx, void *in, int in_size, void *out, int out_size, mlx5_async_cbk_t callback, struct mlx5_async_work *work); +int mlx5_cmd_do(struct mlx5_core_dev *dev, void *in, int in_size, void *out, int out_size); +int mlx5_cmd_check(struct mlx5_core_dev *dev, int err, void *in, void *out); int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out, int out_size); -- cgit v1.2.3 From 31803e59233efc838b9dcb26edea28a4b2389e97 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Mon, 30 Mar 2020 21:12:58 -0700 Subject: net/mlx5: Use mlx5_cmd_do() in core create_{cq,dct} mlx5_core_create_{cq/dct} functions are non-trivial mlx5 commands functions. They check command execution status themselves and hide valuable FW failure information. For mlx5_core/eth kernel user this is what we actually want, but for a devx/rdma user the hidden information is essential and should be propagated up to the caller, thus we convert these commands to use mlx5_cmd_do to return the FW/driver and command outbox status as is, and let the caller decide what to do with it. For kernel callers of mlx5_core_create_{cq/dct} or those who only care about the binary status (FAIL/SUCCESS) they must check status themselves via mlx5_cmd_check() to restore the current behavior. err = mlx5_create_cq(in, out) err = mlx5_cmd_check(err, in, out) if (err) // handle err For DEVX users and those who care about full visibility, They will just propagate the error to user space, and app can check if err == -EREMOTEIO, then outbox.{status,syndrome} are valid. API Note: mlx5_cmd_check() must be used by kernel users since it allows the driver to intercept the command execution status and return a driver simulated status in case of driver induced error handling or reset/recovery flows. Signed-off-by: Saeed Mahameed Signed-off-by: Saeed Mahameed --- include/linux/mlx5/cq.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h index 7bfb67363434..cb15308b5cb0 100644 --- a/include/linux/mlx5/cq.h +++ b/include/linux/mlx5/cq.h @@ -183,6 +183,8 @@ static inline void mlx5_cq_put(struct mlx5_core_cq *cq) complete(&cq->free); } +int mlx5_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, + u32 *in, int inlen, u32 *out, int outlen); int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, u32 *in, int inlen, u32 *out, int outlen); int mlx5_core_destroy_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq); -- cgit v1.2.3 From 0a41527608e7f3da61e76564f5a8749a1fddc7f1 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 31 Mar 2020 22:03:00 -0700 Subject: net/mlx5: cmdif, Refactor error handling and reporting of async commands Same as the new mlx5_cmd_do API, report all information to callers and let them handle the error values and outbox parsing. The user callback status "work->user_callback(status)" is now similar to the error rc code returned from the blocking mlx5_cmd_do() version, and now is defined as follows: -EREMOTEIO : Command executed by FW, outbox.status != MLX5_CMD_STAT_OK. Caller must check FW outbox status. 0 : Command execution successful, outbox.status == MLX5_CMD_STAT_OK. < 0 : Command couldn't execute, FW or driver induced error. Signed-off-by: Saeed Mahameed Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 1b9bec8fa870..432151d7d0bf 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -955,6 +955,7 @@ typedef void (*mlx5_async_cbk_t)(int status, struct mlx5_async_work *context); struct mlx5_async_work { struct mlx5_async_ctx *ctx; mlx5_async_cbk_t user_callback; + void *out; /* pointer to the cmd output buffer */ }; void mlx5_cmd_init_async_ctx(struct mlx5_core_dev *dev, @@ -963,7 +964,7 @@ void mlx5_cmd_cleanup_async_ctx(struct mlx5_async_ctx *ctx); int mlx5_cmd_exec_cb(struct mlx5_async_ctx *ctx, void *in, int in_size, void *out, int out_size, mlx5_async_cbk_t callback, struct mlx5_async_work *work); - +void mlx5_cmd_out_err(struct mlx5_core_dev *dev, u16 opcode, u16 op_mod, void *out); int mlx5_cmd_do(struct mlx5_core_dev *dev, void *in, int in_size, void *out, int out_size); int mlx5_cmd_check(struct mlx5_core_dev *dev, int err, void *in, void *out); int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out, -- cgit v1.2.3 From 72fb3b60a3114a1154a8ae5629ea3b43a88a7a4d Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Sat, 14 Aug 2021 17:57:51 +0300 Subject: net/mlx5: Add reset_state field to MFRL register Add new field reset_state to MFRL register. This field expose current state of sync reset for fw update. This field enables sharing with the user more details on why fw activate failed in case it failed the sync reset stage. Signed-off-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 598ac3bcc901..8ca2d65ff789 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -9694,7 +9694,8 @@ struct mlx5_ifc_pcam_reg_bits { }; struct mlx5_ifc_mcam_enhanced_features_bits { - u8 reserved_at_0[0x6b]; + u8 reserved_at_0[0x6a]; + u8 reset_state[0x1]; u8 ptpcyc2realtime_modify[0x1]; u8 reserved_at_6c[0x2]; u8 pci_status_and_power[0x1]; @@ -10375,6 +10376,14 @@ struct mlx5_ifc_mcda_reg_bits { u8 data[][0x20]; }; +enum { + MLX5_MFRL_REG_RESET_STATE_IDLE = 0, + MLX5_MFRL_REG_RESET_STATE_IN_NEGOTIATION = 1, + MLX5_MFRL_REG_RESET_STATE_RESET_IN_PROGRESS = 2, + MLX5_MFRL_REG_RESET_STATE_TIMEOUT = 3, + MLX5_MFRL_REG_RESET_STATE_NACK = 4, +}; + enum { MLX5_MFRL_REG_RESET_TYPE_FULL_CHIP = BIT(0), MLX5_MFRL_REG_RESET_TYPE_NET_PORT_ALIVE = BIT(1), @@ -10393,7 +10402,8 @@ struct mlx5_ifc_mfrl_reg_bits { u8 pci_sync_for_fw_update_start[0x1]; u8 pci_sync_for_fw_update_resp[0x2]; u8 rst_type_sel[0x3]; - u8 reserved_at_28[0x8]; + u8 reserved_at_28[0x4]; + u8 reset_state[0x4]; u8 reset_type[0x8]; u8 reset_level[0x8]; }; -- cgit v1.2.3 From 45fee8edb4b333af79efad7a99de51718ebda94b Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Mon, 6 Sep 2021 11:02:44 +0300 Subject: net/mlx5: Add clarification on sync reset failure In case devlink reload action fw_activate failed in sync reset stage, use the new MFRL field reset_state to find why it failed and share this clarification with the user. Signed-off-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 432151d7d0bf..d3b1a6a1f8d2 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1031,6 +1031,9 @@ int mlx5_core_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn) void mlx5_qp_debugfs_init(struct mlx5_core_dev *dev); void mlx5_qp_debugfs_cleanup(struct mlx5_core_dev *dev); +int mlx5_access_reg(struct mlx5_core_dev *dev, void *data_in, int size_in, + void *data_out, int size_out, u16 reg_id, int arg, + int write, bool verbose); int mlx5_core_access_reg(struct mlx5_core_dev *dev, void *data_in, int size_in, void *data_out, int size_out, u16 reg_num, int arg, int write); -- cgit v1.2.3 From 1241e329ce2e1f5b1039fd356b75867b29721ad2 Mon Sep 17 00:00:00 2001 From: Subbaraya Sundeep Date: Wed, 23 Feb 2022 00:09:12 +0530 Subject: ethtool: add support to set/get completion queue event size Add support to set completion queue event size via ethtool -G parameter and get it via ethtool -g parameter. ~ # ./ethtool -G eth0 cqe-size 512 ~ # ./ethtool -g eth0 Ring parameters for eth0: Pre-set maximums: RX: 1048576 RX Mini: n/a RX Jumbo: n/a TX: 1048576 Current hardware settings: RX: 256 RX Mini: n/a RX Jumbo: n/a TX: 4096 RX Buf Len: 2048 CQE Size: 128 Signed-off-by: Subbaraya Sundeep Signed-off-by: Sunil Goutham Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 4 ++++ include/uapi/linux/ethtool_netlink.h | 1 + 2 files changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index e0853f48b75e..4af58459a1e7 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -71,18 +71,22 @@ enum { * struct kernel_ethtool_ringparam - RX/TX ring configuration * @rx_buf_len: Current length of buffers on the rx ring. * @tcp_data_split: Scatter packet headers and data to separate buffers + * @cqe_size: Size of TX/RX completion queue event */ struct kernel_ethtool_ringparam { u32 rx_buf_len; u8 tcp_data_split; + u32 cqe_size; }; /** * enum ethtool_supported_ring_param - indicator caps for setting ring params * @ETHTOOL_RING_USE_RX_BUF_LEN: capture for setting rx_buf_len + * @ETHTOOL_RING_USE_CQE_SIZE: capture for setting cqe_size */ enum ethtool_supported_ring_param { ETHTOOL_RING_USE_RX_BUF_LEN = BIT(0), + ETHTOOL_RING_USE_CQE_SIZE = BIT(1), }; #define __ETH_RSS_HASH_BIT(bit) ((u32)1 << (bit)) diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 417d4280d7b5..979850221b8d 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -337,6 +337,7 @@ enum { ETHTOOL_A_RINGS_TX, /* u32 */ ETHTOOL_A_RINGS_RX_BUF_LEN, /* u32 */ ETHTOOL_A_RINGS_TCP_DATA_SPLIT, /* u8 */ + ETHTOOL_A_RINGS_CQE_SIZE, /* u32 */ /* add new constants above here */ __ETHTOOL_A_RINGS_CNT, -- cgit v1.2.3 From 5597f082fcaf17e2d9adee7a1f6fa02f5872f9d5 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Thu, 13 Jan 2022 15:34:07 +0100 Subject: can: bittiming: mark function arguments and local variables as const This patch marks the arguments of some functions as well as some local variables as constant. Link: https://lore.kernel.org/all/20220124215642.3474154-7-mkl@pengutronix.de Signed-off-by: Marc Kleine-Budde --- include/linux/can/bittiming.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/can/bittiming.h b/include/linux/can/bittiming.h index a81652d1c6f3..7ae21c0f7f23 100644 --- a/include/linux/can/bittiming.h +++ b/include/linux/can/bittiming.h @@ -113,7 +113,7 @@ struct can_tdc_const { }; #ifdef CONFIG_CAN_CALC_BITTIMING -int can_calc_bittiming(struct net_device *dev, struct can_bittiming *bt, +int can_calc_bittiming(const struct net_device *dev, struct can_bittiming *bt, const struct can_bittiming_const *btc); void can_calc_tdco(struct can_tdc *tdc, const struct can_tdc_const *tdc_const, @@ -121,7 +121,7 @@ void can_calc_tdco(struct can_tdc *tdc, const struct can_tdc_const *tdc_const, u32 *ctrlmode, u32 ctrlmode_supported); #else /* !CONFIG_CAN_CALC_BITTIMING */ static inline int -can_calc_bittiming(struct net_device *dev, struct can_bittiming *bt, +can_calc_bittiming(const struct net_device *dev, struct can_bittiming *bt, const struct can_bittiming_const *btc) { netdev_err(dev, "bit-timing calculation not available\n"); @@ -136,7 +136,7 @@ can_calc_tdco(struct can_tdc *tdc, const struct can_tdc_const *tdc_const, } #endif /* CONFIG_CAN_CALC_BITTIMING */ -int can_get_bittiming(struct net_device *dev, struct can_bittiming *bt, +int can_get_bittiming(const struct net_device *dev, struct can_bittiming *bt, const struct can_bittiming_const *btc, const u32 *bitrate_const, const unsigned int bitrate_const_cnt); -- cgit v1.2.3 From 46a76724e4c93bb1cda8ee11276001a92d1f7987 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 23 Feb 2022 16:00:44 +0200 Subject: net: dsa: rename references to "lag" as "lag_dev" In preparation of converting struct net_device *dp->lag_dev into a struct dsa_lag *dp->lag, we need to rename, for consistency purposes, all occurrences of the "lag" variable in the DSA core to "lag_dev". Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index f13de2d8aef3..ef7f446cbdf4 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -182,12 +182,12 @@ static inline struct net_device *dsa_lag_dev(struct dsa_switch_tree *dst, } static inline int dsa_lag_id(struct dsa_switch_tree *dst, - struct net_device *lag) + struct net_device *lag_dev) { unsigned int id; dsa_lags_foreach_id(id, dst) { - if (dsa_lag_dev(dst, id) == lag) + if (dsa_lag_dev(dst, id) == lag_dev) return id; } @@ -966,10 +966,10 @@ struct dsa_switch_ops { int (*crosschip_lag_change)(struct dsa_switch *ds, int sw_index, int port); int (*crosschip_lag_join)(struct dsa_switch *ds, int sw_index, - int port, struct net_device *lag, + int port, struct net_device *lag_dev, struct netdev_lag_upper_info *info); int (*crosschip_lag_leave)(struct dsa_switch *ds, int sw_index, - int port, struct net_device *lag); + int port, struct net_device *lag_dev); /* * PTP functionality @@ -1041,10 +1041,10 @@ struct dsa_switch_ops { */ int (*port_lag_change)(struct dsa_switch *ds, int port); int (*port_lag_join)(struct dsa_switch *ds, int port, - struct net_device *lag, + struct net_device *lag_dev, struct netdev_lag_upper_info *info); int (*port_lag_leave)(struct dsa_switch *ds, int port, - struct net_device *lag); + struct net_device *lag_dev); /* * HSR integration -- cgit v1.2.3 From 3d4a0a2a46ab8ff8897dfd6324edee5e8184d2c5 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 23 Feb 2022 16:00:47 +0200 Subject: net: dsa: make LAG IDs one-based The DSA LAG API will be changed to become more similar with the bridge data structures, where struct dsa_bridge holds an unsigned int num, which is generated by DSA and is one-based. We have a similar thing going with the DSA LAG, except that isn't stored anywhere, it is calculated dynamically by dsa_lag_id() by iterating through dst->lags. The idea of encoding an invalid (or not requested) LAG ID as zero for the purpose of simplifying checks in drivers means that the LAG IDs passed by DSA to drivers need to be one-based too. So back-and-forth conversion is needed when indexing the dst->lags array, as well as in drivers which assume a zero-based index. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index ef7f446cbdf4..3ae93adda25d 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -163,9 +163,10 @@ struct dsa_switch_tree { unsigned int last_switch; }; +/* LAG IDs are one-based, the dst->lags array is zero-based */ #define dsa_lags_foreach_id(_id, _dst) \ - for ((_id) = 0; (_id) < (_dst)->lags_len; (_id)++) \ - if ((_dst)->lags[(_id)]) + for ((_id) = 1; (_id) <= (_dst)->lags_len; (_id)++) \ + if ((_dst)->lags[(_id) - 1]) #define dsa_lag_foreach_port(_dp, _dst, _lag) \ list_for_each_entry((_dp), &(_dst)->ports, list) \ @@ -178,7 +179,8 @@ struct dsa_switch_tree { static inline struct net_device *dsa_lag_dev(struct dsa_switch_tree *dst, unsigned int id) { - return dst->lags[id]; + /* DSA LAG IDs are one-based, dst->lags is zero-based */ + return dst->lags[id - 1]; } static inline int dsa_lag_id(struct dsa_switch_tree *dst, -- cgit v1.2.3 From dedd6a009f4191989bee83c1faf66728648a223f Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 23 Feb 2022 16:00:49 +0200 Subject: net: dsa: create a dsa_lag structure The main purpose of this change is to create a data structure for a LAG as seen by DSA. This is similar to what we have for bridging - we pass a copy of this structure by value to ->port_lag_join and ->port_lag_leave. For now we keep the lag_dev, id and a reference count in it. Future patches will add a list of FDB entries for the LAG (these also need to be refcounted to work properly). The LAG structure is created using dsa_port_lag_create() and destroyed using dsa_port_lag_destroy(), just like we have for bridging. Because now, the dsa_lag itself is refcounted, we can simplify dsa_lag_map() and dsa_lag_unmap(). These functions need to keep a LAG in the dst->lags array only as long as at least one port uses it. The refcounting logic inside those functions can be removed now - they are called only when we should perform the operation. dsa_lag_dev() is renamed to dsa_lag_by_id() and now returns the dsa_lag structure instead of the lag_dev net_device. dsa_lag_foreach_port() now takes the dsa_lag structure as argument. dst->lags holds an array of dsa_lag structures. dsa_lag_map() now also saves the dsa_lag->id value, so that linear walking of dst->lags in drivers using dsa_lag_id() is no longer necessary. They can just look at lag.id. dsa_port_lag_id_get() is a helper, similar to dsa_port_bridge_num_get(), which can be used by drivers to get the LAG ID assigned by DSA to a given port. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 50 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 37 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 3ae93adda25d..81ed34998416 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -116,6 +116,12 @@ struct dsa_netdevice_ops { #define MODULE_ALIAS_DSA_TAG_DRIVER(__proto) \ MODULE_ALIAS(DSA_TAG_DRIVER_ALIAS __stringify(__proto##_VALUE)) +struct dsa_lag { + struct net_device *dev; + unsigned int id; + refcount_t refcount; +}; + struct dsa_switch_tree { struct list_head list; @@ -134,7 +140,7 @@ struct dsa_switch_tree { /* Maps offloaded LAG netdevs to a zero-based linear ID for * drivers that need it. */ - struct net_device **lags; + struct dsa_lag **lags; /* Tagging protocol operations */ const struct dsa_device_ops *tag_ops; @@ -170,14 +176,14 @@ struct dsa_switch_tree { #define dsa_lag_foreach_port(_dp, _dst, _lag) \ list_for_each_entry((_dp), &(_dst)->ports, list) \ - if ((_dp)->lag_dev == (_lag)) + if (dsa_port_offloads_lag((_dp), (_lag))) #define dsa_hsr_foreach_port(_dp, _ds, _hsr) \ list_for_each_entry((_dp), &(_ds)->dst->ports, list) \ if ((_dp)->ds == (_ds) && (_dp)->hsr_dev == (_hsr)) -static inline struct net_device *dsa_lag_dev(struct dsa_switch_tree *dst, - unsigned int id) +static inline struct dsa_lag *dsa_lag_by_id(struct dsa_switch_tree *dst, + unsigned int id) { /* DSA LAG IDs are one-based, dst->lags is zero-based */ return dst->lags[id - 1]; @@ -189,8 +195,10 @@ static inline int dsa_lag_id(struct dsa_switch_tree *dst, unsigned int id; dsa_lags_foreach_id(id, dst) { - if (dsa_lag_dev(dst, id) == lag_dev) - return id; + struct dsa_lag *lag = dsa_lag_by_id(dst, id); + + if (lag->dev == lag_dev) + return lag->id; } return -ENODEV; @@ -293,7 +301,7 @@ struct dsa_port { struct devlink_port devlink_port; struct phylink *pl; struct phylink_config pl_config; - struct net_device *lag_dev; + struct dsa_lag *lag; struct net_device *hsr_dev; struct list_head list; @@ -643,14 +651,30 @@ static inline bool dsa_port_is_vlan_filtering(const struct dsa_port *dp) return dp->vlan_filtering; } +static inline unsigned int dsa_port_lag_id_get(struct dsa_port *dp) +{ + return dp->lag ? dp->lag->id : 0; +} + +static inline struct net_device *dsa_port_lag_dev_get(struct dsa_port *dp) +{ + return dp->lag ? dp->lag->dev : NULL; +} + +static inline bool dsa_port_offloads_lag(struct dsa_port *dp, + const struct dsa_lag *lag) +{ + return dsa_port_lag_dev_get(dp) == lag->dev; +} + static inline struct net_device *dsa_port_to_bridge_port(const struct dsa_port *dp) { if (!dp->bridge) return NULL; - if (dp->lag_dev) - return dp->lag_dev; + if (dp->lag) + return dp->lag->dev; else if (dp->hsr_dev) return dp->hsr_dev; @@ -968,10 +992,10 @@ struct dsa_switch_ops { int (*crosschip_lag_change)(struct dsa_switch *ds, int sw_index, int port); int (*crosschip_lag_join)(struct dsa_switch *ds, int sw_index, - int port, struct net_device *lag_dev, + int port, struct dsa_lag lag, struct netdev_lag_upper_info *info); int (*crosschip_lag_leave)(struct dsa_switch *ds, int sw_index, - int port, struct net_device *lag_dev); + int port, struct dsa_lag lag); /* * PTP functionality @@ -1043,10 +1067,10 @@ struct dsa_switch_ops { */ int (*port_lag_change)(struct dsa_switch *ds, int port); int (*port_lag_join)(struct dsa_switch *ds, int port, - struct net_device *lag_dev, + struct dsa_lag lag, struct netdev_lag_upper_info *info); int (*port_lag_leave)(struct dsa_switch *ds, int port, - struct net_device *lag_dev); + struct dsa_lag lag); /* * HSR integration -- cgit v1.2.3 From ec638740fce990ad2b9af43ead8088d6d6eb2145 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 23 Feb 2022 16:00:50 +0200 Subject: net: switchdev: remove lag_mod_cb from switchdev_handle_fdb_event_to_device When the switchdev_handle_fdb_event_to_device() event replication helper was created, my original thought was that FDB events on LAG interfaces should most likely be special-cased, not just replicated towards all switchdev ports beneath that LAG. So this replication helper currently does not recurse through switchdev lower interfaces of LAG bridge ports, but rather calls the lag_mod_cb() if that was provided. No switchdev driver uses this helper for FDB events on LAG interfaces yet, so that was an assumption which was yet to be tested. It is certainly usable for that purpose, as my RFC series shows: https://patchwork.kernel.org/project/netdevbpf/cover/20220210125201.2859463-1-vladimir.oltean@nxp.com/ however this approach is slightly convoluted because: - the switchdev driver gets a "dev" that isn't its own net device, but rather the LAG net device. It must call switchdev_lower_dev_find(dev) in order to get a handle of any of its own net devices (the ones that pass check_cb). - in order for FDB entries on LAG ports to be correctly refcounted per the number of switchdev ports beneath that LAG, we haven't escaped the need to iterate through the LAG's lower interfaces. Except that is now the responsibility of the switchdev driver, because the replication helper just stopped half-way. So, even though yes, FDB events on LAG bridge ports must be special-cased, in the end it's simpler to let switchdev_handle_fdb_* just iterate through the LAG port's switchdev lowers, and let the switchdev driver figure out that those physical ports are under a LAG. The switchdev_handle_fdb_event_to_device() helper takes a "foreign_dev_check" callback so it can figure out whether @dev can autonomously forward to @foreign_dev. DSA fills this method properly: if the LAG is offloaded by another port in the same tree as @dev, then it isn't foreign. If it is a software LAG, it is foreign - forwarding happens in software. Whether an interface is foreign or not decides whether the replication helper will go through the LAG's switchdev lowers or not. Since the lan966x doesn't properly fill this out, FDB events on software LAG uppers will get called. By changing lan966x_foreign_dev_check(), we can suppress them. Whereas DSA will now start receiving FDB events for its offloaded LAG uppers, so we need to return -EOPNOTSUPP, since we currently don't do the right thing for them. Cc: Horatiu Vultur Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- include/net/switchdev.h | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index c32e1c8f79ec..3e424d40fae3 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -313,10 +313,7 @@ int switchdev_handle_fdb_event_to_device(struct net_device *dev, unsigned long e const struct net_device *foreign_dev), int (*mod_cb)(struct net_device *dev, struct net_device *orig_dev, unsigned long event, const void *ctx, - const struct switchdev_notifier_fdb_info *fdb_info), - int (*lag_mod_cb)(struct net_device *dev, struct net_device *orig_dev, - unsigned long event, const void *ctx, - const struct switchdev_notifier_fdb_info *fdb_info)); + const struct switchdev_notifier_fdb_info *fdb_info)); int switchdev_handle_port_obj_add(struct net_device *dev, struct switchdev_notifier_port_obj_info *port_obj_info, @@ -443,10 +440,7 @@ switchdev_handle_fdb_event_to_device(struct net_device *dev, unsigned long event const struct net_device *foreign_dev), int (*mod_cb)(struct net_device *dev, struct net_device *orig_dev, unsigned long event, const void *ctx, - const struct switchdev_notifier_fdb_info *fdb_info), - int (*lag_mod_cb)(struct net_device *dev, struct net_device *orig_dev, - unsigned long event, const void *ctx, - const struct switchdev_notifier_fdb_info *fdb_info)) + const struct switchdev_notifier_fdb_info *fdb_info)) { return 0; } -- cgit v1.2.3 From e212fa7c54184b7b1f88990bd328b23b567cbf41 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 23 Feb 2022 16:00:53 +0200 Subject: net: dsa: support FDB events on offloaded LAG interfaces This change introduces support for installing static FDB entries towards a bridge port that is a LAG of multiple DSA switch ports, as well as support for filtering towards the CPU local FDB entries emitted for LAG interfaces that are bridge ports. Conceptually, host addresses on LAG ports are identical to what we do for plain bridge ports. Whereas FDB entries _towards_ a LAG can't simply be replicated towards all member ports like we do for multicast, or VLAN. Instead we need new driver API. Hardware usually considers a LAG to be a "logical port", and sets the entire LAG as the forwarding destination. The physical egress port selection within the LAG is made by hashing policy, as usual. To represent the logical port corresponding to the LAG, we pass by value a copy of the dsa_lag structure to all switches in the tree that have at least one port in that LAG. To illustrate why a refcounted list of FDB entries is needed in struct dsa_lag, it is enough to say that: - a LAG may be a bridge port and may therefore receive FDB events even while it isn't yet offloaded by any DSA interface - DSA interfaces may be removed from a LAG while that is a bridge port; we don't want FDB entries lingering around, but we don't want to remove entries that are still in use, either For all the cases below to work, the idea is to always keep an FDB entry on a LAG with a reference count equal to the DSA member ports. So: - if a port joins a LAG, it requests the bridge to replay the FDB, and the FDB entries get created, or their refcount gets bumped by one - if a port leaves a LAG, the FDB replay deletes or decrements refcount by one - if an FDB is installed towards a LAG with ports already present, that entry is created (if it doesn't exist) and its refcount is bumped by the amount of ports already present in the LAG echo "Adding FDB entry to bond with existing ports" ip link del bond0 ip link add bond0 type bond mode 802.3ad ip link set swp1 down && ip link set swp1 master bond0 && ip link set swp1 up ip link set swp2 down && ip link set swp2 master bond0 && ip link set swp2 up ip link del br0 ip link add br0 type bridge ip link set bond0 master br0 bridge fdb add dev bond0 00:01:02:03:04:05 master static ip link del br0 ip link del bond0 echo "Adding FDB entry to empty bond" ip link del bond0 ip link add bond0 type bond mode 802.3ad ip link del br0 ip link add br0 type bridge ip link set bond0 master br0 bridge fdb add dev bond0 00:01:02:03:04:05 master static ip link set swp1 down && ip link set swp1 master bond0 && ip link set swp1 up ip link set swp2 down && ip link set swp2 master bond0 && ip link set swp2 up ip link del br0 ip link del bond0 echo "Adding FDB entry to empty bond, then removing ports one by one" ip link del bond0 ip link add bond0 type bond mode 802.3ad ip link del br0 ip link add br0 type bridge ip link set bond0 master br0 bridge fdb add dev bond0 00:01:02:03:04:05 master static ip link set swp1 down && ip link set swp1 master bond0 && ip link set swp1 up ip link set swp2 down && ip link set swp2 master bond0 && ip link set swp2 up ip link set swp1 nomaster ip link set swp2 nomaster ip link del br0 ip link del bond0 Signed-off-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 81ed34998416..01faba89c987 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -119,6 +119,8 @@ struct dsa_netdevice_ops { struct dsa_lag { struct net_device *dev; unsigned int id; + struct mutex fdb_lock; + struct list_head fdbs; refcount_t refcount; }; @@ -944,6 +946,10 @@ struct dsa_switch_ops { const unsigned char *addr, u16 vid); int (*port_fdb_dump)(struct dsa_switch *ds, int port, dsa_fdb_dump_cb_t *cb, void *data); + int (*lag_fdb_add)(struct dsa_switch *ds, struct dsa_lag lag, + const unsigned char *addr, u16 vid); + int (*lag_fdb_del)(struct dsa_switch *ds, struct dsa_lag lag, + const unsigned char *addr, u16 vid); /* * Multicast database -- cgit v1.2.3 From 961d8b699070070fb3a9639af61641a52c8e49ef Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 23 Feb 2022 16:00:54 +0200 Subject: net: dsa: felix: support FDB entries on offloaded LAG interfaces This adds the logic in the Felix DSA driver and Ocelot switch library. For Ocelot switches, the DEST_IDX that is the output of the MAC table lookup is a logical port (equal to physical port, if no LAG is used, or a dynamically allocated number otherwise). The allocation we have in place for LAG IDs is different from DSA's, so we can't use that: - DSA allocates a continuous range of LAG IDs starting from 1 - Ocelot appears to require that physical ports and LAG IDs are in the same space of [0, num_phys_ports), and additionally, ports that aren't in a LAG must have physical port id == logical port id The implication is that an FDB entry towards a LAG might need to be deleted and reinstalled when the LAG ID changes. Signed-off-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- include/soc/mscc/ocelot.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 78f56502bc09..dd4fc34d2992 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -635,6 +635,13 @@ enum macaccess_entry_type { #define OCELOT_QUIRK_PCS_PERFORMS_RATE_ADAPTATION BIT(0) #define OCELOT_QUIRK_QSGMII_PORTS_MUST_BE_UP BIT(1) +struct ocelot_lag_fdb { + unsigned char addr[ETH_ALEN]; + u16 vid; + struct net_device *bond; + struct list_head list; +}; + struct ocelot_port { struct ocelot *ocelot; @@ -690,6 +697,7 @@ struct ocelot { struct list_head vlans; struct list_head traps; + struct list_head lag_fdbs; /* Switches like VSC9959 have flooding per traffic class */ int num_flooding_pgids; @@ -866,6 +874,10 @@ int ocelot_fdb_add(struct ocelot *ocelot, int port, const unsigned char *addr, u16 vid); int ocelot_fdb_del(struct ocelot *ocelot, int port, const unsigned char *addr, u16 vid); +int ocelot_lag_fdb_add(struct ocelot *ocelot, struct net_device *bond, + const unsigned char *addr, u16 vid); +int ocelot_lag_fdb_del(struct ocelot *ocelot, struct net_device *bond, + const unsigned char *addr, u16 vid); int ocelot_vlan_prepare(struct ocelot *ocelot, int port, u16 vid, bool pvid, bool untagged, struct netlink_ext_ack *extack); int ocelot_vlan_add(struct ocelot *ocelot, int port, u16 vid, bool pvid, -- cgit v1.2.3 From 7bbb765b73496699a165d505ecdce962f903b422 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Wed, 23 Feb 2022 17:57:40 +0000 Subject: net/tcp: Merge TCP-MD5 inbound callbacks The functions do essentially the same work to verify TCP-MD5 sign. Code can be merged into one family-independent function in order to reduce copy'n'paste and generated code. Later with TCP-AO option added, this will allow to create one function that's responsible for segment verification, that will have all the different checks for MD5/AO/non-signed packets, which in turn will help to see checks for all corner-cases in one function, rather than spread around different families and functions. Cc: Eric Dumazet Cc: Hideaki YOSHIFUJI Signed-off-by: Dmitry Safonov Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20220223175740.452397-1-dima@arista.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 04f4650e0ff0..479a27777ad6 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1674,6 +1674,11 @@ tcp_md5_do_lookup(const struct sock *sk, int l3index, return NULL; return __tcp_md5_do_lookup(sk, l3index, addr, family); } +bool tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, + enum skb_drop_reason *reason, + const void *saddr, const void *daddr, + int family, int dif, int sdif); + #define tcp_twsk_md5_key(twsk) ((twsk)->tw_md5_key) #else @@ -1683,6 +1688,14 @@ tcp_md5_do_lookup(const struct sock *sk, int l3index, { return NULL; } +static inline bool tcp_inbound_md5_hash(const struct sock *sk, + const struct sk_buff *skb, + enum skb_drop_reason *reason, + const void *saddr, const void *daddr, + int family, int dif, int sdif) +{ + return false; +} #define tcp_twsk_md5_key(twsk) NULL #endif -- cgit v1.2.3 From 28a3f0601727d521a1c6cce62ecbcb7402a9e4f5 Mon Sep 17 00:00:00 2001 From: Toms Atteka Date: Wed, 23 Feb 2022 16:54:09 -0800 Subject: net: openvswitch: IPv6: Add IPv6 extension header support This change adds a new OpenFlow field OFPXMT_OFB_IPV6_EXTHDR and packets can be filtered using ipv6_ext flag. Signed-off-by: Toms Atteka Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 150bcff49b1c..9d1710f20505 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -351,6 +351,7 @@ enum ovs_key_attr { OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4, /* struct ovs_key_ct_tuple_ipv4 */ OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6, /* struct ovs_key_ct_tuple_ipv6 */ OVS_KEY_ATTR_NSH, /* Nested set of ovs_nsh_key_* */ + OVS_KEY_ATTR_IPV6_EXTHDRS, /* struct ovs_key_ipv6_exthdr */ #ifdef __KERNEL__ OVS_KEY_ATTR_TUNNEL_INFO, /* struct ip_tunnel_info */ @@ -430,6 +431,11 @@ struct ovs_key_ipv6 { __u8 ipv6_frag; /* One of OVS_FRAG_TYPE_*. */ }; +/* separate structure to support backward compatibility with older user space */ +struct ovs_key_ipv6_exthdrs { + __u16 hdrs; +}; + struct ovs_key_tcp { __be16 tcp_src; __be16 tcp_dst; -- cgit v1.2.3 From 5e187189ec324f78035d33a4bc123a9c4ca6f3e3 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 26 Feb 2022 12:18:29 +0800 Subject: net: ip: add skb drop reasons for ip egress path Replace kfree_skb() which is used in the packet egress path of IP layer with kfree_skb_reason(). Functions that are involved include: __ip_queue_xmit() ip_finish_output() ip_mc_finish_output() ip6_output() ip6_finish_output() ip6_finish_output2() Following new drop reasons are introduced: SKB_DROP_REASON_IP_OUTNOROUTES SKB_DROP_REASON_BPF_CGROUP_EGRESS SKB_DROP_REASON_IPV6DISABLED SKB_DROP_REASON_NEIGH_CREATEFAIL Reviewed-by: Mengen Sun Reviewed-by: Hao Peng Signed-off-by: Menglong Dong Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/skbuff.h | 9 +++++++++ include/trace/events/skb.h | 5 +++++ 2 files changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 31be38078918..62b4bed1b7bc 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -380,6 +380,15 @@ enum skb_drop_reason { * the ofo queue, corresponding to * LINUX_MIB_TCPOFOMERGE */ + SKB_DROP_REASON_IP_OUTNOROUTES, /* route lookup failed */ + SKB_DROP_REASON_BPF_CGROUP_EGRESS, /* dropped by + * BPF_PROG_TYPE_CGROUP_SKB + * eBPF program + */ + SKB_DROP_REASON_IPV6DISABLED, /* IPv6 is disabled on the device */ + SKB_DROP_REASON_NEIGH_CREATEFAIL, /* failed to create neigh + * entry + */ SKB_DROP_REASON_MAX, }; diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index 2ab7193313aa..97f8097651da 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -37,6 +37,11 @@ EM(SKB_DROP_REASON_TCP_OLD_DATA, TCP_OLD_DATA) \ EM(SKB_DROP_REASON_TCP_OVERWINDOW, TCP_OVERWINDOW) \ EM(SKB_DROP_REASON_TCP_OFOMERGE, TCP_OFOMERGE) \ + EM(SKB_DROP_REASON_IP_OUTNOROUTES, IP_OUTNOROUTES) \ + EM(SKB_DROP_REASON_BPF_CGROUP_EGRESS, \ + BPF_CGROUP_EGRESS) \ + EM(SKB_DROP_REASON_IPV6DISABLED, IPV6DISABLED) \ + EM(SKB_DROP_REASON_NEIGH_CREATEFAIL, NEIGH_CREATEFAIL) \ EMe(SKB_DROP_REASON_MAX, MAX) #undef EM -- cgit v1.2.3 From a5736edda10ca2ba075606baad1dda3f2426766d Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 26 Feb 2022 12:18:30 +0800 Subject: net: neigh: use kfree_skb_reason() for __neigh_event_send() Replace kfree_skb() used in __neigh_event_send() with kfree_skb_reason(). Following drop reasons are added: SKB_DROP_REASON_NEIGH_FAILED SKB_DROP_REASON_NEIGH_QUEUEFULL SKB_DROP_REASON_NEIGH_DEAD The first two reasons above should be the hot path that skb drops in neighbour subsystem. Reviewed-by: Mengen Sun Reviewed-by: Hao Peng Signed-off-by: Menglong Dong Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/skbuff.h | 5 +++++ include/trace/events/skb.h | 3 +++ 2 files changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 62b4bed1b7bc..d67941f78b92 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -389,6 +389,11 @@ enum skb_drop_reason { SKB_DROP_REASON_NEIGH_CREATEFAIL, /* failed to create neigh * entry */ + SKB_DROP_REASON_NEIGH_FAILED, /* neigh entry in failed state */ + SKB_DROP_REASON_NEIGH_QUEUEFULL, /* arp_queue for neigh + * entry is full + */ + SKB_DROP_REASON_NEIGH_DEAD, /* neigh entry is dead */ SKB_DROP_REASON_MAX, }; diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index 97f8097651da..1977f301260d 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -42,6 +42,9 @@ BPF_CGROUP_EGRESS) \ EM(SKB_DROP_REASON_IPV6DISABLED, IPV6DISABLED) \ EM(SKB_DROP_REASON_NEIGH_CREATEFAIL, NEIGH_CREATEFAIL) \ + EM(SKB_DROP_REASON_NEIGH_FAILED, NEIGH_FAILED) \ + EM(SKB_DROP_REASON_NEIGH_QUEUEFULL, NEIGH_QUEUEFULL) \ + EM(SKB_DROP_REASON_NEIGH_DEAD, NEIGH_DEAD) \ EMe(SKB_DROP_REASON_MAX, MAX) #undef EM -- cgit v1.2.3 From e8eb9e32999dc5995b19d5141f8ebb38f69696fc Mon Sep 17 00:00:00 2001 From: Dimitris Michailidis Date: Thu, 24 Feb 2022 18:58:55 -0800 Subject: PCI: Add Fungible Vendor ID to pci_ids.h Cc: Bjorn Helgaas Cc: linux-pci@vger.kernel.org Signed-off-by: Dimitris Michailidis Acked-by: Bjorn Helgaas Signed-off-by: David S. Miller --- include/linux/pci_ids.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index aad54c666407..c7e6f2043c7d 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2561,6 +2561,8 @@ #define PCI_VENDOR_ID_HYGON 0x1d94 +#define PCI_VENDOR_ID_FUNGIBLE 0x1dad + #define PCI_VENDOR_ID_HXT 0x1dbf #define PCI_VENDOR_ID_TEKRAM 0x1de1 -- cgit v1.2.3 From 91495f21fcec3a889d6a9c21e6df16c4c15f2184 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 25 Feb 2022 11:22:16 +0200 Subject: net: dsa: tag_8021q: replace the SVL bridging with VLAN-unaware IVL bridging For VLAN-unaware bridging, tag_8021q uses something perhaps a bit too tied with the sja1105 switch: each port uses the same pvid which is also used for standalone operation (a unique one from which the source port and device ID can be retrieved when packets from that port are forwarded to the CPU). Since each port has a unique pvid when performing autonomous forwarding, the switch must be configured for Shared VLAN Learning (SVL) such that the VLAN ID itself is ignored when performing FDB lookups. Without SVL, packets would always be flooded, since FDB lookup in the source port's VLAN would never find any entry. First of all, to make tag_8021q more palatable to switches which might not support Shared VLAN Learning, let's just use a common VLAN for all ports that are under the same bridge. Secondly, using Shared VLAN Learning means that FDB isolation can never be enforced. But if all ports under the same VLAN-unaware bridge share the same VLAN ID, it can. The disadvantage is that the CPU port can no longer perform precise source port identification for these packets. But at least we have a mechanism which has proven to be adequate for that situation: imprecise RX (dsa_find_designated_bridge_port_by_vid), which is what we use for termination on VLAN-aware bridges. The VLAN ID that VLAN-unaware bridges will use with tag_8021q is the same one as we were previously using for imprecise TX (bridge TX forwarding offload). It is already allocated, it is just a matter of using it. Note that because now all ports under the same bridge share the same VLAN, the complexity of performing a tag_8021q bridge join decreases dramatically. We no longer have to install the RX VLAN of a newly joining port into the port membership of the existing bridge ports. The newly joining port just becomes a member of the VLAN corresponding to that bridge, and the other ports are already members of it from when they joined the bridge themselves. So forwarding works properly. This means that we can unhook dsa_tag_8021q_bridge_{join,leave} from the cross-chip notifier level dsa_switch_bridge_{join,leave}. We can put these calls directly into the sja1105 driver. With this new mode of operation, a port controlled by tag_8021q can have two pvids whereas before it could only have one. The pvid for standalone operation is different from the pvid used for VLAN-unaware bridging. This is done, again, so that FDB isolation can be enforced. Let tag_8021q manage this by deleting the standalone pvid when a port joins a bridge, and restoring it when it leaves it. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/linux/dsa/8021q.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index 939a1beaddf7..f47f227baa27 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -32,17 +32,17 @@ int dsa_tag_8021q_register(struct dsa_switch *ds, __be16 proto); void dsa_tag_8021q_unregister(struct dsa_switch *ds); +int dsa_tag_8021q_bridge_join(struct dsa_switch *ds, int port, + struct dsa_bridge bridge); + +void dsa_tag_8021q_bridge_leave(struct dsa_switch *ds, int port, + struct dsa_bridge bridge); + struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, u16 tpid, u16 tci); void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id); -int dsa_tag_8021q_bridge_tx_fwd_offload(struct dsa_switch *ds, int port, - struct dsa_bridge bridge); - -void dsa_tag_8021q_bridge_tx_fwd_unoffload(struct dsa_switch *ds, int port, - struct dsa_bridge bridge); - u16 dsa_8021q_bridge_tx_fwd_offload_vid(unsigned int bridge_num); u16 dsa_tag_8021q_tx_vid(const struct dsa_port *dp); -- cgit v1.2.3 From d7f9787a763f35225287aedb9364c972ae128d18 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 25 Feb 2022 11:22:17 +0200 Subject: net: dsa: tag_8021q: add support for imprecise RX based on the VBID The sja1105 switch can't populate the PORT field of the tag_8021q header when sending a frame to the CPU with a non-zero VBID. Similar to dsa_find_designated_bridge_port_by_vid() which performs imprecise RX for VLAN-aware bridges, let's introduce a helper in tag_8021q for performing imprecise RX based on the VLAN that it has allocated for a VLAN-unaware bridge. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/linux/dsa/8021q.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index f47f227baa27..92f5243b841e 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -41,7 +41,11 @@ void dsa_tag_8021q_bridge_leave(struct dsa_switch *ds, int port, struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, u16 tpid, u16 tci); -void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id); +void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id, + int *vbid); + +struct net_device *dsa_tag_8021q_find_port_by_vbid(struct net_device *master, + int vbid); u16 dsa_8021q_bridge_tx_fwd_offload_vid(unsigned int bridge_num); -- cgit v1.2.3 From 04b67e18ce5b29785578397f6785f28f512d64aa Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 25 Feb 2022 11:22:20 +0200 Subject: net: dsa: tag_8021q: merge RX and TX VLANs In the old Shared VLAN Learning mode of operation that tag_8021q previously used for forwarding, we needed to have distinct concepts for an RX and a TX VLAN. An RX VLAN could be installed on all ports that were members of a given bridge, so that autonomous forwarding could still work, while a TX VLAN was dedicated for precise packet steering, so it just contained the CPU port and one egress port. Now that tag_8021q uses Independent VLAN Learning and imprecise RX/TX all over, those lines have been blurred and we no longer have the need to do precise TX towards a port that is in a bridge. As for standalone ports, it is fine to use the same VLAN ID for both RX and TX. This patch changes the tag_8021q format by shifting the VLAN range it reserves, and halving it. Previously, our DIR bits were encoding the VLAN direction (RX/TX) and were set to either 1 or 2. This meant that tag_8021q reserved 2K VLANs, or 50% of the available range. Change the DIR bits to a hardcoded value of 3 now, which makes tag_8021q reserve only 1K VLANs, and a different range now (the last 1K). This is done so that we leave the old format in place in case we need to return to it. In terms of code, the vid_is_dsa_8021q_rxvlan and vid_is_dsa_8021q_txvlan functions go away. Any vid_is_dsa_8021q is both a TX and an RX VLAN, and they are no longer distinct. For example, felix which did different things for different VLAN types, now needs to handle the RX and the TX logic for the same VLAN. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/linux/dsa/8021q.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index 92f5243b841e..b4e2862633f6 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -49,18 +49,12 @@ struct net_device *dsa_tag_8021q_find_port_by_vbid(struct net_device *master, u16 dsa_8021q_bridge_tx_fwd_offload_vid(unsigned int bridge_num); -u16 dsa_tag_8021q_tx_vid(const struct dsa_port *dp); - -u16 dsa_tag_8021q_rx_vid(const struct dsa_port *dp); +u16 dsa_tag_8021q_standalone_vid(const struct dsa_port *dp); int dsa_8021q_rx_switch_id(u16 vid); int dsa_8021q_rx_source_port(u16 vid); -bool vid_is_dsa_8021q_rxvlan(u16 vid); - -bool vid_is_dsa_8021q_txvlan(u16 vid); - bool vid_is_dsa_8021q(u16 vid); #endif /* _NET_DSA_8021Q_H */ -- cgit v1.2.3 From b6362bdf750b4ba266d4a10156174ec52460e73d Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 25 Feb 2022 11:22:21 +0200 Subject: net: dsa: tag_8021q: rename dsa_8021q_bridge_tx_fwd_offload_vid The dsa_8021q_bridge_tx_fwd_offload_vid is no longer used just for bridge TX forwarding offload, it is the private VLAN reserved for VLAN-unaware bridging in a way that is compatible with FDB isolation. So just rename it dsa_tag_8021q_bridge_vid. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/linux/dsa/8021q.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index b4e2862633f6..3ed117e299ec 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -47,7 +47,7 @@ void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id, struct net_device *dsa_tag_8021q_find_port_by_vbid(struct net_device *master, int vbid); -u16 dsa_8021q_bridge_tx_fwd_offload_vid(unsigned int bridge_num); +u16 dsa_tag_8021q_bridge_vid(unsigned int bridge_num); u16 dsa_tag_8021q_standalone_vid(const struct dsa_port *dp); -- cgit v1.2.3 From c26933639b5402c174c65c01d33f145622784012 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 25 Feb 2022 11:22:22 +0200 Subject: net: dsa: request drivers to perform FDB isolation For DSA, to encourage drivers to perform FDB isolation simply means to track which bridge does each FDB and MDB entry belong to. It then becomes the driver responsibility to use something that makes the FDB entry from one bridge not match the FDB lookup of ports from other bridges. The top-level functions where the bridge is determined are: - dsa_port_fdb_{add,del} - dsa_port_host_fdb_{add,del} - dsa_port_mdb_{add,del} - dsa_port_host_mdb_{add,del} aka the pre-crosschip-notifier functions. Changing the API to pass a reference to a bridge is not superfluous, and looking at the passed bridge argument is not the same as having the driver look at dsa_to_port(ds, port)->bridge from the ->port_fdb_add() method. DSA installs FDB and MDB entries on shared (CPU and DSA) ports as well, and those do not have any dp->bridge information to retrieve, because they are not in any bridge - they are merely the pipes that serve the user ports that are in one or multiple bridges. The struct dsa_bridge associated with each FDB/MDB entry is encapsulated in a larger "struct dsa_db" database. Although only databases associated to bridges are notified for now, this API will be the starting point for implementing IFF_UNICAST_FLT in DSA. There, the idea is to install FDB entries on the CPU port which belong to the corresponding user port's port database. These are supposed to match only when the port is standalone. It is better to introduce the API in its expected final form than to introduce it for bridges first, then to have to change drivers which may have made one or more assumptions. Drivers can use the provided bridge.num, but they can also use a different numbering scheme that is more convenient. DSA must perform refcounting on the CPU and DSA ports by also taking into account the bridge number. So if two bridges request the same local address, DSA must notify the driver twice, once for each bridge. In fact, if the driver supports FDB isolation, DSA must perform refcounting per bridge, but if the driver doesn't, DSA must refcount host addresses across all bridges, otherwise it would be telling the driver to delete an FDB entry for a bridge and the driver would delete it for all bridges. So introduce a bool fdb_isolation in drivers which would make all bridge databases passed to the cross-chip notifier have the same number (0). This makes dsa_mac_addr_find() -> dsa_db_equal() say that all bridge databases are the same database - which is essentially the legacy behavior. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/net/dsa.h | 42 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 36 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 01faba89c987..87c5f18eb381 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -341,11 +341,28 @@ struct dsa_link { struct list_head list; }; +enum dsa_db_type { + DSA_DB_PORT, + DSA_DB_LAG, + DSA_DB_BRIDGE, +}; + +struct dsa_db { + enum dsa_db_type type; + + union { + const struct dsa_port *dp; + struct dsa_lag lag; + struct dsa_bridge bridge; + }; +}; + struct dsa_mac_addr { unsigned char addr[ETH_ALEN]; u16 vid; refcount_t refcount; struct list_head list; + struct dsa_db db; }; struct dsa_vlan { @@ -409,6 +426,13 @@ struct dsa_switch { */ u32 mtu_enforcement_ingress:1; + /* Drivers that isolate the FDBs of multiple bridges must set this + * to true to receive the bridge as an argument in .port_fdb_{add,del} + * and .port_mdb_{add,del}. Otherwise, the bridge.num will always be + * passed as zero. + */ + u32 fdb_isolation:1; + /* Listener for switch fabric events */ struct notifier_block nb; @@ -941,23 +965,29 @@ struct dsa_switch_ops { * Forwarding database */ int (*port_fdb_add)(struct dsa_switch *ds, int port, - const unsigned char *addr, u16 vid); + const unsigned char *addr, u16 vid, + struct dsa_db db); int (*port_fdb_del)(struct dsa_switch *ds, int port, - const unsigned char *addr, u16 vid); + const unsigned char *addr, u16 vid, + struct dsa_db db); int (*port_fdb_dump)(struct dsa_switch *ds, int port, dsa_fdb_dump_cb_t *cb, void *data); int (*lag_fdb_add)(struct dsa_switch *ds, struct dsa_lag lag, - const unsigned char *addr, u16 vid); + const unsigned char *addr, u16 vid, + struct dsa_db db); int (*lag_fdb_del)(struct dsa_switch *ds, struct dsa_lag lag, - const unsigned char *addr, u16 vid); + const unsigned char *addr, u16 vid, + struct dsa_db db); /* * Multicast database */ int (*port_mdb_add)(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_mdb *mdb); + const struct switchdev_obj_port_mdb *mdb, + struct dsa_db db); int (*port_mdb_del)(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_mdb *mdb); + const struct switchdev_obj_port_mdb *mdb, + struct dsa_db db); /* * RXNFC */ -- cgit v1.2.3 From 06b9cce42634a50f2840777a66553b02320db5ef Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 25 Feb 2022 11:22:23 +0200 Subject: net: dsa: pass extack to .port_bridge_join driver methods As FDB isolation cannot be enforced between VLAN-aware bridges in lack of hardware assistance like extra FID bits, it seems plausible that many DSA switches cannot do it. Therefore, they need to reject configurations with multiple VLAN-aware bridges from the two code paths that can transition towards that state: - joining a VLAN-aware bridge - toggling VLAN awareness on an existing bridge The .port_vlan_filtering method already propagates the netlink extack to the driver, let's propagate it from .port_bridge_join too, to make sure that the driver can use the same function for both. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/net/dsa.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 87c5f18eb381..cfedcfb86350 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -937,7 +937,8 @@ struct dsa_switch_ops { int (*set_ageing_time)(struct dsa_switch *ds, unsigned int msecs); int (*port_bridge_join)(struct dsa_switch *ds, int port, struct dsa_bridge bridge, - bool *tx_fwd_offload); + bool *tx_fwd_offload, + struct netlink_ext_ack *extack); void (*port_bridge_leave)(struct dsa_switch *ds, int port, struct dsa_bridge bridge); void (*port_stp_state_set)(struct dsa_switch *ds, int port, @@ -1021,7 +1022,8 @@ struct dsa_switch_ops { */ int (*crosschip_bridge_join)(struct dsa_switch *ds, int tree_index, int sw_index, int port, - struct dsa_bridge bridge); + struct dsa_bridge bridge, + struct netlink_ext_ack *extack); void (*crosschip_bridge_leave)(struct dsa_switch *ds, int tree_index, int sw_index, int port, struct dsa_bridge bridge); -- cgit v1.2.3 From 54c319846086e57071fd0e92d20f2cba0fbf0e79 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 25 Feb 2022 11:22:25 +0200 Subject: net: mscc: ocelot: enforce FDB isolation when VLAN-unaware Currently ocelot uses a pvid of 0 for standalone ports and ports under a VLAN-unaware bridge, and the pvid of the bridge for ports under a VLAN-aware bridge. Standalone ports do not perform learning, but packets received on them are still subject to FDB lookups. So if the MAC DA that a standalone port receives has been also learned on a VLAN-unaware bridge port, ocelot will attempt to forward to that port, even though it can't, so it will drop packets. So there is a desire to avoid that, and isolate the FDBs of different bridges from one another, and from standalone ports. The ocelot switch library has two distinct entry points: the felix DSA driver and the ocelot switchdev driver. We need to code up a minimal bridge_num allocation in the ocelot switchdev driver too, this is copied from DSA with the exception that ocelot does not care about DSA trees, cross-chip bridging etc. So it only looks at its own ports that are already in the same bridge. The ocelot switchdev driver uses the bridge_num it has allocated itself, while the felix driver uses the bridge_num allocated by DSA. They are both stored inside ocelot_port->bridge_num by the common function ocelot_port_bridge_join() which receives the bridge_num passed by value. Once we have a bridge_num, we can only use it to enforce isolation between VLAN-unaware bridges. As far as I can see, ocelot does not have anything like a FID that further makes VLAN 100 from a port be different to VLAN 100 from another port with regard to FDB lookup. So we simply deny multiple VLAN-aware bridges. For VLAN-unaware bridges, we crop the 4000-4095 VLAN region and we allocate a VLAN for each bridge_num. This will be used as the pvid of each port that is under that VLAN-unaware bridge, for as long as that bridge is VLAN-unaware. VID 0 remains only for standalone ports. It is okay if all standalone ports use the same VID 0, since they perform no address learning, the FDB will contain no entry in VLAN 0, so the packets will always be flooded to the only possible destination, the CPU port. The CPU port module doesn't need to be member of the VLANs to receive packets, but if we use the DSA tag_8021q protocol, those packets are part of the data plane as far as ocelot is concerned, so there it needs to. Just ensure that the DSA tag_8021q CPU port is a member of all reserved VLANs when it is created, and is removed when it is deleted. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/soc/mscc/ocelot.h | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index dd4fc34d2992..ee3c59639d70 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -668,6 +668,7 @@ struct ocelot_port { u16 mrp_ring_id; struct net_device *bridge; + int bridge_num; u8 stp_state; int speed; @@ -713,6 +714,8 @@ struct ocelot { enum ocelot_tag_prefix npi_inj_prefix; enum ocelot_tag_prefix npi_xtr_prefix; + unsigned long bridges; + struct list_head multicast; struct list_head pgids; @@ -846,6 +849,9 @@ void ocelot_deinit(struct ocelot *ocelot); void ocelot_init_port(struct ocelot *ocelot, int port); void ocelot_deinit_port(struct ocelot *ocelot, int port); +void ocelot_port_set_dsa_8021q_cpu(struct ocelot *ocelot, int port); +void ocelot_port_unset_dsa_8021q_cpu(struct ocelot *ocelot, int port); + /* DSA callbacks */ void ocelot_get_strings(struct ocelot *ocelot, int port, u32 sset, u8 *data); void ocelot_get_ethtool_stats(struct ocelot *ocelot, int port, u64 *data); @@ -863,21 +869,24 @@ int ocelot_port_pre_bridge_flags(struct ocelot *ocelot, int port, struct switchdev_brport_flags val); void ocelot_port_bridge_flags(struct ocelot *ocelot, int port, struct switchdev_brport_flags val); -void ocelot_port_bridge_join(struct ocelot *ocelot, int port, - struct net_device *bridge); +int ocelot_port_bridge_join(struct ocelot *ocelot, int port, + struct net_device *bridge, int bridge_num, + struct netlink_ext_ack *extack); void ocelot_port_bridge_leave(struct ocelot *ocelot, int port, struct net_device *bridge); int ocelot_mact_flush(struct ocelot *ocelot, int port); int ocelot_fdb_dump(struct ocelot *ocelot, int port, dsa_fdb_dump_cb_t *cb, void *data); -int ocelot_fdb_add(struct ocelot *ocelot, int port, - const unsigned char *addr, u16 vid); -int ocelot_fdb_del(struct ocelot *ocelot, int port, - const unsigned char *addr, u16 vid); +int ocelot_fdb_add(struct ocelot *ocelot, int port, const unsigned char *addr, + u16 vid, const struct net_device *bridge); +int ocelot_fdb_del(struct ocelot *ocelot, int port, const unsigned char *addr, + u16 vid, const struct net_device *bridge); int ocelot_lag_fdb_add(struct ocelot *ocelot, struct net_device *bond, - const unsigned char *addr, u16 vid); + const unsigned char *addr, u16 vid, + const struct net_device *bridge); int ocelot_lag_fdb_del(struct ocelot *ocelot, struct net_device *bond, - const unsigned char *addr, u16 vid); + const unsigned char *addr, u16 vid, + const struct net_device *bridge); int ocelot_vlan_prepare(struct ocelot *ocelot, int port, u16 vid, bool pvid, bool untagged, struct netlink_ext_ack *extack); int ocelot_vlan_add(struct ocelot *ocelot, int port, u16 vid, bool pvid, @@ -901,9 +910,11 @@ int ocelot_cls_flower_destroy(struct ocelot *ocelot, int port, int ocelot_cls_flower_stats(struct ocelot *ocelot, int port, struct flow_cls_offload *f, bool ingress); int ocelot_port_mdb_add(struct ocelot *ocelot, int port, - const struct switchdev_obj_port_mdb *mdb); + const struct switchdev_obj_port_mdb *mdb, + const struct net_device *bridge); int ocelot_port_mdb_del(struct ocelot *ocelot, int port, - const struct switchdev_obj_port_mdb *mdb); + const struct switchdev_obj_port_mdb *mdb, + const struct net_device *bridge); int ocelot_port_lag_join(struct ocelot *ocelot, int port, struct net_device *bond, struct netdev_lag_upper_info *info); -- cgit v1.2.3 From b8cd5831c61cadee4493248babef8386f836f31c Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Thu, 24 Feb 2022 10:29:07 +0000 Subject: net: flow_offload: add tc police action parameters The current police offload action entry is missing exceed/notexceed actions and parameters that can be configured by tc police action. Add the missing parameters as a pre-step for offloading police actions to hardware. Signed-off-by: Jianbo Liu Signed-off-by: Roi Dayan Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/flow_offload.h | 9 +++++++++ include/net/tc_act/tc_police.h | 30 ++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) (limited to 'include') diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 5b8c54eb7a6b..74f44d44abe3 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -148,6 +148,8 @@ enum flow_action_id { FLOW_ACTION_MPLS_MANGLE, FLOW_ACTION_GATE, FLOW_ACTION_PPPOE_PUSH, + FLOW_ACTION_JUMP, + FLOW_ACTION_PIPE, NUM_FLOW_ACTIONS, }; @@ -235,9 +237,16 @@ struct flow_action_entry { struct { /* FLOW_ACTION_POLICE */ u32 burst; u64 rate_bytes_ps; + u64 peakrate_bytes_ps; + u32 avrate; + u16 overhead; u64 burst_pkt; u64 rate_pkt_ps; u32 mtu; + struct { + enum flow_action_id act_id; + u32 extval; + } exceed, notexceed; } police; struct { /* FLOW_ACTION_CT */ int action; diff --git a/include/net/tc_act/tc_police.h b/include/net/tc_act/tc_police.h index 72649512dcdd..283bde711a42 100644 --- a/include/net/tc_act/tc_police.h +++ b/include/net/tc_act/tc_police.h @@ -159,4 +159,34 @@ static inline u32 tcf_police_tcfp_mtu(const struct tc_action *act) return params->tcfp_mtu; } +static inline u64 tcf_police_peakrate_bytes_ps(const struct tc_action *act) +{ + struct tcf_police *police = to_police(act); + struct tcf_police_params *params; + + params = rcu_dereference_protected(police->params, + lockdep_is_held(&police->tcf_lock)); + return params->peak.rate_bytes_ps; +} + +static inline u32 tcf_police_tcfp_ewma_rate(const struct tc_action *act) +{ + struct tcf_police *police = to_police(act); + struct tcf_police_params *params; + + params = rcu_dereference_protected(police->params, + lockdep_is_held(&police->tcf_lock)); + return params->tcfp_ewma_rate; +} + +static inline u16 tcf_police_rate_overhead(const struct tc_action *act) +{ + struct tcf_police *police = to_police(act); + struct tcf_police_params *params; + + params = rcu_dereference_protected(police->params, + lockdep_is_held(&police->tcf_lock)); + return params->rate.overhead; +} + #endif /* __NET_TC_POLICE_H */ -- cgit v1.2.3 From d97b4b105ce71f9f907f1511986cc1d224126772 Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Thu, 24 Feb 2022 10:29:08 +0000 Subject: flow_offload: reject offload for all drivers with invalid police parameters As more police parameters are passed to flow_offload, driver can check them to make sure hardware handles packets in the way indicated by tc. The conform-exceed control should be drop/pipe or drop/ok. Besides, for drop/ok, the police should be the last action. As hardware can't configure peakrate/avrate/overhead, offload should not be supported if any of them is configured. Signed-off-by: Jianbo Liu Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- include/net/flow_offload.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 74f44d44abe3..92267d23083e 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -311,6 +311,12 @@ static inline bool flow_offload_has_one_action(const struct flow_action *action) return action->num_entries == 1; } +static inline bool flow_action_is_last_entry(const struct flow_action *action, + const struct flow_action_entry *entry) +{ + return entry == &action->entries[action->num_entries - 1]; +} + #define flow_action_for_each(__i, __act, __actions) \ for (__i = 0, __act = &(__actions)->entries[0]; \ __i < (__actions)->num_entries; \ -- cgit v1.2.3 From a5081bad2eac5108593ac36b6551201f0df9e897 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Sat, 26 Feb 2022 14:56:22 +0000 Subject: net: phylink: remove phylink_set_pcs() As all users of phylink_set_pcs() have now been updated to use the mac_select_pcs() method, it can be removed from the phylink kernel API and its functionality moved into phylink_major_config(). Removing phylink_set_pcs() gives us a single approach for attaching a PCS within phylink. Signed-off-by: Russell King (Oracle) Signed-off-by: David S. Miller --- include/linux/phylink.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 9ef9b7047f19..223781622b33 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -532,7 +532,6 @@ void phylink_generic_validate(struct phylink_config *config, struct phylink *phylink_create(struct phylink_config *, struct fwnode_handle *, phy_interface_t iface, const struct phylink_mac_ops *mac_ops); -void phylink_set_pcs(struct phylink *, struct phylink_pcs *pcs); void phylink_destroy(struct phylink *); int phylink_connect_phy(struct phylink *, struct phy_device *); -- cgit v1.2.3 From 7b8135f4df98b155b23754b6065c157861e268f1 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 1 Mar 2022 05:04:34 +0000 Subject: rtnetlink: add new rtm tunnel api for tunnel id filtering This patch adds new rtm tunnel msg and api for tunnel id filtering in dst_metadata devices. First dst_metadata device to use the api is vxlan driver with AF_BRIDGE family. This and later changes add ability in vxlan driver to do tunnel id filtering (or vni filtering) on dst_metadata devices. This is similar to vlan api in the vlan filtering bridge. this patch includes selinux nlmsg_route_perms support for RTM_*TUNNEL api from Benjamin Poirier. Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 26 ++++++++++++++++++++++++++ include/uapi/linux/rtnetlink.h | 9 +++++++++ 2 files changed, 35 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index be09d2ad4b5d..3dfc9ff2ec9b 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -713,7 +713,32 @@ enum ipvlan_mode { #define IPVLAN_F_PRIVATE 0x01 #define IPVLAN_F_VEPA 0x02 +/* Tunnel RTM header */ +struct tunnel_msg { + __u8 family; + __u8 reserved1; + __u16 reserved2; + __u32 ifindex; +}; + /* VXLAN section */ +enum { + VXLAN_VNIFILTER_ENTRY_UNSPEC, + VXLAN_VNIFILTER_ENTRY_START, + VXLAN_VNIFILTER_ENTRY_END, + VXLAN_VNIFILTER_ENTRY_GROUP, + VXLAN_VNIFILTER_ENTRY_GROUP6, + __VXLAN_VNIFILTER_ENTRY_MAX +}; +#define VXLAN_VNIFILTER_ENTRY_MAX (__VXLAN_VNIFILTER_ENTRY_MAX - 1) + +enum { + VXLAN_VNIFILTER_UNSPEC, + VXLAN_VNIFILTER_ENTRY, + __VXLAN_VNIFILTER_MAX +}; +#define VXLAN_VNIFILTER_MAX (__VXLAN_VNIFILTER_MAX - 1) + enum { IFLA_VXLAN_UNSPEC, IFLA_VXLAN_ID, @@ -745,6 +770,7 @@ enum { IFLA_VXLAN_GPE, IFLA_VXLAN_TTL_INHERIT, IFLA_VXLAN_DF, + IFLA_VXLAN_VNIFILTER, /* only applicable with COLLECT_METADATA mode */ __IFLA_VXLAN_MAX }; #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 93d934cc4613..0970cb4b1b88 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -185,6 +185,13 @@ enum { RTM_GETNEXTHOPBUCKET, #define RTM_GETNEXTHOPBUCKET RTM_GETNEXTHOPBUCKET + RTM_NEWTUNNEL = 120, +#define RTM_NEWTUNNEL RTM_NEWTUNNEL + RTM_DELTUNNEL, +#define RTM_DELTUNNEL RTM_DELTUNNEL + RTM_GETTUNNEL, +#define RTM_GETTUNNEL RTM_GETTUNNEL + __RTM_MAX, #define RTM_MAX (((__RTM_MAX + 3) & ~3) - 1) }; @@ -756,6 +763,8 @@ enum rtnetlink_groups { #define RTNLGRP_BRVLAN RTNLGRP_BRVLAN RTNLGRP_MCTP_IFADDR, #define RTNLGRP_MCTP_IFADDR RTNLGRP_MCTP_IFADDR + RTNLGRP_TUNNEL, +#define RTNLGRP_TUNNEL RTNLGRP_TUNNEL __RTNLGRP_MAX }; #define RTNLGRP_MAX (__RTNLGRP_MAX - 1) -- cgit v1.2.3 From f9c4bb0b245cee35ef66f75bf409c9573d934cf9 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 1 Mar 2022 05:04:36 +0000 Subject: vxlan: vni filtering support on collect metadata device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds vnifiltering support to collect metadata device. Motivation: You can only use a single vxlan collect metadata device for a given vxlan udp port in the system today. The vxlan collect metadata device terminates all received vxlan packets. As shown in the below diagram, there are use-cases where you need to support multiple such vxlan devices in independent bridge domains. Each vxlan device must terminate the vni's it is configured for. Example usecase: In a service provider network a service provider typically supports multiple bridge domains with overlapping vlans. One bridge domain per customer. Vlans in each bridge domain are mapped to globally unique vxlan ranges assigned to each customer. vnifiltering support in collect metadata devices terminates only configured vnis. This is similar to vlan filtering in bridge driver. The vni filtering capability is provided by a new flag on collect metadata device. In the below pic: - customer1 is mapped to br1 bridge domain - customer2 is mapped to br2 bridge domain - customer1 vlan 10-11 is mapped to vni 1001-1002 - customer2 vlan 10-11 is mapped to vni 2001-2002 - br1 and br2 are vlan filtering bridges - vxlan1 and vxlan2 are collect metadata devices with vnifiltering enabled ┌──────────────────────────────────────────────────────────────────┐ │ switch │ │ │ │ ┌───────────┐ ┌───────────┐ │ │ │ │ │ │ │ │ │ br1 │ │ br2 │ │ │ └┬─────────┬┘ └──┬───────┬┘ │ │ vlans│ │ vlans │ │ │ │ 10,11│ │ 10,11│ │ │ │ │ vlanvnimap: │ vlanvnimap: │ │ │ 10-1001,11-1002 │ 10-2001,11-2002 │ │ │ │ │ │ │ │ ┌──────┴┐ ┌──┴─────────┐ ┌───┴────┐ │ │ │ │ swp1 │ │vxlan1 │ │ swp2 │ ┌┴─────────────┐ │ │ │ │ │ vnifilter:│ │ │ │vxlan2 │ │ │ └───┬───┘ │ 1001,1002│ └───┬────┘ │ vnifilter: │ │ │ │ └────────────┘ │ │ 2001,2002 │ │ │ │ │ └──────────────┘ │ │ │ │ │ └───────┼──────────────────────────────────┼───────────────────────┘ │ │ │ │ ┌─────┴───────┐ │ │ customer1 │ ┌─────┴──────┐ │ host/VM │ │customer2 │ └─────────────┘ │ host/VM │ └────────────┘ With this implementation, vxlan dst metadata device can be associated with range of vnis. struct vxlan_vni_node is introduced to represent a configured vni. We start with vni and its associated remote_ip in this structure. This structure can be extended to bring in other per vni attributes if there are usecases for it. A vni inherits an attribute from the base vxlan device if there is no per vni attributes defined. struct vxlan_dev gets a new rhashtable for vnis called vxlan_vni_group. vxlan_vnifilter.c implements the necessary netlink api, notifications and helper functions to process and manage lifecycle of vxlan_vni_node. This patch also adds new helper functions in vxlan_multicast.c to handle per vni remote_ip multicast groups which are part of vxlan_vni_group. Fix build problems: Reported-by: kernel test robot Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/net/vxlan.h | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 5a934bebe630..8eb961bb9589 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -232,6 +232,25 @@ struct vxlan_dev_node { struct vxlan_dev *vxlan; }; +struct vxlan_vni_node { + struct rhash_head vnode; + struct vxlan_dev_node hlist4; /* vni hash table for IPv4 socket */ +#if IS_ENABLED(CONFIG_IPV6) + struct vxlan_dev_node hlist6; /* vni hash table for IPv6 socket */ +#endif + struct list_head vlist; + __be32 vni; + union vxlan_addr remote_ip; /* default remote ip for this vni */ + + struct rcu_head rcu; +}; + +struct vxlan_vni_group { + struct rhashtable vni_hash; + struct list_head vni_list; + u32 num_vnis; +}; + /* Pseudo network device */ struct vxlan_dev { struct vxlan_dev_node hlist4; /* vni hash table for IPv4 socket */ @@ -254,6 +273,8 @@ struct vxlan_dev { struct vxlan_config cfg; + struct vxlan_vni_group __rcu *vnigrp; + struct hlist_head fdb_head[FDB_HASH_SIZE]; }; @@ -274,6 +295,7 @@ struct vxlan_dev { #define VXLAN_F_GPE 0x4000 #define VXLAN_F_IPV6_LINKLOCAL 0x8000 #define VXLAN_F_TTL_INHERIT 0x10000 +#define VXLAN_F_VNIFILTER 0x20000 /* Flags that are used in the receive path. These flags must match in * order for a socket to be shareable @@ -283,7 +305,8 @@ struct vxlan_dev { VXLAN_F_UDP_ZERO_CSUM6_RX | \ VXLAN_F_REMCSUM_RX | \ VXLAN_F_REMCSUM_NOPARTIAL | \ - VXLAN_F_COLLECT_METADATA) + VXLAN_F_COLLECT_METADATA | \ + VXLAN_F_VNIFILTER) /* Flags that can be set together with VXLAN_F_GPE. */ #define VXLAN_F_ALLOWED_GPE (VXLAN_F_GPE | \ @@ -292,7 +315,8 @@ struct vxlan_dev { VXLAN_F_UDP_ZERO_CSUM_TX | \ VXLAN_F_UDP_ZERO_CSUM6_TX | \ VXLAN_F_UDP_ZERO_CSUM6_RX | \ - VXLAN_F_COLLECT_METADATA) + VXLAN_F_COLLECT_METADATA | \ + VXLAN_F_VNIFILTER) struct net_device *vxlan_dev_create(struct net *net, const char *name, u8 name_assign_type, struct vxlan_config *conf); -- cgit v1.2.3 From 4095e0e1328a3cd9e3b30174d6cb0edb3824256d Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 1 Mar 2022 05:04:38 +0000 Subject: drivers: vxlan: vnifilter: per vni stats Add per-vni statistics for vni filter mode. Counting Rx/Tx bytes/packets/drops/errors at the appropriate places. This patch changes vxlan_vs_find_vni to also return the vxlan_vni_node in cases where the vni belongs to a vni filtering vxlan device Signed-off-by: Nikolay Aleksandrov Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/net/vxlan.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include') diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 8eb961bb9589..bca5b01af247 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -227,6 +227,31 @@ struct vxlan_config { enum ifla_vxlan_df df; }; +enum { + VXLAN_VNI_STATS_RX, + VXLAN_VNI_STATS_RX_DROPS, + VXLAN_VNI_STATS_RX_ERRORS, + VXLAN_VNI_STATS_TX, + VXLAN_VNI_STATS_TX_DROPS, + VXLAN_VNI_STATS_TX_ERRORS, +}; + +struct vxlan_vni_stats { + u64 rx_packets; + u64 rx_bytes; + u64 rx_drops; + u64 rx_errors; + u64 tx_packets; + u64 tx_bytes; + u64 tx_drops; + u64 tx_errors; +}; + +struct vxlan_vni_stats_pcpu { + struct vxlan_vni_stats stats; + struct u64_stats_sync syncp; +}; + struct vxlan_dev_node { struct hlist_node hlist; struct vxlan_dev *vxlan; @@ -241,6 +266,7 @@ struct vxlan_vni_node { struct list_head vlist; __be32 vni; union vxlan_addr remote_ip; /* default remote ip for this vni */ + struct vxlan_vni_stats_pcpu __percpu *stats; struct rcu_head rcu; }; -- cgit v1.2.3 From 445b2f36bb4efb81f064e931f28b9ec19f114355 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 1 Mar 2022 05:04:39 +0000 Subject: drivers: vxlan: vnifilter: add support for stats dumping Add support for VXLAN vni filter entries' stats dumping Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 3dfc9ff2ec9b..e315e53125f4 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -716,18 +716,41 @@ enum ipvlan_mode { /* Tunnel RTM header */ struct tunnel_msg { __u8 family; - __u8 reserved1; + __u8 flags; __u16 reserved2; __u32 ifindex; }; /* VXLAN section */ + +/* include statistics in the dump */ +#define TUNNEL_MSG_FLAG_STATS 0x01 + +#define TUNNEL_MSG_VALID_USER_FLAGS TUNNEL_MSG_FLAG_STATS + +/* Embedded inside VXLAN_VNIFILTER_ENTRY_STATS */ +enum { + VNIFILTER_ENTRY_STATS_UNSPEC, + VNIFILTER_ENTRY_STATS_RX_BYTES, + VNIFILTER_ENTRY_STATS_RX_PKTS, + VNIFILTER_ENTRY_STATS_RX_DROPS, + VNIFILTER_ENTRY_STATS_RX_ERRORS, + VNIFILTER_ENTRY_STATS_TX_BYTES, + VNIFILTER_ENTRY_STATS_TX_PKTS, + VNIFILTER_ENTRY_STATS_TX_DROPS, + VNIFILTER_ENTRY_STATS_TX_ERRORS, + VNIFILTER_ENTRY_STATS_PAD, + __VNIFILTER_ENTRY_STATS_MAX +}; +#define VNIFILTER_ENTRY_STATS_MAX (__VNIFILTER_ENTRY_STATS_MAX - 1) + enum { VXLAN_VNIFILTER_ENTRY_UNSPEC, VXLAN_VNIFILTER_ENTRY_START, VXLAN_VNIFILTER_ENTRY_END, VXLAN_VNIFILTER_ENTRY_GROUP, VXLAN_VNIFILTER_ENTRY_GROUP6, + VXLAN_VNIFILTER_ENTRY_STATS, __VXLAN_VNIFILTER_ENTRY_MAX }; #define VXLAN_VNIFILTER_ENTRY_MAX (__VXLAN_VNIFILTER_ENTRY_MAX - 1) -- cgit v1.2.3 From 462791bbfa350189e309a5a94541f6b63cd874e8 Mon Sep 17 00:00:00 2001 From: Dust Li Date: Tue, 1 Mar 2022 17:43:56 +0800 Subject: net/smc: add sysctl interface for SMC This patch add sysctl interface to support container environment for SMC as we talk in the mail list. Link: https://lore.kernel.org/netdev/20220224020253.GF5443@linux.alibaba.com Co-developed-by: Tony Lu Signed-off-by: Tony Lu Signed-off-by: Dust Li Signed-off-by: David S. Miller --- include/net/netns/smc.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index 47b166684fd8..1682eae50579 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -14,5 +14,8 @@ struct netns_smc { struct smc_stats_rsn *fback_rsn; bool limit_smc_hs; /* constraint on handshake */ +#ifdef CONFIG_SYSCTL + struct ctl_table_header *smc_hdr; +#endif }; #endif -- cgit v1.2.3 From 12bbb0d163a90d81a2677cf7808d364697290207 Mon Sep 17 00:00:00 2001 From: Dust Li Date: Tue, 1 Mar 2022 17:43:58 +0800 Subject: net/smc: add sysctl for autocorking This add a new sysctl: net.smc.autocorking_size We can dynamically change the behaviour of autocorking by change the value of autocorking_size. Setting to 0 disables autocorking in SMC Signed-off-by: Dust Li Signed-off-by: David S. Miller --- include/net/netns/smc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index 1682eae50579..e5389eeaf8bd 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -17,5 +17,6 @@ struct netns_smc { #ifdef CONFIG_SYSCTL struct ctl_table_header *smc_hdr; #endif + unsigned int sysctl_autocorking_size; }; #endif -- cgit v1.2.3 From dd0ca255f3d27a1bb43d8e9529fb3645f9a341a3 Mon Sep 17 00:00:00 2001 From: Daniel Braunwarth Date: Mon, 28 Feb 2022 14:30:28 +0100 Subject: if_ether.h: add PROFINET Ethertype Add the Ethertype for PROFINET protocol. Signed-off-by: Daniel Braunwarth Signed-off-by: Jakub Kicinski --- include/uapi/linux/if_ether.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index c0c2f3ed5729..4f4ed35a16db 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -86,6 +86,7 @@ * over Ethernet */ #define ETH_P_PAE 0x888E /* Port Access Entity (IEEE 802.1X) */ +#define ETH_P_PROFINET 0x8892 /* PROFINET */ #define ETH_P_REALTEK 0x8899 /* Multiple proprietary protocols */ #define ETH_P_AOE 0x88A2 /* ATA over Ethernet */ #define ETH_P_8021AD 0x88A8 /* 802.1ad Service VLAN */ -- cgit v1.2.3 From cd73cda742fbe1f33ed7306c7a01aa64f4e6ebd5 Mon Sep 17 00:00:00 2001 From: Daniel Braunwarth Date: Mon, 28 Feb 2022 14:30:29 +0100 Subject: if_ether.h: add EtherCAT Ethertype Add the Ethertype for EtherCAT protocol. Signed-off-by: Daniel Braunwarth Signed-off-by: Jakub Kicinski --- include/uapi/linux/if_ether.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index 4f4ed35a16db..1d0bccc3fa54 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -89,6 +89,7 @@ #define ETH_P_PROFINET 0x8892 /* PROFINET */ #define ETH_P_REALTEK 0x8899 /* Multiple proprietary protocols */ #define ETH_P_AOE 0x88A2 /* ATA over Ethernet */ +#define ETH_P_ETHERCAT 0x88A4 /* EtherCAT */ #define ETH_P_8021AD 0x88A8 /* 802.1ad Service VLAN */ #define ETH_P_802_EX1 0x88B5 /* 802.1 Local Experimental 1. */ #define ETH_P_PREAUTH 0x88C7 /* 802.11 Preauthentication */ -- cgit v1.2.3 From bf08824a0f4776fc0626b82b6924fa1a5643eacb Mon Sep 17 00:00:00 2001 From: Kurt Kanzenbach Date: Mon, 28 Feb 2022 20:58:56 +0100 Subject: flow_dissector: Add support for HSR Network drivers such as igb or igc call eth_get_headlen() to determine the header length for their to be constructed skbs in receive path. When running HSR on top of these drivers, it results in triggering BUG_ON() in skb_pull(). The reason is the skb headlen is not sufficient for HSR to work correctly. skb_pull() notices that. For instance, eth_get_headlen() returns 14 bytes for TCP traffic over HSR which is not correct. The problem is, the flow dissection code does not take HSR into account. Therefore, add support for it. Reported-by: Anthony Harivel Signed-off-by: Kurt Kanzenbach Link: https://lore.kernel.org/r/20220228195856.88187-1-kurt@linutronix.de Signed-off-by: Jakub Kicinski --- include/linux/if_hsr.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/linux/if_hsr.h b/include/linux/if_hsr.h index 38bbc537d4e4..408539d5ea5f 100644 --- a/include/linux/if_hsr.h +++ b/include/linux/if_hsr.h @@ -9,6 +9,22 @@ enum hsr_version { PRP_V1, }; +/* HSR Tag. + * As defined in IEC-62439-3:2010, the HSR tag is really { ethertype = 0x88FB, + * path, LSDU_size, sequence Nr }. But we let eth_header() create { h_dest, + * h_source, h_proto = 0x88FB }, and add { path, LSDU_size, sequence Nr, + * encapsulated protocol } instead. + * + * Field names as defined in the IEC:2010 standard for HSR. + */ +struct hsr_tag { + __be16 path_and_LSDU_size; + __be16 sequence_nr; + __be16 encap_proto; +} __packed; + +#define HSR_HLEN 6 + #if IS_ENABLED(CONFIG_HSR) extern bool is_hsr_master(struct net_device *dev); extern int hsr_get_version(struct net_device *dev, enum hsr_version *ver); -- cgit v1.2.3 From 42f0c1934c7cb3e94c2fe8f5771245fa5631d0e7 Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Tue, 1 Mar 2022 06:35:42 -0800 Subject: tcp: Remove the unused api Last tcp_write_queue_head() use was removed in commit 114f39feab36 ("tcp: restore autocorking"), so remove it. Signed-off-by: Tao Chen Link: https://lore.kernel.org/r/SYZP282MB33317DEE1253B37C0F57231E86029@SYZP282MB3331.AUSP282.PROD.OUTLOOK.COM Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 479a27777ad6..d486d7b6112d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1831,11 +1831,6 @@ static inline struct sk_buff *tcp_rtx_queue_tail(const struct sock *sk) return skb_rb_last(&sk->tcp_rtx_queue); } -static inline struct sk_buff *tcp_write_queue_head(const struct sock *sk) -{ - return skb_peek(&sk->sk_write_queue); -} - static inline struct sk_buff *tcp_write_queue_tail(const struct sock *sk) { return skb_peek_tail(&sk->sk_write_queue); -- cgit v1.2.3 From 8610037e8106b48c79cfe0afb92b2b2466e51c3d Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Tue, 1 Mar 2022 23:55:47 -0800 Subject: page_pool: Add allocation stats Add per-pool statistics counters for the allocation path of a page pool. These stats are incremented in softirq context, so no locking or per-cpu variables are needed. This code is disabled by default and a kernel config option is provided for users who wish to enable them. The statistics added are: - fast: successful fast path allocations - slow: slow path order-0 allocations - slow_high_order: slow path high order allocations - empty: ptr ring is empty, so a slow path allocation was forced. - refill: an allocation which triggered a refill of the cache - waive: pages obtained from the ptr ring that cannot be added to the cache due to a NUMA mismatch. Signed-off-by: Joe Damato Acked-by: Jesper Dangaard Brouer Reviewed-by: Ilias Apalodimas Signed-off-by: David S. Miller --- include/net/page_pool.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include') diff --git a/include/net/page_pool.h b/include/net/page_pool.h index 97c3c19872ff..1f27e8a48830 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -84,6 +84,19 @@ struct page_pool_params { void *init_arg; }; +#ifdef CONFIG_PAGE_POOL_STATS +struct page_pool_alloc_stats { + u64 fast; /* fast path allocations */ + u64 slow; /* slow-path order 0 allocations */ + u64 slow_high_order; /* slow-path high order allocations */ + u64 empty; /* failed refills due to empty ptr ring, forcing + * slow path allocation + */ + u64 refill; /* allocations via successful refill */ + u64 waive; /* failed refills due to numa zone mismatch */ +}; +#endif + struct page_pool { struct page_pool_params p; @@ -96,6 +109,11 @@ struct page_pool { unsigned int frag_offset; struct page *frag_page; long frag_users; + +#ifdef CONFIG_PAGE_POOL_STATS + /* these stats are incremented while in softirq context */ + struct page_pool_alloc_stats alloc_stats; +#endif u32 xdp_mem_id; /* -- cgit v1.2.3 From ad6fa1e1ab1b8164f1ba296b1b4dc556a483bcad Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Tue, 1 Mar 2022 23:55:48 -0800 Subject: page_pool: Add recycle stats Add per-cpu stats tracking page pool recycling events: - cached: recycling placed page in the page pool cache - cache_full: page pool cache was full - ring: page placed into the ptr ring - ring_full: page released from page pool because the ptr ring was full - released_refcnt: page released (and not recycled) because refcnt > 1 Signed-off-by: Joe Damato Acked-by: Jesper Dangaard Brouer Reviewed-by: Ilias Apalodimas Signed-off-by: David S. Miller --- include/net/page_pool.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/net/page_pool.h b/include/net/page_pool.h index 1f27e8a48830..298af95bbf96 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -95,6 +95,18 @@ struct page_pool_alloc_stats { u64 refill; /* allocations via successful refill */ u64 waive; /* failed refills due to numa zone mismatch */ }; + +struct page_pool_recycle_stats { + u64 cached; /* recycling placed page in the cache. */ + u64 cache_full; /* cache was full */ + u64 ring; /* recycling placed page back into ptr ring */ + u64 ring_full; /* page was released from page-pool because + * PTR ring was full. + */ + u64 released_refcnt; /* page released because of elevated + * refcnt + */ +}; #endif struct page_pool { @@ -144,6 +156,10 @@ struct page_pool { */ struct ptr_ring ring; +#ifdef CONFIG_PAGE_POOL_STATS + /* recycle stats are per-cpu to avoid locking */ + struct page_pool_recycle_stats __percpu *recycle_stats; +#endif atomic_t pages_state_release_cnt; /* A page_pool is strictly tied to a single RX-queue being -- cgit v1.2.3 From 6b95e3388b1ea0ca63500c5a6e39162dbf828433 Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Tue, 1 Mar 2022 23:55:49 -0800 Subject: page_pool: Add function to batch and return stats Adds a function page_pool_get_stats which can be used by drivers to obtain stats for a specified page_pool. Signed-off-by: Joe Damato Acked-by: Jesper Dangaard Brouer Reviewed-by: Ilias Apalodimas Signed-off-by: David S. Miller --- include/net/page_pool.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include') diff --git a/include/net/page_pool.h b/include/net/page_pool.h index 298af95bbf96..ea5fb70e5101 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -107,6 +107,23 @@ struct page_pool_recycle_stats { * refcnt */ }; + +/* This struct wraps the above stats structs so users of the + * page_pool_get_stats API can pass a single argument when requesting the + * stats for the page pool. + */ +struct page_pool_stats { + struct page_pool_alloc_stats alloc_stats; + struct page_pool_recycle_stats recycle_stats; +}; + +/* + * Drivers that wish to harvest page pool stats and report them to users + * (perhaps via ethtool, debugfs, or another mechanism) can allocate a + * struct page_pool_stats call page_pool_get_stats to get stats for the specified pool. + */ +bool page_pool_get_stats(struct page_pool *pool, + struct page_pool_stats *stats); #endif struct page_pool { -- cgit v1.2.3 From 46efc97b73060823fdc18103a5e317a8327d44e1 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 2 Mar 2022 18:31:17 +0200 Subject: net: rtnetlink: RTM_GETSTATS: Allow filtering inside nests The filter_mask field of RTM_GETSTATS header determines which top-level attributes should be included in the netlink response. This saves processing time by only including the bits that the user cares about instead of always dumping everything. This is doubly important for HW-backed statistics that would typically require a trip to the device to fetch the stats. So far there was only one HW-backed stat suite per attribute. However, IFLA_STATS_LINK_OFFLOAD_XSTATS is a nest, and will gain a new stat suite in the following patches. It would therefore be advantageous to be able to filter within that nest, and select just one or the other HW-backed statistics suite. Extend rtnetlink so that RTM_GETSTATS permits attributes in the payload. The scheme is as follows: - RTM_GETSTATS - struct if_stats_msg - attr nest IFLA_STATS_GET_FILTERS - attr IFLA_STATS_LINK_OFFLOAD_XSTATS - u32 filter_mask This scheme reuses the existing enumerators by nesting them in a dedicated context attribute. This is covered by policies as usual, therefore a gradual opt-in is possible. Currently only IFLA_STATS_LINK_OFFLOAD_XSTATS nest has filtering enabled, because for the SW counters the issue does not seem to be that important. rtnl_offload_xstats_get_size() and _fill() are extended to observe the requested filters. Signed-off-by: Petr Machata Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index e315e53125f4..4d62ea6e1288 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1207,6 +1207,16 @@ enum { #define IFLA_STATS_FILTER_BIT(ATTR) (1 << (ATTR - 1)) +enum { + IFLA_STATS_GETSET_UNSPEC, + IFLA_STATS_GET_FILTERS, /* Nest of IFLA_STATS_LINK_xxx, each a u32 with + * a filter mask for the corresponding group. + */ + __IFLA_STATS_GETSET_MAX, +}; + +#define IFLA_STATS_GETSET_MAX (__IFLA_STATS_GETSET_MAX - 1) + /* These are embedded into IFLA_STATS_LINK_XSTATS: * [IFLA_STATS_LINK_XSTATS] * -> [LINK_XSTATS_TYPE_xxx] -- cgit v1.2.3 From 9309f97aef6d8250bb484dabeac925c3a7c57716 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 2 Mar 2022 18:31:20 +0200 Subject: net: dev: Add hardware stats support Offloading switch device drivers may be able to collect statistics of the traffic taking place in the HW datapath that pertains to a certain soft netdevice, such as VLAN. Add the necessary infrastructure to allow exposing these statistics to the offloaded netdevice in question. The API was shaped by the following considerations: - Collection of HW statistics is not free: there may be a finite number of counters, and the act of counting may have a performance impact. It is therefore necessary to allow toggling whether HW counting should be done for any particular SW netdevice. - As the drivers are loaded and removed, a particular device may get offloaded and unoffloaded again. At the same time, the statistics values need to stay monotonic (modulo the eventual 64-bit wraparound), increasing only to reflect traffic measured in the device. To that end, the netdevice keeps around a lazily-allocated copy of struct rtnl_link_stats64. Device drivers then contribute to the values kept therein at various points. Even as the driver goes away, the struct stays around to maintain the statistics values. - Different HW devices may be able to count different things. The motivation behind this patch in particular is exposure of HW counters on Nvidia Spectrum switches, where the only practical approach to counting traffic on offloaded soft netdevices currently is to use router interface counters, and count L3 traffic. Correspondingly that is the statistics suite added in this patch. Other devices may be able to measure different kinds of traffic, and for that reason, the APIs are built to allow uniform access to different statistics suites. - Because soft netdevices and offloading drivers are only loosely bound, a netdevice uses a notifier chain to communicate with the drivers. Several new notifiers, NETDEV_OFFLOAD_XSTATS_*, have been added to carry messages to the offloading drivers. - Devices can have various conditions for when a particular counter is available. As the device is configured and reconfigured, the device offload may become or cease being suitable for counter binding. A netdevice can use a notifier type NETDEV_OFFLOAD_XSTATS_REPORT_USED to ping offloading drivers and determine whether anyone currently implements a given statistics suite. This information can then be propagated to user space. When the driver decides to unoffload a netdevice, it can use a newly-added function, netdev_offload_xstats_report_delta(), to record outstanding collected statistics, before destroying the HW counter. This patch adds a helper, call_netdevice_notifiers_info_robust(), for dispatching a notifier with the possibility of unwind when one of the consumers bails. Given the wish to eventually get rid of the global notifier block altogether, this helper only invokes the per-netns notifier block. Signed-off-by: Petr Machata Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/linux/netdevice.h | 42 ++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/if_link.h | 15 +++++++++++++++ 2 files changed, 57 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c79ee2296296..19a27ac361ef 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1950,6 +1950,7 @@ enum netdev_ml_priv_type { * @watchdog_dev_tracker: refcount tracker used by watchdog. * @dev_registered_tracker: tracker for reference held while * registered + * @offload_xstats_l3: L3 HW stats for this netdevice. * * FIXME: cleanup struct net_device such that network protocol info * moves out. @@ -2287,6 +2288,7 @@ struct net_device { netdevice_tracker linkwatch_dev_tracker; netdevice_tracker watchdog_dev_tracker; netdevice_tracker dev_registered_tracker; + struct rtnl_hw_stats64 *offload_xstats_l3; }; #define to_net_dev(d) container_of(d, struct net_device, dev) @@ -2727,6 +2729,10 @@ enum netdev_cmd { NETDEV_CVLAN_FILTER_DROP_INFO, NETDEV_SVLAN_FILTER_PUSH_INFO, NETDEV_SVLAN_FILTER_DROP_INFO, + NETDEV_OFFLOAD_XSTATS_ENABLE, + NETDEV_OFFLOAD_XSTATS_DISABLE, + NETDEV_OFFLOAD_XSTATS_REPORT_USED, + NETDEV_OFFLOAD_XSTATS_REPORT_DELTA, }; const char *netdev_cmd_to_name(enum netdev_cmd cmd); @@ -2777,6 +2783,42 @@ struct netdev_notifier_pre_changeaddr_info { const unsigned char *dev_addr; }; +enum netdev_offload_xstats_type { + NETDEV_OFFLOAD_XSTATS_TYPE_L3 = 1, +}; + +struct netdev_notifier_offload_xstats_info { + struct netdev_notifier_info info; /* must be first */ + enum netdev_offload_xstats_type type; + + union { + /* NETDEV_OFFLOAD_XSTATS_REPORT_DELTA */ + struct netdev_notifier_offload_xstats_rd *report_delta; + /* NETDEV_OFFLOAD_XSTATS_REPORT_USED */ + struct netdev_notifier_offload_xstats_ru *report_used; + }; +}; + +int netdev_offload_xstats_enable(struct net_device *dev, + enum netdev_offload_xstats_type type, + struct netlink_ext_ack *extack); +int netdev_offload_xstats_disable(struct net_device *dev, + enum netdev_offload_xstats_type type); +bool netdev_offload_xstats_enabled(const struct net_device *dev, + enum netdev_offload_xstats_type type); +int netdev_offload_xstats_get(struct net_device *dev, + enum netdev_offload_xstats_type type, + struct rtnl_hw_stats64 *stats, bool *used, + struct netlink_ext_ack *extack); +void +netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *rd, + const struct rtnl_hw_stats64 *stats); +void +netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *ru); +void netdev_offload_xstats_push_delta(struct net_device *dev, + enum netdev_offload_xstats_type type, + const struct rtnl_hw_stats64 *stats); + static inline void netdev_notifier_info_init(struct netdev_notifier_info *info, struct net_device *dev) { diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 4d62ea6e1288..ef6a62a2e15d 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -245,6 +245,21 @@ struct rtnl_link_stats64 { __u64 rx_nohandler; }; +/* Subset of link stats useful for in-HW collection. Meaning of the fields is as + * for struct rtnl_link_stats64. + */ +struct rtnl_hw_stats64 { + __u64 rx_packets; + __u64 tx_packets; + __u64 rx_bytes; + __u64 tx_bytes; + __u64 rx_errors; + __u64 tx_errors; + __u64 rx_dropped; + __u64 tx_dropped; + __u64 multicast; +}; + /* The struct should be in sync with struct ifmap */ struct rtnl_link_ifmap { __u64 mem_start; -- cgit v1.2.3 From 0e7788fd76222dba3229eada9162efab185923fc Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 2 Mar 2022 18:31:21 +0200 Subject: net: rtnetlink: Add UAPI for obtaining L3 offload xstats Add a new IFLA_STATS_LINK_OFFLOAD_XSTATS child attribute, IFLA_OFFLOAD_XSTATS_L3_STATS, to carry statistics for traffic that takes place in a HW router. The offloaded HW stats are designed to allow per-netdevice enablement and disablement. Additionally, as a netdevice is configured, it may become or cease being suitable for binding of a HW counter. Both of these aspects need to be communicated to the userspace. To that end, add another child attribute, IFLA_OFFLOAD_XSTATS_HW_S_INFO: - attr nest IFLA_OFFLOAD_XSTATS_HW_S_INFO - attr nest IFLA_OFFLOAD_XSTATS_L3_STATS - attr IFLA_OFFLOAD_XSTATS_HW_S_INFO_REQUEST - {0,1} as u8 - attr IFLA_OFFLOAD_XSTATS_HW_S_INFO_USED - {0,1} as u8 Thus this one attribute is a nest that can be used to carry information about various types of HW statistics, and indexing is very simply done by wrapping the information for a given statistics suite into the attribute that carries the suite is the RTM_GETSTATS query. At the same time, because _HW_S_INFO is nested directly below IFLA_STATS_LINK_OFFLOAD_XSTATS, it is possible through filtering to request only the metadata about individual statistics suites, without having to hit the HW to get the actual counters. Signed-off-by: Petr Machata Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index ef6a62a2e15d..b1031f481d2f 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1249,10 +1249,21 @@ enum { enum { IFLA_OFFLOAD_XSTATS_UNSPEC, IFLA_OFFLOAD_XSTATS_CPU_HIT, /* struct rtnl_link_stats64 */ + IFLA_OFFLOAD_XSTATS_HW_S_INFO, /* HW stats info. A nest */ + IFLA_OFFLOAD_XSTATS_L3_STATS, /* struct rtnl_hw_stats64 */ __IFLA_OFFLOAD_XSTATS_MAX }; #define IFLA_OFFLOAD_XSTATS_MAX (__IFLA_OFFLOAD_XSTATS_MAX - 1) +enum { + IFLA_OFFLOAD_XSTATS_HW_S_INFO_UNSPEC, + IFLA_OFFLOAD_XSTATS_HW_S_INFO_REQUEST, /* u8 */ + IFLA_OFFLOAD_XSTATS_HW_S_INFO_USED, /* u8 */ + __IFLA_OFFLOAD_XSTATS_HW_S_INFO_MAX, +}; +#define IFLA_OFFLOAD_XSTATS_HW_S_INFO_MAX \ + (__IFLA_OFFLOAD_XSTATS_HW_S_INFO_MAX - 1) + /* XDP section */ #define XDP_FLAGS_UPDATE_IF_NOEXIST (1U << 0) -- cgit v1.2.3 From 03ba35667091337d8e632cf4b814f1c1b914609b Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 2 Mar 2022 18:31:22 +0200 Subject: net: rtnetlink: Add RTM_SETSTATS The offloaded HW stats are designed to allow per-netdevice enablement and disablement. These stats are only accessible through RTM_GETSTATS, and therefore should be toggled by a RTM_SETSTATS message. Add it, and the necessary skeleton handler. Signed-off-by: Petr Machata Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/uapi/linux/rtnetlink.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 0970cb4b1b88..14462dc159fd 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -146,6 +146,8 @@ enum { #define RTM_NEWSTATS RTM_NEWSTATS RTM_GETSTATS = 94, #define RTM_GETSTATS RTM_GETSTATS + RTM_SETSTATS, +#define RTM_SETSTATS RTM_SETSTATS RTM_NEWCACHEREPORT = 96, #define RTM_NEWCACHEREPORT RTM_NEWCACHEREPORT -- cgit v1.2.3 From 5fd0b838efac16046509f7fb100455d0463b9687 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 2 Mar 2022 18:31:23 +0200 Subject: net: rtnetlink: Add UAPI toggle for IFLA_OFFLOAD_XSTATS_L3_STATS The offloaded HW stats are designed to allow per-netdevice enablement and disablement. Add an attribute, IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS, which should be carried by the RTM_SETSTATS message, and expresses a desire to toggle L3 offload xstats on or off. As part of the above, add an exported function rtnl_offload_xstats_notify() that drivers can use when they have installed or deinstalled the counters backing the HW stats. At this point, it is possible to enable, disable and query L3 offload xstats on netdevices. (However there is no driver actually implementing these.) Signed-off-by: Petr Machata Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 3 +++ include/uapi/linux/if_link.h | 1 + include/uapi/linux/rtnetlink.h | 2 ++ 3 files changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index bb9cb84114c1..7f970b16da3a 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -134,4 +134,7 @@ extern int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, int (*vlan_fill)(struct sk_buff *skb, struct net_device *dev, u32 filter_mask)); + +extern void rtnl_offload_xstats_notify(struct net_device *dev); + #endif /* __LINUX_RTNETLINK_H */ diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index b1031f481d2f..ddca20357e7e 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1227,6 +1227,7 @@ enum { IFLA_STATS_GET_FILTERS, /* Nest of IFLA_STATS_LINK_xxx, each a u32 with * a filter mask for the corresponding group. */ + IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS, /* 0 or 1 as u8 */ __IFLA_STATS_GETSET_MAX, }; diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 14462dc159fd..51530aade46e 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -767,6 +767,8 @@ enum rtnetlink_groups { #define RTNLGRP_MCTP_IFADDR RTNLGRP_MCTP_IFADDR RTNLGRP_TUNNEL, #define RTNLGRP_TUNNEL RTNLGRP_TUNNEL + RTNLGRP_STATS, +#define RTNLGRP_STATS RTNLGRP_STATS __RTNLGRP_MAX }; #define RTNLGRP_MAX (__RTNLGRP_MAX - 1) -- cgit v1.2.3 From f9cef64fa23f6c1ff177be5082113c5a94e34e5d Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 2 Mar 2022 21:14:13 +0200 Subject: net: dsa: felix: migrate host FDB and MDB entries when changing tag proto The "ocelot" and "ocelot-8021q" tagging protocols make use of different hardware resources, and host FDB entries have different destination ports in the switch analyzer module, practically speaking. So when the user requests a tagging protocol change, the driver must migrate all host FDB and MDB entries from the NPI port (in fact CPU port module) towards the same physical port, but this time used as a regular port. It is pointless for the felix driver to keep a copy of the host addresses, when we can create and export DSA helpers for walking through the addresses that it already needs to keep on the CPU port, for refcounting purposes. felix_classify_db() is moved up to avoid a forward declaration. We pass "bool change" because dp->fdbs and dp->mdbs are uninitialized lists when felix_setup() first calls felix_set_tag_protocol(), so we need to avoid calling dsa_port_walk_fdbs() during probe time. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/net/dsa.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index cfedcfb86350..71cc363dbbd4 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -1219,6 +1219,13 @@ struct dsa_switch_driver { struct net_device *dsa_dev_to_net_device(struct device *dev); +typedef int dsa_fdb_walk_cb_t(struct dsa_switch *ds, int port, + const unsigned char *addr, u16 vid, + struct dsa_db db); + +int dsa_port_walk_fdbs(struct dsa_switch *ds, int port, dsa_fdb_walk_cb_t cb); +int dsa_port_walk_mdbs(struct dsa_switch *ds, int port, dsa_fdb_walk_cb_t cb); + /* Keep inline for faster access in hot path */ static inline bool netdev_uses_dsa(const struct net_device *dev) { -- cgit v1.2.3 From a1ac9c8acec1605c6b43af418f79facafdced680 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 2 Mar 2022 11:55:25 -0800 Subject: net: Add skb->mono_delivery_time to distinguish mono delivery_time from (rcv) timestamp skb->tstamp was first used as the (rcv) timestamp. The major usage is to report it to the user (e.g. SO_TIMESTAMP). Later, skb->tstamp is also set as the (future) delivery_time (e.g. EDT in TCP) during egress and used by the qdisc (e.g. sch_fq) to make decision on when the skb can be passed to the dev. Currently, there is no way to tell skb->tstamp having the (rcv) timestamp or the delivery_time, so it is always reset to 0 whenever forwarded between egress and ingress. While it makes sense to always clear the (rcv) timestamp in skb->tstamp to avoid confusing sch_fq that expects the delivery_time, it is a performance issue [0] to clear the delivery_time if the skb finally egress to a fq@phy-dev. For example, when forwarding from egress to ingress and then finally back to egress: tcp-sender => veth@netns => veth@hostns => fq@eth0@hostns ^ ^ reset rest This patch adds one bit skb->mono_delivery_time to flag the skb->tstamp is storing the mono delivery_time (EDT) instead of the (rcv) timestamp. The current use case is to keep the TCP mono delivery_time (EDT) and to be used with sch_fq. A latter patch will also allow tc-bpf@ingress to read and change the mono delivery_time. In the future, another bit (e.g. skb->user_delivery_time) can be added for the SCM_TXTIME where the clock base is tracked by sk->sk_clockid. [ This patch is a prep work. The following patches will get the other parts of the stack ready first. Then another patch after that will finally set the skb->mono_delivery_time. ] skb_set_delivery_time() function is added. It is used by the tcp_output.c and during ip[6] fragmentation to assign the delivery_time to the skb->tstamp and also set the skb->mono_delivery_time. A note on the change in ip_send_unicast_reply() in ip_output.c. It is only used by TCP to send reset/ack out of a ctl_sk. Like the new skb_set_delivery_time(), this patch sets the skb->mono_delivery_time to 0 for now as a place holder. It will be enabled in a latter patch. A similar case in tcp_ipv6 can be done with skb_set_delivery_time() in tcp_v6_send_response(). [0] (slide 22): https://linuxplumbersconf.org/event/11/contributions/953/attachments/867/1658/LPC_2021_BPF_Datapath_Extensions.pdf Signed-off-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/linux/skbuff.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index d67941f78b92..803ffa63dea6 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -795,6 +795,10 @@ typedef unsigned char *sk_buff_data_t; * @dst_pending_confirm: need to confirm neighbour * @decrypted: Decrypted SKB * @slow_gro: state present at GRO time, slower prepare step required + * @mono_delivery_time: When set, skb->tstamp has the + * delivery_time in mono clock base (i.e. EDT). Otherwise, the + * skb->tstamp has the (rcv) timestamp at ingress and + * delivery_time at egress. * @napi_id: id of the NAPI struct this skb came from * @sender_cpu: (aka @napi_id) source CPU in XPS * @secmark: security marking @@ -965,6 +969,7 @@ struct sk_buff { __u8 decrypted:1; #endif __u8 slow_gro:1; + __u8 mono_delivery_time:1; #ifdef CONFIG_NET_SCHED __u16 tc_index; /* traffic control index */ @@ -3983,6 +3988,14 @@ static inline ktime_t net_timedelta(ktime_t t) return ktime_sub(ktime_get_real(), t); } +static inline void skb_set_delivery_time(struct sk_buff *skb, ktime_t kt, + bool mono) +{ + skb->tstamp = kt; + /* Setting mono_delivery_time will be enabled later */ + skb->mono_delivery_time = 0; +} + static inline u8 skb_metadata_len(const struct sk_buff *skb) { return skb_shinfo(skb)->meta_len; -- cgit v1.2.3 From de799101519aad23c6096041ba2744d7b5517e6a Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 2 Mar 2022 11:55:31 -0800 Subject: net: Add skb_clear_tstamp() to keep the mono delivery_time Right now, skb->tstamp is reset to 0 whenever the skb is forwarded. If skb->tstamp has the mono delivery_time, clearing it can hurt the performance when it finally transmits out to fq@phy-dev. The earlier patch added a skb->mono_delivery_time bit to flag the skb->tstamp carrying the mono delivery_time. This patch adds skb_clear_tstamp() helper which keeps the mono delivery_time and clears everything else. The delivery_time clearing will be postponed until the stack knows the skb will be delivered locally. It will be done in a latter patch. Signed-off-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/linux/skbuff.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 803ffa63dea6..27a28920e7b3 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3996,6 +3996,14 @@ static inline void skb_set_delivery_time(struct sk_buff *skb, ktime_t kt, skb->mono_delivery_time = 0; } +static inline void skb_clear_tstamp(struct sk_buff *skb) +{ + if (skb->mono_delivery_time) + return; + + skb->tstamp = 0; +} + static inline u8 skb_metadata_len(const struct sk_buff *skb) { return skb_shinfo(skb)->meta_len; @@ -4852,7 +4860,7 @@ static inline void skb_set_redirected(struct sk_buff *skb, bool from_ingress) #ifdef CONFIG_NET_REDIRECT skb->from_ingress = from_ingress; if (skb->from_ingress) - skb->tstamp = 0; + skb_clear_tstamp(skb); #endif } -- cgit v1.2.3 From 27942a15209f564ed8ee2a9e126cb7b105181355 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 2 Mar 2022 11:55:38 -0800 Subject: net: Handle delivery_time in skb->tstamp during network tapping with af_packet A latter patch will set the skb->mono_delivery_time to flag the skb->tstamp is used as the mono delivery_time (EDT) instead of the (rcv) timestamp. skb_clear_tstamp() will then keep this delivery_time during forwarding. This patch is to make the network tapping (with af_packet) to handle the delivery_time stored in skb->tstamp. Regardless of tapping at the ingress or egress, the tapped skb is received by the af_packet socket, so it is ingress to the af_packet socket and it expects the (rcv) timestamp. When tapping at egress, dev_queue_xmit_nit() is used. It has already expected skb->tstamp may have delivery_time, so it does skb_clone()+net_timestamp_set() to ensure the cloned skb has the (rcv) timestamp before passing to the af_packet sk. This patch only adds to clear the skb->mono_delivery_time bit in net_timestamp_set(). When tapping at ingress, it currently expects the skb->tstamp is either 0 or the (rcv) timestamp. Meaning, the tapping at ingress path has already expected the skb->tstamp could be 0 and it will get the (rcv) timestamp by ktime_get_real() when needed. There are two cases for tapping at ingress: One case is af_packet queues the skb to its sk_receive_queue. The skb is either not shared or new clone created. The newly added skb_clear_delivery_time() is called to clear the delivery_time (if any) and set the (rcv) timestamp if needed before the skb is queued to the sk_receive_queue. Another case, the ingress skb is directly copied to the rx_ring and tpacket_get_timestamp() is used to get the (rcv) timestamp. The newly added skb_tstamp() is used in tpacket_get_timestamp() to check the skb->mono_delivery_time bit before returning skb->tstamp. As mentioned earlier, the tapping@ingress has already expected the skb may not have the (rcv) timestamp (because no sk has asked for it) and has handled this case by directly calling ktime_get_real(). Signed-off-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/linux/skbuff.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 27a28920e7b3..7e2d796ece80 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3996,6 +3996,22 @@ static inline void skb_set_delivery_time(struct sk_buff *skb, ktime_t kt, skb->mono_delivery_time = 0; } +DECLARE_STATIC_KEY_FALSE(netstamp_needed_key); + +/* It is used in the ingress path to clear the delivery_time. + * If needed, set the skb->tstamp to the (rcv) timestamp. + */ +static inline void skb_clear_delivery_time(struct sk_buff *skb) +{ + if (skb->mono_delivery_time) { + skb->mono_delivery_time = 0; + if (static_branch_unlikely(&netstamp_needed_key)) + skb->tstamp = ktime_get_real(); + else + skb->tstamp = 0; + } +} + static inline void skb_clear_tstamp(struct sk_buff *skb) { if (skb->mono_delivery_time) @@ -4004,6 +4020,14 @@ static inline void skb_clear_tstamp(struct sk_buff *skb) skb->tstamp = 0; } +static inline ktime_t skb_tstamp(const struct sk_buff *skb) +{ + if (skb->mono_delivery_time) + return 0; + + return skb->tstamp; +} + static inline u8 skb_metadata_len(const struct sk_buff *skb) { return skb_shinfo(skb)->meta_len; -- cgit v1.2.3 From d93376f503c7a586707925957592c0f16f4db0b1 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 2 Mar 2022 11:55:44 -0800 Subject: net: Clear mono_delivery_time bit in __skb_tstamp_tx() In __skb_tstamp_tx(), it may clone the egress skb and queues the clone to the sk_error_queue. The outgoing skb may have the mono delivery_time while the (rcv) timestamp is expected for the clone, so the skb->mono_delivery_time bit needs to be cleared from the clone. This patch adds the skb->mono_delivery_time clearing to the existing __net_timestamp() and use it in __skb_tstamp_tx(). The __net_timestamp() fast path usage in dev.c is changed to directly call ktime_get_real() since the mono_delivery_time bit is not set at that point. Signed-off-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 7e2d796ece80..8e8a4af4f9e2 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3981,6 +3981,7 @@ static inline void skb_get_new_timestampns(const struct sk_buff *skb, static inline void __net_timestamp(struct sk_buff *skb) { skb->tstamp = ktime_get_real(); + skb->mono_delivery_time = 0; } static inline ktime_t net_timedelta(ktime_t t) -- cgit v1.2.3 From d98d58a002619b5c165f1eedcd731e2fe2c19088 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 2 Mar 2022 11:55:50 -0800 Subject: net: Set skb->mono_delivery_time and clear it after sch_handle_ingress() The previous patches handled the delivery_time before sch_handle_ingress(). This patch can now set the skb->mono_delivery_time to flag the skb->tstamp is used as the mono delivery_time (EDT) instead of the (rcv) timestamp and also clear it with skb_clear_delivery_time() after sch_handle_ingress(). This will make the bpf_redirect_*() to keep the mono delivery_time and used by a qdisc (fq) of the egress-ing interface. A latter patch will postpone the skb_clear_delivery_time() until the stack learns that the skb is being delivered locally and that will make other kernel forwarding paths (ip[6]_forward) able to keep the delivery_time also. Thus, like the previous patches on using the skb->mono_delivery_time bit, calling skb_clear_delivery_time() is not limited within the CONFIG_NET_INGRESS to avoid too many code churns among this set. Signed-off-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/linux/skbuff.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 8e8a4af4f9e2..0f5fd53059cd 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3993,8 +3993,7 @@ static inline void skb_set_delivery_time(struct sk_buff *skb, ktime_t kt, bool mono) { skb->tstamp = kt; - /* Setting mono_delivery_time will be enabled later */ - skb->mono_delivery_time = 0; + skb->mono_delivery_time = kt && mono; } DECLARE_STATIC_KEY_FALSE(netstamp_needed_key); -- cgit v1.2.3 From 8672406eb5d77333ca14e9612e3166704b367c40 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 2 Mar 2022 11:55:57 -0800 Subject: net: ip: Handle delivery_time in ip defrag A latter patch will postpone the delivery_time clearing until the stack knows the skb is being delivered locally. That will allow other kernel forwarding path (e.g. ip[6]_forward) to keep the delivery_time also. An earlier attempt was to do skb_clear_delivery_time() in ip_local_deliver() and ip6_input(). The discussion [0] requested to move it one step later into ip_local_deliver_finish() and ip6_input_finish() so that the delivery_time can be kept for the ip_vs forwarding path also. To do that, this patch also needs to take care of the (rcv) timestamp usecase in ip_is_fragment(). It needs to expect delivery_time in the skb->tstamp, so it needs to save the mono_delivery_time bit in inet_frag_queue such that the delivery_time (if any) can be restored in the final defragmented skb. [Note that it will only happen when the locally generated skb is looping from egress to ingress over a virtual interface (e.g. veth, loopback...), skb->tstamp may have the delivery time before it is known that it will be delivered locally and received by another sk.] [0]: https://lore.kernel.org/netdev/ca728d81-80e8-3767-d5e-d44f6ad96e43@ssi.bg/ Signed-off-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/net/inet_frag.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 63540be0fc34..911ad930867d 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -70,6 +70,7 @@ struct frag_v6_compare_key { * @stamp: timestamp of the last received fragment * @len: total length of the original datagram * @meat: length of received fragments so far + * @mono_delivery_time: stamp has a mono delivery time (EDT) * @flags: fragment queue flags * @max_size: maximum received fragment size * @fqdir: pointer to struct fqdir @@ -90,6 +91,7 @@ struct inet_frag_queue { ktime_t stamp; int len; int meat; + u8 mono_delivery_time; __u8 flags; u16 max_size; struct fqdir *fqdir; -- cgit v1.2.3 From b6561f8491ca899e5a08311796085c9738d631ae Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 2 Mar 2022 11:56:09 -0800 Subject: net: ipv6: Get rcv timestamp if needed when handling hop-by-hop IOAM option IOAM is a hop-by-hop option with a temporary iana allocation (49). Since it is hop-by-hop, it is done before the input routing decision. One of the traced data field is the (rcv) timestamp. When the locally generated skb is looping from egress to ingress over a virtual interface (e.g. veth, loopback...), skb->tstamp may have the delivery time before it is known that it will be delivered locally and received by another sk. Like handling the network tapping (tcpdump) in the earlier patch, this patch gets the timestamp if needed without over-writing the delivery_time in the skb->tstamp. skb_tstamp_cond() is added to do the ktime_get_real() with an extra cond arg to check on top of the netstamp_needed_key static key. skb_tstamp_cond() will also be used in a latter patch and it needs the netstamp_needed_key check. Signed-off-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/linux/skbuff.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 0f5fd53059cd..4b5b926a81f2 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4028,6 +4028,17 @@ static inline ktime_t skb_tstamp(const struct sk_buff *skb) return skb->tstamp; } +static inline ktime_t skb_tstamp_cond(const struct sk_buff *skb, bool cond) +{ + if (!skb->mono_delivery_time && skb->tstamp) + return skb->tstamp; + + if (static_branch_unlikely(&netstamp_needed_key) || cond) + return ktime_get_real(); + + return 0; +} + static inline u8 skb_metadata_len(const struct sk_buff *skb) { return skb_shinfo(skb)->meta_len; -- cgit v1.2.3 From 7449197d600d30d038b1ce6285ab4747096eac7f Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 2 Mar 2022 11:56:28 -0800 Subject: bpf: Keep the (rcv) timestamp behavior for the existing tc-bpf@ingress The current tc-bpf@ingress reads and writes the __sk_buff->tstamp as a (rcv) timestamp which currently could either be 0 (not available) or ktime_get_real(). This patch is to backward compatible with the (rcv) timestamp expectation at ingress. If the skb->tstamp has the delivery_time, the bpf insn rewrite will read 0 for tc-bpf running at ingress as it is not available. When writing at ingress, it will also clear the skb->mono_delivery_time bit. /* BPF_READ: a = __sk_buff->tstamp */ if (!skb->tc_at_ingress || !skb->mono_delivery_time) a = skb->tstamp; else a = 0 /* BPF_WRITE: __sk_buff->tstamp = a */ if (skb->tc_at_ingress) skb->mono_delivery_time = 0; skb->tstamp = a; [ A note on the BPF_CGROUP_INET_INGRESS which can also access skb->tstamp. At that point, the skb is delivered locally and skb_clear_delivery_time() has already been done, so the skb->tstamp will only have the (rcv) timestamp. ] If the tc-bpf@egress writes 0 to skb->tstamp, the skb->mono_delivery_time has to be cleared also. It could be done together during convert_ctx_access(). However, the latter patch will also expose the skb->mono_delivery_time bit as __sk_buff->delivery_time_type. Changing the delivery_time_type in the background may surprise the user, e.g. the 2nd read on __sk_buff->delivery_time_type may need a READ_ONCE() to avoid compiler optimization. Thus, in expecting the needs in the latter patch, this patch does a check on !skb->tstamp after running the tc-bpf and clears the skb->mono_delivery_time bit if needed. The earlier discussion on v4 [0]. The bpf insn rewrite requires the skb's mono_delivery_time bit and tc_at_ingress bit. They are moved up in sk_buff so that bpf rewrite can be done at a fixed offset. tc_skip_classify is moved together with tc_at_ingress. To get one bit for mono_delivery_time, csum_not_inet is moved down and this bit is currently used by sctp. [0]: https://lore.kernel.org/bpf/20220217015043.khqwqklx45c4m4se@kafai-mbp.dhcp.thefacebook.com/ Signed-off-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/linux/skbuff.h | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 4b5b926a81f2..5445860e1ba6 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -941,8 +941,12 @@ struct sk_buff { __u8 vlan_present:1; /* See PKT_VLAN_PRESENT_BIT */ __u8 csum_complete_sw:1; __u8 csum_level:2; - __u8 csum_not_inet:1; __u8 dst_pending_confirm:1; + __u8 mono_delivery_time:1; +#ifdef CONFIG_NET_CLS_ACT + __u8 tc_skip_classify:1; + __u8 tc_at_ingress:1; +#endif #ifdef CONFIG_IPV6_NDISC_NODETYPE __u8 ndisc_nodetype:2; #endif @@ -953,10 +957,6 @@ struct sk_buff { #ifdef CONFIG_NET_SWITCHDEV __u8 offload_fwd_mark:1; __u8 offload_l3_fwd_mark:1; -#endif -#ifdef CONFIG_NET_CLS_ACT - __u8 tc_skip_classify:1; - __u8 tc_at_ingress:1; #endif __u8 redirected:1; #ifdef CONFIG_NET_REDIRECT @@ -969,7 +969,7 @@ struct sk_buff { __u8 decrypted:1; #endif __u8 slow_gro:1; - __u8 mono_delivery_time:1; + __u8 csum_not_inet:1; #ifdef CONFIG_NET_SCHED __u16 tc_index; /* traffic control index */ @@ -1047,10 +1047,16 @@ struct sk_buff { /* if you move pkt_vlan_present around you also must adapt these constants */ #ifdef __BIG_ENDIAN_BITFIELD #define PKT_VLAN_PRESENT_BIT 7 +#define TC_AT_INGRESS_MASK (1 << 0) +#define SKB_MONO_DELIVERY_TIME_MASK (1 << 2) #else #define PKT_VLAN_PRESENT_BIT 0 +#define TC_AT_INGRESS_MASK (1 << 7) +#define SKB_MONO_DELIVERY_TIME_MASK (1 << 5) #endif #define PKT_VLAN_PRESENT_OFFSET offsetof(struct sk_buff, __pkt_vlan_present_offset) +#define TC_AT_INGRESS_OFFSET offsetof(struct sk_buff, __pkt_vlan_present_offset) +#define SKB_MONO_DELIVERY_TIME_OFFSET offsetof(struct sk_buff, __pkt_vlan_present_offset) #ifdef __KERNEL__ /* -- cgit v1.2.3 From 8d21ec0e46ed6e39994accff8eb4f2be3d2e76b5 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 2 Mar 2022 11:56:34 -0800 Subject: bpf: Add __sk_buff->delivery_time_type and bpf_skb_set_skb_delivery_time() * __sk_buff->delivery_time_type: This patch adds __sk_buff->delivery_time_type. It tells if the delivery_time is stored in __sk_buff->tstamp or not. It will be most useful for ingress to tell if the __sk_buff->tstamp has the (rcv) timestamp or delivery_time. If delivery_time_type is 0 (BPF_SKB_DELIVERY_TIME_NONE), it has the (rcv) timestamp. Two non-zero types are defined for the delivery_time_type, BPF_SKB_DELIVERY_TIME_MONO and BPF_SKB_DELIVERY_TIME_UNSPEC. For UNSPEC, it can only happen in egress because only mono delivery_time can be forwarded to ingress now. The clock of UNSPEC delivery_time can be deduced from the skb->sk->sk_clockid which is how the sch_etf doing it also. * Provide forwarded delivery_time to tc-bpf@ingress: With the help of the new delivery_time_type, the tc-bpf has a way to tell if the __sk_buff->tstamp has the (rcv) timestamp or the delivery_time. During bpf load time, the verifier will learn if the bpf prog has accessed the new __sk_buff->delivery_time_type. If it does, it means the tc-bpf@ingress is expecting the skb->tstamp could have the delivery_time. The kernel will then read the skb->tstamp as-is during bpf insn rewrite without checking the skb->mono_delivery_time. This is done by adding a new prog->delivery_time_access bit. The same goes for writing skb->tstamp. * bpf_skb_set_delivery_time(): The bpf_skb_set_delivery_time() helper is added to allow setting both delivery_time and the delivery_time_type at the same time. If the tc-bpf does not need to change the delivery_time_type, it can directly write to the __sk_buff->tstamp as the existing tc-bpf has already been doing. It will be most useful at ingress to change the __sk_buff->tstamp from the (rcv) timestamp to a mono delivery_time and then bpf_redirect_*(). bpf only has mono clock helper (bpf_ktime_get_ns), and the current known use case is the mono EDT for fq, and only mono delivery time can be kept during forward now, so bpf_skb_set_delivery_time() only supports setting BPF_SKB_DELIVERY_TIME_MONO. It can be extended later when use cases come up and the forwarding path also supports other clock bases. Signed-off-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/linux/filter.h | 3 ++- include/uapi/linux/bpf.h | 41 ++++++++++++++++++++++++++++++++++++++++- 2 files changed, 42 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 1cb1af917617..9bf26307247f 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -572,7 +572,8 @@ struct bpf_prog { has_callchain_buf:1, /* callchain buffer allocated? */ enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */ call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */ - call_get_func_ip:1; /* Do we call get_func_ip() */ + call_get_func_ip:1, /* Do we call get_func_ip() */ + delivery_time_access:1; /* Accessed __sk_buff->delivery_time_type */ enum bpf_prog_type type; /* Type of BPF program */ enum bpf_attach_type expected_attach_type; /* For some prog types */ u32 len; /* Number of filter blocks */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index afe3d0d7f5f2..4eebea830613 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5086,6 +5086,37 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. On error * *dst* buffer is zeroed out. + * + * long bpf_skb_set_delivery_time(struct sk_buff *skb, u64 dtime, u32 dtime_type) + * Description + * Set a *dtime* (delivery time) to the __sk_buff->tstamp and also + * change the __sk_buff->delivery_time_type to *dtime_type*. + * + * When setting a delivery time (non zero *dtime*) to + * __sk_buff->tstamp, only BPF_SKB_DELIVERY_TIME_MONO *dtime_type* + * is supported. It is the only delivery_time_type that will be + * kept after bpf_redirect_*(). + * + * If there is no need to change the __sk_buff->delivery_time_type, + * the delivery time can be directly written to __sk_buff->tstamp + * instead. + * + * *dtime* 0 and *dtime_type* BPF_SKB_DELIVERY_TIME_NONE + * can be used to clear any delivery time stored in + * __sk_buff->tstamp. + * + * Only IPv4 and IPv6 skb->protocol are supported. + * + * This function is most useful when it needs to set a + * mono delivery time to __sk_buff->tstamp and then + * bpf_redirect_*() to the egress of an iface. For example, + * changing the (rcv) timestamp in __sk_buff->tstamp at + * ingress to a mono delivery time and then bpf_redirect_*() + * to sch_fq@phy-dev. + * Return + * 0 on success. + * **-EINVAL** for invalid input + * **-EOPNOTSUPP** for unsupported delivery_time_type and protocol */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5280,6 +5311,7 @@ union bpf_attr { FN(xdp_load_bytes), \ FN(xdp_store_bytes), \ FN(copy_from_user_task), \ + FN(skb_set_delivery_time), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -5469,6 +5501,12 @@ union { \ __u64 :64; \ } __attribute__((aligned(8))) +enum { + BPF_SKB_DELIVERY_TIME_NONE, + BPF_SKB_DELIVERY_TIME_UNSPEC, + BPF_SKB_DELIVERY_TIME_MONO, +}; + /* user accessible mirror of in-kernel sk_buff. * new fields can only be added to the end of this structure */ @@ -5509,7 +5547,8 @@ struct __sk_buff { __u32 gso_segs; __bpf_md_ptr(struct bpf_sock *, sk); __u32 gso_size; - __u32 :32; /* Padding, future use. */ + __u8 delivery_time_type; + __u32 :24; /* Padding, future use. */ __u64 hwtstamp; }; -- cgit v1.2.3 From 98b4d7a4e7374a44c4afd9f08330e72f6ad0d644 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Fri, 4 Mar 2022 14:00:40 +0800 Subject: net: dev: use kfree_skb_reason() for sch_handle_egress() Replace kfree_skb() used in sch_handle_egress() with kfree_skb_reason(). The drop reason SKB_DROP_REASON_TC_EGRESS is introduced. Considering the code path of tc egerss, we make it distinct with the drop reason of SKB_DROP_REASON_QDISC_DROP in the next commit. Signed-off-by: Menglong Dong Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + include/trace/events/skb.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5445860e1ba6..1ffe64616741 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -394,6 +394,7 @@ enum skb_drop_reason { * entry is full */ SKB_DROP_REASON_NEIGH_DEAD, /* neigh entry is dead */ + SKB_DROP_REASON_TC_EGRESS, /* dropped in TC egress HOOK */ SKB_DROP_REASON_MAX, }; diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index 1977f301260d..53755e8191a1 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -45,6 +45,7 @@ EM(SKB_DROP_REASON_NEIGH_FAILED, NEIGH_FAILED) \ EM(SKB_DROP_REASON_NEIGH_QUEUEFULL, NEIGH_QUEUEFULL) \ EM(SKB_DROP_REASON_NEIGH_DEAD, NEIGH_DEAD) \ + EM(SKB_DROP_REASON_TC_EGRESS, TC_EGRESS) \ EMe(SKB_DROP_REASON_MAX, MAX) #undef EM -- cgit v1.2.3 From 215b0f1963d4e34fccac6992b3debe26f78a6eb8 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Fri, 4 Mar 2022 14:00:41 +0800 Subject: net: skb: introduce the function kfree_skb_list_reason() To report reasons of skb drops, introduce the function kfree_skb_list_reason() and make kfree_skb_list() an inline call to it. This function will be used in the next commit in __dev_xmit_skb(). Signed-off-by: Menglong Dong Signed-off-by: David S. Miller --- include/linux/skbuff.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 1ffe64616741..1f2de153755c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1202,10 +1202,16 @@ static inline void kfree_skb(struct sk_buff *skb) } void skb_release_head_state(struct sk_buff *skb); -void kfree_skb_list(struct sk_buff *segs); +void kfree_skb_list_reason(struct sk_buff *segs, + enum skb_drop_reason reason); void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt); void skb_tx_error(struct sk_buff *skb); +static inline void kfree_skb_list(struct sk_buff *segs) +{ + kfree_skb_list_reason(segs, SKB_DROP_REASON_NOT_SPECIFIED); +} + #ifdef CONFIG_TRACEPOINTS void consume_skb(struct sk_buff *skb); #else -- cgit v1.2.3 From 7faef0547f4c29031a68d058918b031a8e520d49 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Fri, 4 Mar 2022 14:00:42 +0800 Subject: net: dev: add skb drop reasons to __dev_xmit_skb() Add reasons for skb drops to __dev_xmit_skb() by replacing kfree_skb_list() with kfree_skb_list_reason(). The drop reason of SKB_DROP_REASON_QDISC_DROP is introduced for qdisc enqueue fails. Signed-off-by: Menglong Dong Signed-off-by: David S. Miller --- include/linux/skbuff.h | 4 ++++ include/trace/events/skb.h | 1 + 2 files changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 1f2de153755c..769d54ba2f3b 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -395,6 +395,10 @@ enum skb_drop_reason { */ SKB_DROP_REASON_NEIGH_DEAD, /* neigh entry is dead */ SKB_DROP_REASON_TC_EGRESS, /* dropped in TC egress HOOK */ + SKB_DROP_REASON_QDISC_DROP, /* dropped by qdisc when packet + * outputting (failed to enqueue to + * current qdisc) + */ SKB_DROP_REASON_MAX, }; diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index 53755e8191a1..dbf3e2e3c1b4 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -46,6 +46,7 @@ EM(SKB_DROP_REASON_NEIGH_QUEUEFULL, NEIGH_QUEUEFULL) \ EM(SKB_DROP_REASON_NEIGH_DEAD, NEIGH_DEAD) \ EM(SKB_DROP_REASON_TC_EGRESS, TC_EGRESS) \ + EM(SKB_DROP_REASON_QDISC_DROP, QDISC_DROP) \ EMe(SKB_DROP_REASON_MAX, MAX) #undef EM -- cgit v1.2.3 From 44f0bd40803c0e04f1c8cd59df3c7acce783ae9c Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Fri, 4 Mar 2022 14:00:43 +0800 Subject: net: dev: use kfree_skb_reason() for enqueue_to_backlog() Replace kfree_skb() used in enqueue_to_backlog() with kfree_skb_reason(). The skb rop reason SKB_DROP_REASON_CPU_BACKLOG is introduced for the case of failing to enqueue the skb to the per CPU backlog queue. The further reason can be backlog queue full or RPS flow limition, and I think we needn't to make further distinctions. Signed-off-by: Menglong Dong Signed-off-by: David S. Miller --- include/linux/skbuff.h | 6 ++++++ include/trace/events/skb.h | 1 + 2 files changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 769d54ba2f3b..44ecbfff2b69 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -399,6 +399,12 @@ enum skb_drop_reason { * outputting (failed to enqueue to * current qdisc) */ + SKB_DROP_REASON_CPU_BACKLOG, /* failed to enqueue the skb to + * the per CPU backlog queue. This + * can be caused by backlog queue + * full (see netdev_max_backlog in + * net.rst) or RPS flow limit + */ SKB_DROP_REASON_MAX, }; diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index dbf3e2e3c1b4..3bb90ca893ae 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -47,6 +47,7 @@ EM(SKB_DROP_REASON_NEIGH_DEAD, NEIGH_DEAD) \ EM(SKB_DROP_REASON_TC_EGRESS, TC_EGRESS) \ EM(SKB_DROP_REASON_QDISC_DROP, QDISC_DROP) \ + EM(SKB_DROP_REASON_CPU_BACKLOG, CPU_BACKLOG) \ EMe(SKB_DROP_REASON_MAX, MAX) #undef EM -- cgit v1.2.3 From 7e726ed81e1ddd5fdc431e02b94fcfe2a9876d42 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Fri, 4 Mar 2022 14:00:44 +0800 Subject: net: dev: use kfree_skb_reason() for do_xdp_generic() Replace kfree_skb() used in do_xdp_generic() with kfree_skb_reason(). The drop reason SKB_DROP_REASON_XDP is introduced for this case. Signed-off-by: Menglong Dong Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + include/trace/events/skb.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 44ecbfff2b69..7a38d0f90cef 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -405,6 +405,7 @@ enum skb_drop_reason { * full (see netdev_max_backlog in * net.rst) or RPS flow limit */ + SKB_DROP_REASON_XDP, /* dropped by XDP in input path */ SKB_DROP_REASON_MAX, }; diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index 3bb90ca893ae..8c4c343c830f 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -48,6 +48,7 @@ EM(SKB_DROP_REASON_TC_EGRESS, TC_EGRESS) \ EM(SKB_DROP_REASON_QDISC_DROP, QDISC_DROP) \ EM(SKB_DROP_REASON_CPU_BACKLOG, CPU_BACKLOG) \ + EM(SKB_DROP_REASON_XDP, XDP) \ EMe(SKB_DROP_REASON_MAX, MAX) #undef EM -- cgit v1.2.3 From a568aff26ac03ee9eb1482683514914a5ec3b4c3 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Fri, 4 Mar 2022 14:00:45 +0800 Subject: net: dev: use kfree_skb_reason() for sch_handle_ingress() Replace kfree_skb() used in sch_handle_ingress() with kfree_skb_reason(). Following drop reasons are introduced: SKB_DROP_REASON_TC_INGRESS Signed-off-by: Menglong Dong Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + include/trace/events/skb.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 7a38d0f90cef..6b14b2d297d2 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -406,6 +406,7 @@ enum skb_drop_reason { * net.rst) or RPS flow limit */ SKB_DROP_REASON_XDP, /* dropped by XDP in input path */ + SKB_DROP_REASON_TC_INGRESS, /* dropped in TC ingress HOOK */ SKB_DROP_REASON_MAX, }; diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index 8c4c343c830f..514dd2de8776 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -49,6 +49,7 @@ EM(SKB_DROP_REASON_QDISC_DROP, QDISC_DROP) \ EM(SKB_DROP_REASON_CPU_BACKLOG, CPU_BACKLOG) \ EM(SKB_DROP_REASON_XDP, XDP) \ + EM(SKB_DROP_REASON_TC_INGRESS, TC_INGRESS) \ EMe(SKB_DROP_REASON_MAX, MAX) #undef EM -- cgit v1.2.3 From 6c2728b7c14164928cb7cb9c847dead101b2d503 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Fri, 4 Mar 2022 14:00:46 +0800 Subject: net: dev: use kfree_skb_reason() for __netif_receive_skb_core() Add reason for skb drops to __netif_receive_skb_core() when packet_type not found to handle the skb. For this purpose, the drop reason SKB_DROP_REASON_PTYPE_ABSENT is introduced. Take ether packets for example, this case mainly happens when L3 protocol is not supported. Signed-off-by: Menglong Dong Signed-off-by: David S. Miller --- include/linux/skbuff.h | 5 +++++ include/trace/events/skb.h | 1 + 2 files changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 6b14b2d297d2..2be263184d1e 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -407,6 +407,11 @@ enum skb_drop_reason { */ SKB_DROP_REASON_XDP, /* dropped by XDP in input path */ SKB_DROP_REASON_TC_INGRESS, /* dropped in TC ingress HOOK */ + SKB_DROP_REASON_PTYPE_ABSENT, /* not packet_type found to handle + * the skb. For an etner packet, + * this means that L3 protocol is + * not supported + */ SKB_DROP_REASON_MAX, }; diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index 514dd2de8776..c0769d943f8e 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -50,6 +50,7 @@ EM(SKB_DROP_REASON_CPU_BACKLOG, CPU_BACKLOG) \ EM(SKB_DROP_REASON_XDP, XDP) \ EM(SKB_DROP_REASON_TC_INGRESS, TC_INGRESS) \ + EM(SKB_DROP_REASON_PTYPE_ABSENT, PTYPE_ABSENT) \ EMe(SKB_DROP_REASON_MAX, MAX) #undef EM -- cgit v1.2.3 From 3a0318140a6f8c3ab60f1f46c3f203923cb01882 Mon Sep 17 00:00:00 2001 From: Changcheng Deng Date: Fri, 21 Jan 2022 01:35:08 +0000 Subject: Bluetooth: mgmt: Replace zero-length array with flexible-array member There is a regular need in the kernel to provide a way to declare having a dynamically sized set of trailing elements in a structure. Kernel code should always use "flexible array members" for these cases. The older style of one-element or zero-length arrays should no longer be used. Reference: https://www.kernel.org/doc/html/latest/process/deprecated.html#zero-length-and-one-element-arrays Reported-by: Zeal Robot Signed-off-by: Changcheng Deng Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/mgmt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index 99266f7aebdc..3d26e6a3478b 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -1112,7 +1112,7 @@ struct mgmt_ev_adv_monitor_device_found { __s8 rssi; __le32 flags; __le16 eir_len; - __u8 eir[0]; + __u8 eir[]; } __packed; #define MGMT_EV_ADV_MONITOR_DEVICE_LOST 0x0030 -- cgit v1.2.3 From 9b392e0e0b6d026da5a62bb79a08f32e27af858e Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 3 Mar 2022 13:11:57 -0800 Subject: Bluetooth: Fix not checking for valid hdev on bt_dev_{info,warn,err,dbg} This fixes attemting to print hdev->name directly which causes them to print an error: kernel: read_version:367: (efault): sock 000000006a3008f2 Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Marcel Holtmann --- include/net/bluetooth/bluetooth.h | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index a647e5fabdbd..2aa5e95808a5 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -204,19 +204,21 @@ void bt_err_ratelimited(const char *fmt, ...); #define BT_DBG(fmt, ...) pr_debug(fmt "\n", ##__VA_ARGS__) #endif +#define bt_dev_name(hdev) ((hdev) ? (hdev)->name : "null") + #define bt_dev_info(hdev, fmt, ...) \ - BT_INFO("%s: " fmt, (hdev)->name, ##__VA_ARGS__) + BT_INFO("%s: " fmt, bt_dev_name(hdev), ##__VA_ARGS__) #define bt_dev_warn(hdev, fmt, ...) \ - BT_WARN("%s: " fmt, (hdev)->name, ##__VA_ARGS__) + BT_WARN("%s: " fmt, bt_dev_name(hdev), ##__VA_ARGS__) #define bt_dev_err(hdev, fmt, ...) \ - BT_ERR("%s: " fmt, (hdev)->name, ##__VA_ARGS__) + BT_ERR("%s: " fmt, bt_dev_name(hdev), ##__VA_ARGS__) #define bt_dev_dbg(hdev, fmt, ...) \ - BT_DBG("%s: " fmt, (hdev)->name, ##__VA_ARGS__) + BT_DBG("%s: " fmt, bt_dev_name(hdev), ##__VA_ARGS__) #define bt_dev_warn_ratelimited(hdev, fmt, ...) \ - bt_warn_ratelimited("%s: " fmt, (hdev)->name, ##__VA_ARGS__) + bt_warn_ratelimited("%s: " fmt, bt_dev_name(hdev), ##__VA_ARGS__) #define bt_dev_err_ratelimited(hdev, fmt, ...) \ - bt_err_ratelimited("%s: " fmt, (hdev)->name, ##__VA_ARGS__) + bt_err_ratelimited("%s: " fmt, bt_dev_name(hdev), ##__VA_ARGS__) /* Connection and socket states */ enum { -- cgit v1.2.3 From cd87fecdedd7b54c61f5d7c2b7237b43126ac50a Mon Sep 17 00:00:00 2001 From: Luiz Angelo Daros de Luca Date: Wed, 2 Mar 2022 22:52:34 -0300 Subject: net: dsa: tag_rtl8_4: add rtl8_4t trailing variant MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Realtek switches supports the same tag both before ethertype or between payload and the CRC. Signed-off-by: Luiz Angelo Daros de Luca Reviewed-by: Alvin Šipraga Reviewed-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/net/dsa.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 71cc363dbbd4..759479fe8573 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -52,6 +52,7 @@ struct phylink_link_state; #define DSA_TAG_PROTO_BRCM_LEGACY_VALUE 22 #define DSA_TAG_PROTO_SJA1110_VALUE 23 #define DSA_TAG_PROTO_RTL8_4_VALUE 24 +#define DSA_TAG_PROTO_RTL8_4T_VALUE 25 enum dsa_tag_protocol { DSA_TAG_PROTO_NONE = DSA_TAG_PROTO_NONE_VALUE, @@ -79,6 +80,7 @@ enum dsa_tag_protocol { DSA_TAG_PROTO_SEVILLE = DSA_TAG_PROTO_SEVILLE_VALUE, DSA_TAG_PROTO_SJA1110 = DSA_TAG_PROTO_SJA1110_VALUE, DSA_TAG_PROTO_RTL8_4 = DSA_TAG_PROTO_RTL8_4_VALUE, + DSA_TAG_PROTO_RTL8_4T = DSA_TAG_PROTO_RTL8_4T_VALUE, }; struct dsa_switch; -- cgit v1.2.3 From 25b35dd28138f61f9a0fb8b76c0483761fd228bd Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 5 Mar 2022 04:16:38 +0530 Subject: bpf: Add check_func_arg_reg_off function Lift the list of register types allowed for having fixed and variable offsets when passed as helper function arguments into a common helper, so that they can be reused for kfunc checks in later commits. Keeping a common helper aids maintainability and allows us to follow the same consistent rules across helpers and kfuncs. Also, convert check_func_arg to use this function. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220304224645.3677453-2-memxor@gmail.com --- include/linux/bpf_verifier.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 7a7be8c057f2..38b24ee8d8c2 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -521,6 +521,9 @@ bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt); int check_ptr_off_reg(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno); +int check_func_arg_reg_off(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno, + enum bpf_arg_type arg_type); int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno); int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, -- cgit v1.2.3 From 24d5bb806c7e2c0b9972564fd493069f612d90dd Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 5 Mar 2022 04:16:41 +0530 Subject: bpf: Harden register offset checks for release helpers and kfuncs Let's ensure that the PTR_TO_BTF_ID reg being passed in to release BPF helpers and kfuncs always has its offset set to 0. While not a real problem now, there's a very real possibility this will become a problem when more and more kfuncs are exposed, and more BPF helpers are added which can release PTR_TO_BTF_ID. Previous commits already protected against non-zero var_off. One of the case we are concerned about now is when we have a type that can be returned by e.g. an acquire kfunc: struct foo { int a; int b; struct bar b; }; ... and struct bar is also a type that can be returned by another acquire kfunc. Then, doing the following sequence: struct foo *f = bpf_get_foo(); // acquire kfunc if (!f) return 0; bpf_put_bar(&f->b); // release kfunc ... would work with the current code, since the btf_struct_ids_match takes reg->off into account for matching pointer type with release kfunc argument type, but would obviously be incorrect, and most likely lead to a kernel crash. A test has been included later to prevent regressions in this area. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220304224645.3677453-5-memxor@gmail.com --- include/linux/bpf_verifier.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 38b24ee8d8c2..c1fc4af47f69 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -523,7 +523,8 @@ int check_ptr_off_reg(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno); int check_func_arg_reg_off(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno, - enum bpf_arg_type arg_type); + enum bpf_arg_type arg_type, + bool is_release_func); int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno); int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, -- cgit v1.2.3 From f014a00bbeb09cea16017b82448d32a468a6b96f Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Sat, 5 Mar 2022 04:16:42 +0530 Subject: compiler-clang.h: Add __diag infrastructure for clang Add __diag macros similar to those in compiler-gcc.h, so that warnings that need to be adjusted for specific cases but not globally can be ignored when building with clang. Signed-off-by: Nathan Chancellor Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220304224645.3677453-6-memxor@gmail.com [ Kartikeya: wrote commit message ] --- include/linux/compiler-clang.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include') diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index 3c4de9b6c6e3..f1aa41d520bd 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -68,3 +68,25 @@ #define __nocfi __attribute__((__no_sanitize__("cfi"))) #define __cficanonical __attribute__((__cfi_canonical_jump_table__)) + +/* + * Turn individual warnings and errors on and off locally, depending + * on version. + */ +#define __diag_clang(version, severity, s) \ + __diag_clang_ ## version(__diag_clang_ ## severity s) + +/* Severity used in pragma directives */ +#define __diag_clang_ignore ignored +#define __diag_clang_warn warning +#define __diag_clang_error error + +#define __diag_str1(s) #s +#define __diag_str(s) __diag_str1(s) +#define __diag(s) _Pragma(__diag_str(clang diagnostic s)) + +#if CONFIG_CLANG_VERSION >= 110000 +#define __diag_clang_11(s) __diag(s) +#else +#define __diag_clang_11(s) +#endif -- cgit v1.2.3 From 4d1ea705d797e66edd70ffa708b83888a210a437 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 5 Mar 2022 04:16:43 +0530 Subject: compiler_types.h: Add unified __diag_ignore_all for GCC/LLVM Add a __diag_ignore_all macro, to ignore warnings for both GCC and LLVM, without having to specify the compiler type and version. By default, GCC 8 and clang 11 are used. This will be used by bpf subsystem to ignore -Wmissing-prototypes warning for functions that are meant to be global functions so that they are in vmlinux BTF, but don't have a prototype. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220304224645.3677453-7-memxor@gmail.com --- include/linux/compiler-clang.h | 3 +++ include/linux/compiler-gcc.h | 3 +++ include/linux/compiler_types.h | 4 ++++ 3 files changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index f1aa41d520bd..babb1347148c 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -90,3 +90,6 @@ #else #define __diag_clang_11(s) #endif + +#define __diag_ignore_all(option, comment) \ + __diag_clang(11, ignore, option) diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index ccbbd31b3aae..d364c98a4a80 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -151,6 +151,9 @@ #define __diag_GCC_8(s) #endif +#define __diag_ignore_all(option, comment) \ + __diag_GCC(8, ignore, option) + /* * Prior to 9.1, -Wno-alloc-size-larger-than (and therefore the "alloc_size" * attribute) do not work, and must be disabled. diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 3f31ff400432..8e5d2f50f951 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -371,4 +371,8 @@ struct ftrace_likely_data { #define __diag_error(compiler, version, option, comment) \ __diag_ ## compiler(version, error, option) +#ifndef __diag_ignore_all +#define __diag_ignore_all(option, comment) +#endif + #endif /* __LINUX_COMPILER_TYPES_H */ -- cgit v1.2.3 From 9216c916237805c93d054ed022afb172ddbc3ed1 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Fri, 4 Mar 2022 11:16:55 -0800 Subject: compiler_types: Define __percpu as __attribute__((btf_type_tag("percpu"))) This is similar to commit 7472d5a642c9 ("compiler_types: define __user as __attribute__((btf_type_tag("user")))"), where a type tag "user" was introduced to identify the pointers that point to user memory. With that change, the newest compile toolchain can encode __user information into vmlinux BTF, which can be used by the BPF verifier to enforce safe program behaviors. Similarly, we have __percpu attribute, which is mainly used to indicate memory is allocated in percpu region. The __percpu pointers in kernel are supposed to be used together with functions like per_cpu_ptr() and this_cpu_ptr(), which perform necessary calculation on the pointer's base address. Without the btf_type_tag introduced in this patch, __percpu pointers will be treated as regular memory pointers in vmlinux BTF and BPF programs are allowed to directly dereference them, generating incorrect behaviors. Now with "percpu" btf_type_tag, the BPF verifier is able to differentiate __percpu pointers from regular pointers and forbids unexpected behaviors like direct load. The following is an example similar to the one given in commit 7472d5a642c9: [$ ~] cat test.c #define __percpu __attribute__((btf_type_tag("percpu"))) int foo(int __percpu *arg) { return *arg; } [$ ~] clang -O2 -g -c test.c [$ ~] pahole -JV test.o ... File test.o: [1] INT int size=4 nr_bits=32 encoding=SIGNED [2] TYPE_TAG percpu type_id=1 [3] PTR (anon) type_id=2 [4] FUNC_PROTO (anon) return=1 args=(3 arg) [5] FUNC foo type_id=4 [$ ~] for the function argument "int __percpu *arg", its type is described as PTR -> TYPE_TAG(percpu) -> INT The kernel can use this information for bpf verification or other use cases. Like commit 7472d5a642c9, this feature requires clang (>= clang14) and pahole (>= 1.23). Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220304191657.981240-3-haoluo@google.com --- include/linux/compiler_types.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 8e5d2f50f951..b9a8ae9440c7 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -38,7 +38,12 @@ static inline void __chk_io_ptr(const volatile void __iomem *ptr) { } # define __user # endif # define __iomem -# define __percpu +# if defined(CONFIG_DEBUG_INFO_BTF) && defined(CONFIG_PAHOLE_HAS_BTF_TAG) && \ + __has_attribute(btf_type_tag) +# define __percpu __attribute__((btf_type_tag("percpu"))) +# else +# define __percpu +# endif # define __rcu # define __chk_user_ptr(x) (void)0 # define __chk_io_ptr(x) (void)0 -- cgit v1.2.3 From 5844101a1be9b8636024cb31c865ef13c7cc6db3 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Fri, 4 Mar 2022 11:16:56 -0800 Subject: bpf: Reject programs that try to load __percpu memory. With the introduction of the btf_type_tag "percpu", we can add a MEM_PERCPU to identify those pointers that point to percpu memory. The ability of differetiating percpu pointers from regular memory pointers have two benefits: 1. It forbids unexpected use of percpu pointers, such as direct loads. In kernel, there are special functions used for accessing percpu memory. Directly loading percpu memory is meaningless. We already have BPF helpers like bpf_per_cpu_ptr() and bpf_this_cpu_ptr() that wrap the kernel percpu functions. So we can now convert percpu pointers into regular pointers in a safe way. 2. Previously, bpf_per_cpu_ptr() and bpf_this_cpu_ptr() only work on PTR_TO_PERCPU_BTF_ID, a special reg_type which describes static percpu variables in kernel (we rely on pahole to encode them into vmlinux BTF). Now, since we can identify __percpu tagged pointers, we can also identify dynamically allocated percpu memory as well. It means we can use bpf_xxx_cpu_ptr() on dynamic percpu memory. This would be very convenient when accessing fields like "cgroup->rstat_cpu". Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220304191657.981240-4-haoluo@google.com --- include/linux/bpf.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f19abc59b6cd..88449fbbe063 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -334,7 +334,15 @@ enum bpf_type_flag { /* MEM is in user address space. */ MEM_USER = BIT(3 + BPF_BASE_TYPE_BITS), - __BPF_TYPE_LAST_FLAG = MEM_USER, + /* MEM is a percpu memory. MEM_PERCPU tags PTR_TO_BTF_ID. When tagged + * with MEM_PERCPU, PTR_TO_BTF_ID _cannot_ be directly accessed. In + * order to drop this tag, it must be passed into bpf_per_cpu_ptr() + * or bpf_this_cpu_ptr(), which will return the pointer corresponding + * to the specified cpu. + */ + MEM_PERCPU = BIT(4 + BPF_BASE_TYPE_BITS), + + __BPF_TYPE_LAST_FLAG = MEM_PERCPU, }; /* Max number of base types. */ @@ -516,7 +524,6 @@ enum bpf_reg_type { */ PTR_TO_MEM, /* reg points to valid memory region */ PTR_TO_BUF, /* reg points to a read/write buffer */ - PTR_TO_PERCPU_BTF_ID, /* reg points to a percpu kernel variable */ PTR_TO_FUNC, /* reg points to a bpf program function */ __BPF_REG_TYPE_MAX, -- cgit v1.2.3 From 736f16de75f9bb32d76f652cb66f04d1bc685057 Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Fri, 4 Mar 2022 06:55:05 -0800 Subject: net: tap: track dropped skb via kfree_skb_reason() The TAP can be used as vhost-net backend. E.g., the tap_handle_frame() is the interface to forward the skb from TAP to vhost-net/virtio-net. However, there are many "goto drop" in the TAP driver. Therefore, the kfree_skb_reason() is involved at each "goto drop" to help userspace ftrace/ebpf to track the reason for the loss of packets. The below reasons are introduced: - SKB_DROP_REASON_SKB_CSUM - SKB_DROP_REASON_SKB_GSO_SEG - SKB_DROP_REASON_SKB_UCOPY_FAULT - SKB_DROP_REASON_DEV_HDR - SKB_DROP_REASON_FULL_RING Cc: Joao Martins Cc: Joe Jin Signed-off-by: Dongli Zhang Signed-off-by: David S. Miller --- include/linux/skbuff.h | 13 +++++++++++++ include/trace/events/skb.h | 5 +++++ 2 files changed, 18 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2be263184d1e..67cfff4065b6 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -412,6 +412,19 @@ enum skb_drop_reason { * this means that L3 protocol is * not supported */ + SKB_DROP_REASON_SKB_CSUM, /* sk_buff checksum computation + * error + */ + SKB_DROP_REASON_SKB_GSO_SEG, /* gso segmentation error */ + SKB_DROP_REASON_SKB_UCOPY_FAULT, /* failed to copy data from + * user space, e.g., via + * zerocopy_sg_from_iter() + * or skb_orphan_frags_rx() + */ + SKB_DROP_REASON_DEV_HDR, /* device driver specific + * header/metadata is invalid + */ + SKB_DROP_REASON_FULL_RING, /* ring buffer is full */ SKB_DROP_REASON_MAX, }; diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index c0769d943f8e..240e7e7591fc 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -51,6 +51,11 @@ EM(SKB_DROP_REASON_XDP, XDP) \ EM(SKB_DROP_REASON_TC_INGRESS, TC_INGRESS) \ EM(SKB_DROP_REASON_PTYPE_ABSENT, PTYPE_ABSENT) \ + EM(SKB_DROP_REASON_SKB_CSUM, SKB_CSUM) \ + EM(SKB_DROP_REASON_SKB_GSO_SEG, SKB_GSO_SEG) \ + EM(SKB_DROP_REASON_SKB_UCOPY_FAULT, SKB_UCOPY_FAULT) \ + EM(SKB_DROP_REASON_DEV_HDR, DEV_HDR) \ + EM(SKB_DROP_REASON_FULL_RING, FULL_RING) \ EMe(SKB_DROP_REASON_MAX, MAX) #undef EM -- cgit v1.2.3 From 4b4f052e2d89c2eb7e13ee28ba9e85f8097aef3d Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Fri, 4 Mar 2022 06:55:07 -0800 Subject: net: tun: track dropped skb via kfree_skb_reason() The TUN can be used as vhost-net backend. E.g, the tun_net_xmit() is the interface to forward the skb from TUN to vhost-net/virtio-net. However, there are many "goto drop" in the TUN driver. Therefore, the kfree_skb_reason() is involved at each "goto drop" to help userspace ftrace/ebpf to track the reason for the loss of packets. The below reasons are introduced: - SKB_DROP_REASON_DEV_READY - SKB_DROP_REASON_NOMEM - SKB_DROP_REASON_HDR_TRUNC - SKB_DROP_REASON_TAP_FILTER - SKB_DROP_REASON_TAP_TXFILTER Cc: Joao Martins Cc: Joe Jin Signed-off-by: Dongli Zhang Signed-off-by: David S. Miller --- include/linux/skbuff.h | 18 ++++++++++++++++++ include/trace/events/skb.h | 5 +++++ 2 files changed, 23 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 67cfff4065b6..34f572271c0c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -424,7 +424,25 @@ enum skb_drop_reason { SKB_DROP_REASON_DEV_HDR, /* device driver specific * header/metadata is invalid */ + /* the device is not ready to xmit/recv due to any of its data + * structure that is not up/ready/initialized, e.g., the IFF_UP is + * not set, or driver specific tun->tfiles[txq] is not initialized + */ + SKB_DROP_REASON_DEV_READY, SKB_DROP_REASON_FULL_RING, /* ring buffer is full */ + SKB_DROP_REASON_NOMEM, /* error due to OOM */ + SKB_DROP_REASON_HDR_TRUNC, /* failed to trunc/extract the header + * from networking data, e.g., failed + * to pull the protocol header from + * frags via pskb_may_pull() + */ + SKB_DROP_REASON_TAP_FILTER, /* dropped by (ebpf) filter directly + * attached to tun/tap, e.g., via + * TUNSETFILTEREBPF + */ + SKB_DROP_REASON_TAP_TXFILTER, /* dropped by tx filter implemented + * at tun/tap, e.g., check_filter() + */ SKB_DROP_REASON_MAX, }; diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index 240e7e7591fc..e1670e1e4934 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -55,7 +55,12 @@ EM(SKB_DROP_REASON_SKB_GSO_SEG, SKB_GSO_SEG) \ EM(SKB_DROP_REASON_SKB_UCOPY_FAULT, SKB_UCOPY_FAULT) \ EM(SKB_DROP_REASON_DEV_HDR, DEV_HDR) \ + EM(SKB_DROP_REASON_DEV_READY, DEV_READY) \ EM(SKB_DROP_REASON_FULL_RING, FULL_RING) \ + EM(SKB_DROP_REASON_NOMEM, NOMEM) \ + EM(SKB_DROP_REASON_HDR_TRUNC, HDR_TRUNC) \ + EM(SKB_DROP_REASON_TAP_FILTER, TAP_FILTER) \ + EM(SKB_DROP_REASON_TAP_TXFILTER, TAP_TXFILTER) \ EMe(SKB_DROP_REASON_MAX, MAX) #undef EM -- cgit v1.2.3 From f72de02ebece2e962462bc0c1e9efd29eaa029b2 Mon Sep 17 00:00:00 2001 From: Kurt Kanzenbach Date: Sat, 5 Mar 2022 12:21:25 +0100 Subject: ptp: Add generic PTP is_sync() function PHY drivers such as micrel or dp83640 need to analyze whether a given skb is a PTP sync message for one step functionality. In order to avoid code duplication introduce a generic function and move it to ptp classify. Signed-off-by: Kurt Kanzenbach Signed-off-by: David S. Miller --- include/linux/ptp_classify.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/linux/ptp_classify.h b/include/linux/ptp_classify.h index 9afd34a2d36c..fefa7790dc46 100644 --- a/include/linux/ptp_classify.h +++ b/include/linux/ptp_classify.h @@ -126,6 +126,17 @@ static inline u8 ptp_get_msgtype(const struct ptp_header *hdr, return msgtype; } +/** + * ptp_msg_is_sync - Evaluates whether the given skb is a PTP Sync message + * @skb: packet buffer + * @type: type of the packet (see ptp_classify_raw()) + * + * This function evaluates whether the given skb is a PTP Sync message. + * + * Return: true if sync message, false otherwise + */ +bool ptp_msg_is_sync(struct sk_buff *skb, unsigned int type); + void __init ptp_classifier_init(void); #else static inline void ptp_classifier_init(void) @@ -148,5 +159,9 @@ static inline u8 ptp_get_msgtype(const struct ptp_header *hdr, */ return PTP_MSGTYPE_SYNC; } +static inline bool ptp_msg_is_sync(struct sk_buff *skb, unsigned int type) +{ + return false; +} #endif #endif /* _PTP_CLASSIFY_H_ */ -- cgit v1.2.3 From 2655926aea9beea62c9ba80c032485456fd848f0 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Sun, 6 Mar 2022 22:57:52 +0100 Subject: net: Remove netif_rx_any_context() and netif_rx_ni(). Remove netif_rx_any_context and netif_rx_ni() because there are no more users in tree. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: David S. Miller --- include/linux/netdevice.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 19a27ac361ef..29a850a8d460 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3718,16 +3718,6 @@ int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb); int netif_rx(struct sk_buff *skb); int __netif_rx(struct sk_buff *skb); -static inline int netif_rx_ni(struct sk_buff *skb) -{ - return netif_rx(skb); -} - -static inline int netif_rx_any_context(struct sk_buff *skb) -{ - return netif_rx(skb); -} - int netif_receive_skb(struct sk_buff *skb); int netif_receive_skb_core(struct sk_buff *skb); void netif_receive_skb_list_internal(struct list_head *head); -- cgit v1.2.3 From 64807c2321512f67959e51f09e302c145c336184 Mon Sep 17 00:00:00 2001 From: Arun Ramadoss Date: Mon, 7 Mar 2022 21:45:14 +0530 Subject: net: phy: exported the genphy_read_master_slave function genphy_read_master_slave function allows to configure the master/slave for gigabit phys only. In order to use this function irrespective of speed, moved the speed check to the genphy_read_status call. Signed-off-by: Arun Ramadoss Reviewed-by: Andrew Lunn Signed-off-by: Paolo Abeni --- include/linux/phy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index cd08cf1a8b0d..20beeaa7443b 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1578,6 +1578,7 @@ int genphy_update_link(struct phy_device *phydev); int genphy_read_lpa(struct phy_device *phydev); int genphy_read_status_fixed(struct phy_device *phydev); int genphy_read_status(struct phy_device *phydev); +int genphy_read_master_slave(struct phy_device *phydev); int genphy_suspend(struct phy_device *phydev); int genphy_resume(struct phy_device *phydev); int genphy_loopback(struct phy_device *phydev, bool enable); -- cgit v1.2.3 From 0eb4e7ee1655b7ffd3204a35d77b809d42613cb9 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Mon, 7 Mar 2022 12:44:31 -0800 Subject: mptcp: add tracepoint in mptcp_sendmsg_frag The tracepoint in get_mapping_status() only dumped the incoming mpext fields. This patch added a new tracepoint in mptcp_sendmsg_frag() to dump the outgoing mpext too. Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- include/trace/events/mptcp.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/trace/events/mptcp.h b/include/trace/events/mptcp.h index 6bf43176f14c..f8e28e686c65 100644 --- a/include/trace/events/mptcp.h +++ b/include/trace/events/mptcp.h @@ -115,6 +115,10 @@ DECLARE_EVENT_CLASS(mptcp_dump_mpext, __entry->csum_reqd) ); +DEFINE_EVENT(mptcp_dump_mpext, mptcp_sendmsg_frag, + TP_PROTO(struct mptcp_ext *mpext), + TP_ARGS(mpext)); + DEFINE_EVENT(mptcp_dump_mpext, get_mapping_status, TP_PROTO(struct mptcp_ext *mpext), TP_ARGS(mpext)); -- cgit v1.2.3 From d045b9eb95a9b611c483897a69e7285aefdc66d7 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 7 Mar 2022 12:44:36 -0800 Subject: mptcp: introduce implicit endpoints In some edge scenarios, an MPTCP subflows can use a local address mapped by a "implicit" endpoint created by the in-kernel path manager. Such endpoints presence can be confusing, as it's creation is hard to track and will prevent the later endpoint creation from the user-space using the same address. Define a new endpoint flag to mark implicit endpoints and allow the user-space to replace implicit them with user-provided data at endpoint creation time. Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index f106a3941cdf..9690efedb5fa 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -81,6 +81,7 @@ enum { #define MPTCP_PM_ADDR_FLAG_SUBFLOW (1 << 1) #define MPTCP_PM_ADDR_FLAG_BACKUP (1 << 2) #define MPTCP_PM_ADDR_FLAG_FULLMESH (1 << 3) +#define MPTCP_PM_ADDR_FLAG_IMPLICIT (1 << 4) enum { MPTCP_PM_CMD_UNSPEC, -- cgit v1.2.3 From 7e580490ac9819dd55a36be2a9b3380d1391f91b Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 8 Mar 2022 11:15:15 +0200 Subject: net: dsa: felix: avoid early deletion of host FDB entries The Felix driver declares FDB isolation but puts all standalone ports in VID 0. This is mostly problem-free as discussed with Alvin here: https://patchwork.kernel.org/project/netdevbpf/cover/20220302191417.1288145-1-vladimir.oltean@nxp.com/#24763870 however there is one catch. DSA still thinks that FDB entries are installed on the CPU port as many times as there are user ports, and this is problematic when multiple user ports share the same MAC address. Consider the default case where all user ports inherit their MAC address from the DSA master, and then the user runs: ip link set swp0 address 00:01:02:03:04:05 The above will make dsa_slave_set_mac_address() call dsa_port_standalone_host_fdb_add() for 00:01:02:03:04:05 in port 0's standalone database, and dsa_port_standalone_host_fdb_del() for the old address of swp0, again in swp0's standalone database. Both the ->port_fdb_add() and ->port_fdb_del() will be propagated down to the felix driver, which will end up deleting the old MAC address from the CPU port. But this is still in use by other user ports, so we end up breaking unicast termination for them. There isn't a problem in the fact that DSA keeps track of host standalone addresses in the individual database of each user port: some drivers like sja1105 need this. There also isn't a problem in the fact that some drivers choose the same VID/FID for all standalone ports. It is just that the deletion of these host addresses must be delayed until they are known to not be in use any longer, and only the driver has this knowledge. Since DSA keeps these addresses in &cpu_dp->fdbs and &cpu_db->mdbs, it is just a matter of walking over those lists and see whether the same MAC address is present on the CPU port in the port db of another user port. I have considered reusing the generic dsa_port_walk_fdbs() and dsa_port_walk_mdbs() schemes for this, but locking makes it difficult. In the ->port_fdb_add() method and co, &dp->addr_lists_lock is held, but dsa_port_walk_fdbs() also acquires that lock. Also, even assuming that we introduce an unlocked variant of the address iterator, we'd still need some relatively complex data structures, and a void *ctx in the dsa_fdb_walk_cb_t which we don't currently pass, such that drivers are able to figure out, after iterating, whether the same MAC address is or isn't present in the port db of another port. All the above, plus the fact that I expect other drivers to follow the same model as felix where all standalone ports use the same FID, made me conclude that a generic method provided by DSA is necessary: dsa_fdb_present_in_other_db() and the mdb equivalent. Felix calls this from the ->port_fdb_del() handler for the CPU port, when the database was classified to either a port db, or a LAG db. For symmetry, we also call this from ->port_fdb_add(), because if the address was installed once, then installing it a second time serves no purpose: it's already in hardware in VID 0 and it affects all standalone ports. This change moves dsa_db_equal() from switch.c to dsa.c, since it now has one more caller. Fixes: 54c319846086 ("net: mscc: ocelot: enforce FDB isolation when VLAN-unaware") Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/net/dsa.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 759479fe8573..9d16505fc0e2 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -1227,6 +1227,12 @@ typedef int dsa_fdb_walk_cb_t(struct dsa_switch *ds, int port, int dsa_port_walk_fdbs(struct dsa_switch *ds, int port, dsa_fdb_walk_cb_t cb); int dsa_port_walk_mdbs(struct dsa_switch *ds, int port, dsa_fdb_walk_cb_t cb); +bool dsa_fdb_present_in_other_db(struct dsa_switch *ds, int port, + const unsigned char *addr, u16 vid, + struct dsa_db db); +bool dsa_mdb_present_in_other_db(struct dsa_switch *ds, int port, + const struct switchdev_obj_port_mdb *mdb, + struct dsa_db db); /* Keep inline for faster access in hot path */ static inline bool netdev_uses_dsa(const struct net_device *dev) -- cgit v1.2.3 From 1330b6ef3313fcec577d2b020c290dc8b9f11f1a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 7 Mar 2022 16:44:21 -0800 Subject: skb: make drop reason booleanable We have a number of cases where function returns drop/no drop decision as a boolean. Now that we want to report the reason code as well we have to pass extra output arguments. We can make the reason code evaluate correctly as bool. I believe we're good to reorder the reasons as they are reported to user space as strings. Signed-off-by: Jakub Kicinski Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + include/net/tcp.h | 21 +++++++++++---------- 2 files changed, 12 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 34f572271c0c..26538ceb4b01 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -314,6 +314,7 @@ struct sk_buff; * used to translate the reason to string. */ enum skb_drop_reason { + SKB_NOT_DROPPED_YET = 0, SKB_DROP_REASON_NOT_SPECIFIED, /* drop reason is not specified */ SKB_DROP_REASON_NO_SOCKET, /* socket not found */ SKB_DROP_REASON_PKT_TOO_SMALL, /* packet size is too small */ diff --git a/include/net/tcp.h b/include/net/tcp.h index d486d7b6112d..ee8237b58e1d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1674,10 +1674,11 @@ tcp_md5_do_lookup(const struct sock *sk, int l3index, return NULL; return __tcp_md5_do_lookup(sk, l3index, addr, family); } -bool tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, - enum skb_drop_reason *reason, - const void *saddr, const void *daddr, - int family, int dif, int sdif); + +enum skb_drop_reason +tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, + const void *saddr, const void *daddr, + int family, int dif, int sdif); #define tcp_twsk_md5_key(twsk) ((twsk)->tw_md5_key) @@ -1688,13 +1689,13 @@ tcp_md5_do_lookup(const struct sock *sk, int l3index, { return NULL; } -static inline bool tcp_inbound_md5_hash(const struct sock *sk, - const struct sk_buff *skb, - enum skb_drop_reason *reason, - const void *saddr, const void *daddr, - int family, int dif, int sdif) + +static inline enum skb_drop_reason +tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, + const void *saddr, const void *daddr, + int family, int dif, int sdif); { - return false; + return SKB_NOT_DROPPED_YET; } #define tcp_twsk_md5_key(twsk) NULL #endif -- cgit v1.2.3 From 24055bb87977e0c687b54ebf7bac8715f3636bc3 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 9 Mar 2022 14:20:12 +0200 Subject: net: tcp: fix shim definition of tcp_inbound_md5_hash When CONFIG_TCP_MD5SIG isn't enabled, there is a compilation bug due to the fact that the static inline definition of tcp_inbound_md5_hash() has an unexpected semicolon. Remove it. Fixes: 1330b6ef3313 ("skb: make drop reason booleanable") Signed-off-by: Vladimir Oltean Link: https://lore.kernel.org/r/20220309122012.668986-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index ee8237b58e1d..70ca4a5e330a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1693,7 +1693,7 @@ tcp_md5_do_lookup(const struct sock *sk, int l3index, static inline enum skb_drop_reason tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, const void *saddr, const void *daddr, - int family, int dif, int sdif); + int family, int dif, int sdif) { return SKB_NOT_DROPPED_YET; } -- cgit v1.2.3 From 34f46ae0d4b38e83cfb26fb6f06b5b5efea47fdc Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Thu, 27 Jan 2022 15:22:21 +0200 Subject: net/mlx5: Add command failures data to debugfs Add new counters to command interface debugfs to count command failures. The following counters added: total_failed - number of times command failed (any kind of failure). failed_mbox_status - number of times command failed on bad status returned by FW. In addition, add data about last command failure to command interface debugfs: last_failed_errno - last command failed returned errno. last_failed_mbox_status - last bad status returned by FW. Signed-off-by: Moshe Shemesh Reviewed-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index d3b1a6a1f8d2..f18c1e15a12c 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -264,6 +264,14 @@ enum { struct mlx5_cmd_stats { u64 sum; u64 n; + /* number of times command failed */ + u64 failed; + /* number of times command failed on bad status returned by FW */ + u64 failed_mbox_status; + /* last command failed returned errno */ + u32 last_failed_errno; + /* last bad status returned by FW */ + u8 last_failed_mbox_status; struct dentry *root; /* protect command average calculations */ spinlock_t lock; @@ -955,6 +963,7 @@ typedef void (*mlx5_async_cbk_t)(int status, struct mlx5_async_work *context); struct mlx5_async_work { struct mlx5_async_ctx *ctx; mlx5_async_cbk_t user_callback; + u16 opcode; /* cmd opcode */ void *out; /* pointer to the cmd output buffer */ }; -- cgit v1.2.3 From d2cb8dda214fb75f946ba554a1ecd25da7004b2b Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Wed, 26 Jan 2022 07:23:53 +0200 Subject: net/mlx5: Change release_all_pages cap bit location mlx5 FW has changed release_all_pages cap bit by one bit offset to reflect a fix in the FW flow for release_all_pages. The driver should use the new bit to ensure it calls release_all_pages only if the FW fix is there. Signed-off-by: Moshe Shemesh Reviewed-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index ea65131835ab..69985e4d8dfe 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1419,8 +1419,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_130[0xa]; u8 log_max_ra_res_dc[0x6]; - u8 reserved_at_140[0x6]; + u8 reserved_at_140[0x5]; u8 release_all_pages[0x1]; + u8 must_not_use[0x1]; u8 reserved_at_147[0x2]; u8 roce_accl[0x1]; u8 log_max_ra_req_qp[0x6]; -- cgit v1.2.3 From 66771a1c729e816dc84e29ba555013b6e722fb22 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Fri, 18 Feb 2022 09:36:20 +0200 Subject: net/mlx5: Move debugfs entries to separate struct Move the debugfs entry pointers under priv to their own struct. Add get function for device debugfs root. Signed-off-by: Moshe Shemesh Reviewed-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index f18c1e15a12c..aa539d245a12 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -551,6 +551,14 @@ struct mlx5_adev { int idx; }; +struct mlx5_debugfs_entries { + struct dentry *dbg_root; + struct dentry *qp_debugfs; + struct dentry *eq_debugfs; + struct dentry *cq_debugfs; + struct dentry *cmdif_debugfs; +}; + struct mlx5_ft_pool; struct mlx5_priv { /* IRQ table valid only for real pci devices PF or VF */ @@ -570,12 +578,7 @@ struct mlx5_priv { struct mlx5_core_health health; struct list_head traps; - /* start: qp staff */ - struct dentry *qp_debugfs; - struct dentry *eq_debugfs; - struct dentry *cq_debugfs; - struct dentry *cmdif_debugfs; - /* end: qp staff */ + struct mlx5_debugfs_entries dbg; /* start: alloc staff */ /* protect buffer allocation according to numa node */ @@ -585,7 +588,6 @@ struct mlx5_priv { struct mutex pgdir_mutex; struct list_head pgdir_list; /* end: alloc staff */ - struct dentry *dbg_root; struct list_head ctx_list; spinlock_t ctx_lock; @@ -1038,6 +1040,7 @@ int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn); int mlx5_core_attach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn); int mlx5_core_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn); +struct dentry *mlx5_debugfs_get_dev_root(struct mlx5_core_dev *dev); void mlx5_qp_debugfs_init(struct mlx5_core_dev *dev); void mlx5_qp_debugfs_cleanup(struct mlx5_core_dev *dev); int mlx5_access_reg(struct mlx5_core_dev *dev, void *data_in, int size_in, -- cgit v1.2.3 From 4e05cbf05c66ca6931ee4f2a5aff0d84f1e65f40 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Thu, 27 Jan 2022 07:03:33 +0200 Subject: net/mlx5: Add pages debugfs Add pages debugfs to expose the following counters for debuggability: fw_pages_total - How many pages were given to FW and not returned yet. vfs_pages - For SRIOV, how many pages were given to FW for virtual functions usage. host_pf_pages - For ECPF, how many pages were given to FW for external hosts physical functions usage. Signed-off-by: Moshe Shemesh Reviewed-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index aa539d245a12..c5f93b5910ed 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -557,6 +557,7 @@ struct mlx5_debugfs_entries { struct dentry *eq_debugfs; struct dentry *cq_debugfs; struct dentry *cmdif_debugfs; + struct dentry *pages_debugfs; }; struct mlx5_ft_pool; @@ -569,11 +570,11 @@ struct mlx5_priv { struct mlx5_nb pg_nb; struct workqueue_struct *pg_wq; struct xarray page_root_xa; - int fw_pages; + u32 fw_pages; atomic_t reg_pages; struct list_head free_list; - int vfs_pages; - int host_pf_pages; + u32 vfs_pages; + u32 host_pf_pages; struct mlx5_core_health health; struct list_head traps; @@ -1026,6 +1027,8 @@ int mlx5_pagealloc_init(struct mlx5_core_dev *dev); void mlx5_pagealloc_cleanup(struct mlx5_core_dev *dev); void mlx5_pagealloc_start(struct mlx5_core_dev *dev); void mlx5_pagealloc_stop(struct mlx5_core_dev *dev); +void mlx5_pages_debugfs_init(struct mlx5_core_dev *dev); +void mlx5_pages_debugfs_cleanup(struct mlx5_core_dev *dev); void mlx5_core_req_pages_handler(struct mlx5_core_dev *dev, u16 func_id, s32 npages, bool ec_function); int mlx5_satisfy_startup_pages(struct mlx5_core_dev *dev, int boot); -- cgit v1.2.3 From 32071187e9fb18da62f5be569bd2ea0d7a981ee8 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Thu, 27 Jan 2022 07:51:14 +0200 Subject: net/mlx5: Add debugfs counters for page commands failures Add the following new debugfs counters for debug and verbosity: fw_pages_alloc_failed - number of pages FW requested but driver failed to allocate. give_pages_dropped - number of pages given to FW, but command give pages failed by FW. reclaim_pages_discard - number of pages which were about to reclaim back and FW failed the command. Signed-off-by: Moshe Shemesh Reviewed-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index c5f93b5910ed..00a914b0716e 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -575,6 +575,9 @@ struct mlx5_priv { struct list_head free_list; u32 vfs_pages; u32 host_pf_pages; + u32 fw_pages_alloc_failed; + u32 give_pages_dropped; + u32 reclaim_pages_discard; struct mlx5_core_health health; struct list_head traps; -- cgit v1.2.3 From 5c422bfad2fbab96381273e50c7084465199501d Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Thu, 24 Feb 2022 00:58:58 +0200 Subject: net/mlx5: DR, Add support for matching on Internet Header Length (IHL) Add support for matching on new field - Internet Header Length (IHL). Signed-off-by: Muhammad Sammar Signed-off-by: Yevgeny Kliteynik Reviewed-by: Alex Vesker Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 69985e4d8dfe..1f0c35162b7b 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -493,7 +493,10 @@ struct mlx5_ifc_fte_match_set_lyr_2_4_bits { u8 tcp_sport[0x10]; u8 tcp_dport[0x10]; - u8 reserved_at_c0[0x18]; + u8 reserved_at_c0[0x10]; + u8 ipv4_ihl[0x4]; + u8 reserved_at_c4[0x4]; + u8 ttl_hoplimit[0x8]; u8 udp_sport[0x10]; -- cgit v1.2.3 From 6862c787c7e88df490675ed781dc9052dba88a56 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Wed, 23 Feb 2022 18:40:59 +0200 Subject: net/mlx5: DR, Add support for ConnectX-7 steering Add support for a new SW format version that is implemented by ConnectX-7. Except for several differences, the STEv2 is identical to STEv1, so for most callbacks the STEv2 context struct will call STEv1 functions. Signed-off-by: Yevgeny Kliteynik Reviewed-by: Alex Vesker Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 1f0c35162b7b..bcf60ede6fcc 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1346,6 +1346,7 @@ enum mlx5_fc_bulk_alloc_bitmask { enum { MLX5_STEERING_FORMAT_CONNECTX_5 = 0, MLX5_STEERING_FORMAT_CONNECTX_6DX = 1, + MLX5_STEERING_FORMAT_CONNECTX_7 = 2, }; struct mlx5_ifc_cmd_hca_cap_bits { -- cgit v1.2.3 From b530e9e1063ed2b817eae7eec6ed2daa8be11608 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Wed, 9 Mar 2022 11:53:42 +0100 Subject: bpf: Add "live packet" mode for XDP in BPF_PROG_RUN MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds support for running XDP programs through BPF_PROG_RUN in a mode that enables live packet processing of the resulting frames. Previous uses of BPF_PROG_RUN for XDP returned the XDP program return code and the modified packet data to userspace, which is useful for unit testing of XDP programs. The existing BPF_PROG_RUN for XDP allows userspace to set the ingress ifindex and RXQ number as part of the context object being passed to the kernel. This patch reuses that code, but adds a new mode with different semantics, which can be selected with the new BPF_F_TEST_XDP_LIVE_FRAMES flag. When running BPF_PROG_RUN in this mode, the XDP program return codes will be honoured: returning XDP_PASS will result in the frame being injected into the networking stack as if it came from the selected networking interface, while returning XDP_TX and XDP_REDIRECT will result in the frame being transmitted out that interface. XDP_TX is translated into an XDP_REDIRECT operation to the same interface, since the real XDP_TX action is only possible from within the network drivers themselves, not from the process context where BPF_PROG_RUN is executed. Internally, this new mode of operation creates a page pool instance while setting up the test run, and feeds pages from that into the XDP program. The setup cost of this is amortised over the number of repetitions specified by userspace. To support the performance testing use case, we further optimise the setup step so that all pages in the pool are pre-initialised with the packet data, and pre-computed context and xdp_frame objects stored at the start of each page. This makes it possible to entirely avoid touching the page content on each XDP program invocation, and enables sending up to 9 Mpps/core on my test box. Because the data pages are recycled by the page pool, and the test runner doesn't re-initialise them for each run, subsequent invocations of the XDP program will see the packet data in the state it was after the last time it ran on that particular page. This means that an XDP program that modifies the packet before redirecting it has to be careful about which assumptions it makes about the packet content, but that is only an issue for the most naively written programs. Enabling the new flag is only allowed when not setting ctx_out and data_out in the test specification, since using it means frames will be redirected somewhere else, so they can't be returned. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20220309105346.100053-2-toke@redhat.com --- include/uapi/linux/bpf.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4eebea830613..bc23020b638d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1232,6 +1232,8 @@ enum { /* If set, run the test on the cpu specified by bpf_attr.test.cpu */ #define BPF_F_TEST_RUN_ON_CPU (1U << 0) +/* If set, XDP frames will be transmitted after processing */ +#define BPF_F_TEST_XDP_LIVE_FRAMES (1U << 1) /* type for BPF_ENABLE_STATS */ enum bpf_stats_type { @@ -1393,6 +1395,7 @@ union bpf_attr { __aligned_u64 ctx_out; __u32 flags; __u32 cpu; + __u32 batch_size; } test; struct { /* anonymous struct used by BPF_*_GET_*_ID */ -- cgit v1.2.3 From 013a3e7c79ac51e6cdd84e3580863a562c7597c7 Mon Sep 17 00:00:00 2001 From: Min Li Date: Tue, 8 Mar 2022 09:10:51 -0500 Subject: ptp: idt82p33: use rsmu driver to access i2c/spi bus rsmu (Renesas Synchronization Management Unit ) driver is located in drivers/mfd and responsible for creating multiple devices including idt82p33 phc, which will then use the exposed regmap and mutex handle to access i2c/spi bus. Signed-off-by: Min Li Acked-by: Richard Cochran Link: https://lore.kernel.org/r/1646748651-16811-1-git-send-email-min.li.xe@renesas.com Signed-off-by: Jakub Kicinski --- include/linux/mfd/idt82p33_reg.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/mfd/idt82p33_reg.h b/include/linux/mfd/idt82p33_reg.h index 129a6c078221..1db532feeb91 100644 --- a/include/linux/mfd/idt82p33_reg.h +++ b/include/linux/mfd/idt82p33_reg.h @@ -7,6 +7,8 @@ #ifndef HAVE_IDT82P33_REG #define HAVE_IDT82P33_REG +#define REG_ADDR(page, offset) (((page) << 0x7) | ((offset) & 0x7f)) + /* Register address */ #define DPLL1_TOD_CNFG 0x134 #define DPLL2_TOD_CNFG 0x1B4 @@ -41,6 +43,7 @@ #define REG_SOFT_RESET 0X381 #define OUT_MUX_CNFG(outn) REG_ADDR(0x6, (0xC * (outn))) +#define TOD_TRIGGER(wr_trig, rd_trig) ((wr_trig & 0xf) << 4 | (rd_trig & 0xf)) /* Register bit definitions */ #define SYNC_TOD BIT(1) -- cgit v1.2.3 From 77f09e66f613e4801134c64d26a0be593588a42e Mon Sep 17 00:00:00 2001 From: Dimitris Michailidis Date: Tue, 8 Mar 2022 19:40:31 -0800 Subject: net/tls: Provide {__,}tls_driver_ctx() unconditionally Having the definitions of {__,}tls_driver_ctx() under an #if guard means code referencing them also needs to rely on the preprocessor. The protection doesn't appear needed so make the definitions unconditional. Fixes: db37bc177dae ("net/funeth: add the data path") Reported-by: Randy Dunlap Reported-by: kernel test robot Suggested-by: Jakub Kicinski Signed-off-by: Dimitris Michailidis Signed-off-by: Jakub Kicinski --- include/net/tls.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/net/tls.h b/include/net/tls.h index 526cb2c3b724..b6968a5b5538 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -626,7 +626,6 @@ tls_offload_ctx_rx(const struct tls_context *tls_ctx) return (struct tls_offload_context_rx *)tls_ctx->priv_ctx_rx; } -#if IS_ENABLED(CONFIG_TLS_DEVICE) static inline void *__tls_driver_ctx(struct tls_context *tls_ctx, enum tls_offload_ctx_dir direction) { @@ -641,7 +640,6 @@ tls_driver_ctx(const struct sock *sk, enum tls_offload_ctx_dir direction) { return __tls_driver_ctx(tls_get_ctx(sk), direction); } -#endif #define RESYNC_REQ BIT(0) #define RESYNC_REQ_ASYNC BIT(1) -- cgit v1.2.3 From 65466904b015f6eeb9225b51aeb29b01a1d4b59c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 8 Mar 2022 17:57:57 -0800 Subject: tcp: adjust TSO packet sizes based on min_rtt Back when tcp_tso_autosize() and TCP pacing were introduced, our focus was really to reduce burst sizes for long distance flows. The simple heuristic of using sk_pacing_rate/1024 has worked well, but can lead to too small packets for hosts in the same rack/cluster, when thousands of flows compete for the bottleneck. Neal Cardwell had the idea of making the TSO burst size a function of both sk_pacing_rate and tcp_min_rtt() Indeed, for local flows, sending bigger bursts is better to reduce cpu costs, as occasional losses can be repaired quite fast. This patch is based on Neal Cardwell implementation done more than two years ago. bbr is adjusting max_pacing_rate based on measured bandwidth, while cubic would over estimate max_pacing_rate. /proc/sys/net/ipv4/tcp_tso_rtt_log can be used to tune or disable this new feature, in logarithmic steps. Tested: 100Gbit NIC, two hosts in the same rack, 4K MTU. 600 flows rate-limited to 20000000 bytes per second. Before patch: (TSO sizes would be limited to 20000000/1024/4096 -> 4 segments per TSO) ~# echo 0 >/proc/sys/net/ipv4/tcp_tso_rtt_log ~# nstat -n;perf stat ./super_netperf 600 -H otrv6 -l 20 -- -K dctcp -q 20000000;nstat|egrep "TcpInSegs|TcpOutSegs|TcpRetransSegs|Delivered" 96005 Performance counter stats for './super_netperf 600 -H otrv6 -l 20 -- -K dctcp -q 20000000': 65,945.29 msec task-clock # 2.845 CPUs utilized 1,314,632 context-switches # 19935.279 M/sec 5,292 cpu-migrations # 80.249 M/sec 940,641 page-faults # 14264.023 M/sec 201,117,030,926 cycles # 3049769.216 GHz (83.45%) 17,699,435,405 stalled-cycles-frontend # 8.80% frontend cycles idle (83.48%) 136,584,015,071 stalled-cycles-backend # 67.91% backend cycles idle (83.44%) 53,809,530,436 instructions # 0.27 insn per cycle # 2.54 stalled cycles per insn (83.36%) 9,062,315,523 branches # 137422329.563 M/sec (83.22%) 153,008,621 branch-misses # 1.69% of all branches (83.32%) 23.182970846 seconds time elapsed TcpInSegs 15648792 0.0 TcpOutSegs 58659110 0.0 # Average of 3.7 4K segments per TSO packet TcpExtTCPDelivered 58654791 0.0 TcpExtTCPDeliveredCE 19 0.0 After patch: ~# echo 9 >/proc/sys/net/ipv4/tcp_tso_rtt_log ~# nstat -n;perf stat ./super_netperf 600 -H otrv6 -l 20 -- -K dctcp -q 20000000;nstat|egrep "TcpInSegs|TcpOutSegs|TcpRetransSegs|Delivered" 96046 Performance counter stats for './super_netperf 600 -H otrv6 -l 20 -- -K dctcp -q 20000000': 48,982.58 msec task-clock # 2.104 CPUs utilized 186,014 context-switches # 3797.599 M/sec 3,109 cpu-migrations # 63.472 M/sec 941,180 page-faults # 19214.814 M/sec 153,459,763,868 cycles # 3132982.807 GHz (83.56%) 12,069,861,356 stalled-cycles-frontend # 7.87% frontend cycles idle (83.32%) 120,485,917,953 stalled-cycles-backend # 78.51% backend cycles idle (83.24%) 36,803,672,106 instructions # 0.24 insn per cycle # 3.27 stalled cycles per insn (83.18%) 5,947,266,275 branches # 121417383.427 M/sec (83.64%) 87,984,616 branch-misses # 1.48% of all branches (83.43%) 23.281200256 seconds time elapsed TcpInSegs 1434706 0.0 TcpOutSegs 58883378 0.0 # Average of 41 4K segments per TSO packet TcpExtTCPDelivered 58878971 0.0 TcpExtTCPDeliveredCE 9664 0.0 Signed-off-by: Eric Dumazet Reviewed-by: Neal Cardwell Link: https://lore.kernel.org/r/20220309015757.2532973-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/net/netns/ipv4.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index f0687867b5cd..ce0cc4e8d8c7 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -127,6 +127,7 @@ struct netns_ipv4 { u8 sysctl_tcp_synack_retries; u8 sysctl_tcp_syncookies; u8 sysctl_tcp_migrate_req; + u8 sysctl_tcp_comp_sack_nr; int sysctl_tcp_reordering; u8 sysctl_tcp_retries1; u8 sysctl_tcp_retries2; @@ -160,9 +161,9 @@ struct netns_ipv4 { int sysctl_tcp_challenge_ack_limit; int sysctl_tcp_min_rtt_wlen; u8 sysctl_tcp_min_tso_segs; + u8 sysctl_tcp_tso_rtt_log; u8 sysctl_tcp_autocorking; u8 sysctl_tcp_reflect_tos; - u8 sysctl_tcp_comp_sack_nr; int sysctl_tcp_invalid_ratelimit; int sysctl_tcp_pacing_ss_ratio; int sysctl_tcp_pacing_ca_ratio; -- cgit v1.2.3 From 530e0d46c61314c59ecfdb8d3bcb87edbc0f85d3 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Wed, 9 Mar 2022 13:04:13 +0100 Subject: can: isotp: set default value for N_As to 50 micro seconds The N_As value describes the time a CAN frame needs on the wire when transmitted by the CAN controller. Even very short CAN FD frames need arround 100 usecs (bitrate 1Mbit/s, data bitrate 8Mbit/s). Having N_As to be zero (the former default) leads to 'no CAN frame separation' when STmin is set to zero by the receiving node. This 'burst mode' should not be enabled by default as it could potentially dump a high number of CAN frames into the netdev queue from the soft hrtimer context. This does not affect the system stability but is just not nice and cooperative. With this N_As/frame_txtime value the 'burst mode' is disabled by default. As user space applications usually do not set the frame_txtime element of struct can_isotp_options the new in-kernel default is very likely overwritten with zero when the sockopt() CAN_ISOTP_OPTS is invoked. To make sure that a N_As value of zero is only set intentional the value '0' is now interpreted as 'do not change the current value'. When a frame_txtime of zero is required for testing purposes this CAN_ISOTP_FRAME_TXTIME_ZERO u32 value has to be set in frame_txtime. Link: https://lore.kernel.org/all/20220309120416.83514-2-socketcan@hartkopp.net Signed-off-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- include/uapi/linux/can/isotp.h | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/can/isotp.h b/include/uapi/linux/can/isotp.h index c55935b64ccc..590f8aea2b6d 100644 --- a/include/uapi/linux/can/isotp.h +++ b/include/uapi/linux/can/isotp.h @@ -137,20 +137,16 @@ struct can_isotp_ll_options { #define CAN_ISOTP_WAIT_TX_DONE 0x400 /* wait for tx completion */ #define CAN_ISOTP_SF_BROADCAST 0x800 /* 1-to-N functional addressing */ -/* default values */ +/* protocol machine default values */ #define CAN_ISOTP_DEFAULT_FLAGS 0 #define CAN_ISOTP_DEFAULT_EXT_ADDRESS 0x00 #define CAN_ISOTP_DEFAULT_PAD_CONTENT 0xCC /* prevent bit-stuffing */ -#define CAN_ISOTP_DEFAULT_FRAME_TXTIME 0 +#define CAN_ISOTP_DEFAULT_FRAME_TXTIME 50000 /* 50 micro seconds */ #define CAN_ISOTP_DEFAULT_RECV_BS 0 #define CAN_ISOTP_DEFAULT_RECV_STMIN 0x00 #define CAN_ISOTP_DEFAULT_RECV_WFTMAX 0 -#define CAN_ISOTP_DEFAULT_LL_MTU CAN_MTU -#define CAN_ISOTP_DEFAULT_LL_TX_DL CAN_MAX_DLEN -#define CAN_ISOTP_DEFAULT_LL_TX_FLAGS 0 - /* * Remark on CAN_ISOTP_DEFAULT_RECV_* values: * @@ -162,4 +158,24 @@ struct can_isotp_ll_options { * consistency and copied directly into the flow control (FC) frame. */ +/* link layer default values => make use of Classical CAN frames */ + +#define CAN_ISOTP_DEFAULT_LL_MTU CAN_MTU +#define CAN_ISOTP_DEFAULT_LL_TX_DL CAN_MAX_DLEN +#define CAN_ISOTP_DEFAULT_LL_TX_FLAGS 0 + +/* + * The CAN_ISOTP_DEFAULT_FRAME_TXTIME has become a non-zero value as + * it only makes sense for isotp implementation tests to run without + * a N_As value. As user space applications usually do not set the + * frame_txtime element of struct can_isotp_options the new in-kernel + * default is very likely overwritten with zero when the sockopt() + * CAN_ISOTP_OPTS is invoked. + * To make sure that a N_As value of zero is only set intentional the + * value '0' is now interpreted as 'do not change the current value'. + * When a frame_txtime of zero is required for testing purposes this + * CAN_ISOTP_FRAME_TXTIME_ZERO u32 value has to be set in frame_txtime. + */ +#define CAN_ISOTP_FRAME_TXTIME_ZERO 0xFFFFFFFF + #endif /* !_UAPI_CAN_ISOTP_H */ -- cgit v1.2.3 From 3b5d4ddf8fe1f60082513f94bae586ac80188a03 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 9 Mar 2022 01:04:50 -0800 Subject: bpf: net: Remove TC_AT_INGRESS_OFFSET and SKB_MONO_DELIVERY_TIME_OFFSET macro This patch removes the TC_AT_INGRESS_OFFSET and SKB_MONO_DELIVERY_TIME_OFFSET macros. Instead, PKT_VLAN_PRESENT_OFFSET is used because all of them are at the same offset. Comment is added to make it clear that changing the position of tc_at_ingress or mono_delivery_time will require to adjust the defined macros. The earlier discussion can be found here: https://lore.kernel.org/bpf/419d994e-ff61-7c11-0ec7-11fefcb0186e@iogearbox.net/ Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220309090450.3710955-1-kafai@fb.com --- include/linux/skbuff.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2be263184d1e..7b0cb10e70cb 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -960,10 +960,10 @@ struct sk_buff { __u8 csum_complete_sw:1; __u8 csum_level:2; __u8 dst_pending_confirm:1; - __u8 mono_delivery_time:1; + __u8 mono_delivery_time:1; /* See SKB_MONO_DELIVERY_TIME_MASK */ #ifdef CONFIG_NET_CLS_ACT __u8 tc_skip_classify:1; - __u8 tc_at_ingress:1; + __u8 tc_at_ingress:1; /* See TC_AT_INGRESS_MASK */ #endif #ifdef CONFIG_IPV6_NDISC_NODETYPE __u8 ndisc_nodetype:2; @@ -1062,7 +1062,9 @@ struct sk_buff { #endif #define PKT_TYPE_OFFSET offsetof(struct sk_buff, __pkt_type_offset) -/* if you move pkt_vlan_present around you also must adapt these constants */ +/* if you move pkt_vlan_present, tc_at_ingress, or mono_delivery_time + * around, you also must adapt these constants. + */ #ifdef __BIG_ENDIAN_BITFIELD #define PKT_VLAN_PRESENT_BIT 7 #define TC_AT_INGRESS_MASK (1 << 0) @@ -1073,8 +1075,6 @@ struct sk_buff { #define SKB_MONO_DELIVERY_TIME_MASK (1 << 5) #endif #define PKT_VLAN_PRESENT_OFFSET offsetof(struct sk_buff, __pkt_vlan_present_offset) -#define TC_AT_INGRESS_OFFSET offsetof(struct sk_buff, __pkt_vlan_present_offset) -#define SKB_MONO_DELIVERY_TIME_OFFSET offsetof(struct sk_buff, __pkt_vlan_present_offset) #ifdef __KERNEL__ /* -- cgit v1.2.3 From 9bb984f28d5bcb917d35d930fcfb89f90f9449fd Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 9 Mar 2022 01:05:09 -0800 Subject: bpf: Remove BPF_SKB_DELIVERY_TIME_NONE and rename s/delivery_time_/tstamp_/ This patch is to simplify the uapi bpf.h regarding to the tstamp type and use a similar way as the kernel to describe the value stored in __sk_buff->tstamp. My earlier thought was to avoid describing the semantic and clock base for the rcv timestamp until there is more clarity on the use case, so the __sk_buff->delivery_time_type naming instead of __sk_buff->tstamp_type. With some thoughts, it can reuse the UNSPEC naming. This patch first removes BPF_SKB_DELIVERY_TIME_NONE and also rename BPF_SKB_DELIVERY_TIME_UNSPEC to BPF_SKB_TSTAMP_UNSPEC and BPF_SKB_DELIVERY_TIME_MONO to BPF_SKB_TSTAMP_DELIVERY_MONO. The semantic of BPF_SKB_TSTAMP_DELIVERY_MONO is the same: __sk_buff->tstamp has delivery time in mono clock base. BPF_SKB_TSTAMP_UNSPEC means __sk_buff->tstamp has the (rcv) tstamp at ingress and the delivery time at egress. At egress, the clock base could be found from skb->sk->sk_clockid. __sk_buff->tstamp == 0 naturally means NONE, so NONE is not needed. With BPF_SKB_TSTAMP_UNSPEC for the rcv tstamp at ingress, the __sk_buff->delivery_time_type is also renamed to __sk_buff->tstamp_type which was also suggested in the earlier discussion: https://lore.kernel.org/bpf/b181acbe-caf8-502d-4b7b-7d96b9fc5d55@iogearbox.net/ The above will then make __sk_buff->tstamp and __sk_buff->tstamp_type the same as its kernel skb->tstamp and skb->mono_delivery_time counter part. The internal kernel function bpf_skb_convert_dtime_type_read() is then renamed to bpf_skb_convert_tstamp_type_read() and it can be simplified with the BPF_SKB_DELIVERY_TIME_NONE gone. A BPF_ALU32_IMM(BPF_AND) insn is also saved by using BPF_JMP32_IMM(BPF_JSET). The bpf helper bpf_skb_set_delivery_time() is also renamed to bpf_skb_set_tstamp(). The arg name is changed from dtime to tstamp also. It only allows setting tstamp 0 for BPF_SKB_TSTAMP_UNSPEC and it could be relaxed later if there is use case to change mono delivery time to non mono. prog->delivery_time_access is also renamed to prog->tstamp_type_access. Signed-off-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220309090509.3712315-1-kafai@fb.com --- include/linux/filter.h | 2 +- include/uapi/linux/bpf.h | 40 +++++++++++++++++++++------------------- 2 files changed, 22 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 9bf26307247f..05ed9bd31b45 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -573,7 +573,7 @@ struct bpf_prog { enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */ call_get_stack:1, /* Do we call bpf_get_stack() or bpf_get_stackid() */ call_get_func_ip:1, /* Do we call get_func_ip() */ - delivery_time_access:1; /* Accessed __sk_buff->delivery_time_type */ + tstamp_type_access:1; /* Accessed __sk_buff->tstamp_type */ enum bpf_prog_type type; /* Type of BPF program */ enum bpf_attach_type expected_attach_type; /* For some prog types */ u32 len; /* Number of filter blocks */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bc23020b638d..d288a0a9f797 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5090,23 +5090,22 @@ union bpf_attr { * 0 on success, or a negative error in case of failure. On error * *dst* buffer is zeroed out. * - * long bpf_skb_set_delivery_time(struct sk_buff *skb, u64 dtime, u32 dtime_type) + * long bpf_skb_set_tstamp(struct sk_buff *skb, u64 tstamp, u32 tstamp_type) * Description - * Set a *dtime* (delivery time) to the __sk_buff->tstamp and also - * change the __sk_buff->delivery_time_type to *dtime_type*. + * Change the __sk_buff->tstamp_type to *tstamp_type* + * and set *tstamp* to the __sk_buff->tstamp together. * - * When setting a delivery time (non zero *dtime*) to - * __sk_buff->tstamp, only BPF_SKB_DELIVERY_TIME_MONO *dtime_type* - * is supported. It is the only delivery_time_type that will be - * kept after bpf_redirect_*(). - * - * If there is no need to change the __sk_buff->delivery_time_type, - * the delivery time can be directly written to __sk_buff->tstamp + * If there is no need to change the __sk_buff->tstamp_type, + * the tstamp value can be directly written to __sk_buff->tstamp * instead. * - * *dtime* 0 and *dtime_type* BPF_SKB_DELIVERY_TIME_NONE - * can be used to clear any delivery time stored in - * __sk_buff->tstamp. + * BPF_SKB_TSTAMP_DELIVERY_MONO is the only tstamp that + * will be kept during bpf_redirect_*(). A non zero + * *tstamp* must be used with the BPF_SKB_TSTAMP_DELIVERY_MONO + * *tstamp_type*. + * + * A BPF_SKB_TSTAMP_UNSPEC *tstamp_type* can only be used + * with a zero *tstamp*. * * Only IPv4 and IPv6 skb->protocol are supported. * @@ -5119,7 +5118,7 @@ union bpf_attr { * Return * 0 on success. * **-EINVAL** for invalid input - * **-EOPNOTSUPP** for unsupported delivery_time_type and protocol + * **-EOPNOTSUPP** for unsupported protocol */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5314,7 +5313,7 @@ union bpf_attr { FN(xdp_load_bytes), \ FN(xdp_store_bytes), \ FN(copy_from_user_task), \ - FN(skb_set_delivery_time), \ + FN(skb_set_tstamp), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -5505,9 +5504,12 @@ union { \ } __attribute__((aligned(8))) enum { - BPF_SKB_DELIVERY_TIME_NONE, - BPF_SKB_DELIVERY_TIME_UNSPEC, - BPF_SKB_DELIVERY_TIME_MONO, + BPF_SKB_TSTAMP_UNSPEC, + BPF_SKB_TSTAMP_DELIVERY_MONO, /* tstamp has mono delivery time */ + /* For any BPF_SKB_TSTAMP_* that the bpf prog cannot handle, + * the bpf prog should handle it like BPF_SKB_TSTAMP_UNSPEC + * and try to deduce it by ingress, egress or skb->sk->sk_clockid. + */ }; /* user accessible mirror of in-kernel sk_buff. @@ -5550,7 +5552,7 @@ struct __sk_buff { __u32 gso_segs; __bpf_md_ptr(struct bpf_sock *, sk); __u32 gso_size; - __u8 delivery_time_type; + __u8 tstamp_type; __u32 :24; /* Padding, future use. */ __u64 hwtstamp; }; -- cgit v1.2.3 From 58617014405ad5c9f94f464444f4972dabb71ca7 Mon Sep 17 00:00:00 2001 From: Hengqi Chen Date: Thu, 10 Mar 2022 23:53:35 +0800 Subject: bpf: Fix comment for helper bpf_current_task_under_cgroup() Fix the descriptions of the return values of helper bpf_current_task_under_cgroup(). Fixes: c6b5fb8690fa ("bpf: add documentation for eBPF helpers (42-50)") Signed-off-by: Hengqi Chen Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220310155335.1278783-1-hengqi.chen@gmail.com --- include/uapi/linux/bpf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d288a0a9f797..e9978a916c3e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2302,8 +2302,8 @@ union bpf_attr { * Return * The return value depends on the result of the test, and can be: * - * * 0, if current task belongs to the cgroup2. - * * 1, if current task does not belong to the cgroup2. + * * 1, if current task belongs to the cgroup2. + * * 0, if current task does not belong to the cgroup2. * * A negative error code, if an error occurred. * * long bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags) -- cgit v1.2.3 From 174b16946e39ebd369097e0f773536c91a8c1a4c Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Wed, 2 Mar 2022 12:13:58 +0100 Subject: bpf-lsm: Introduce new helper bpf_ima_file_hash() ima_file_hash() has been modified to calculate the measurement of a file on demand, if it has not been already performed by IMA or the measurement is not fresh. For compatibility reasons, ima_inode_hash() remains unchanged. Keep the same approach in eBPF and introduce the new helper bpf_ima_file_hash() to take advantage of the modified behavior of ima_file_hash(). Signed-off-by: Roberto Sassu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220302111404.193900-4-roberto.sassu@huawei.com --- include/uapi/linux/bpf.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e9978a916c3e..99fab54ae9c0 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5119,6 +5119,16 @@ union bpf_attr { * 0 on success. * **-EINVAL** for invalid input * **-EOPNOTSUPP** for unsupported protocol + * + * long bpf_ima_file_hash(struct file *file, void *dst, u32 size) + * Description + * Returns a calculated IMA hash of the *file*. + * If the hash is larger than *size*, then only *size* + * bytes will be copied to *dst* + * Return + * The **hash_algo** is returned on success, + * **-EOPNOTSUP** if the hash calculation failed or **-EINVAL** if + * invalid arguments are passed. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5314,6 +5324,7 @@ union bpf_attr { FN(xdp_store_bytes), \ FN(copy_from_user_task), \ FN(skb_set_tstamp), \ + FN(ima_file_hash), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 6789ab9668d920d7be636cb994d85be9a816f9d5 Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Thu, 10 Mar 2022 13:16:55 -0800 Subject: compiler_types: Refactor the use of btf_type_tag attribute. Previous patches have introduced the compiler attribute btf_type_tag for __user and __percpu. The availability of this attribute depends on some CONFIGs and compiler support. This patch refactors the use of btf_type_tag by introducing BTF_TYPE_TAG, which hides all the dependencies. No functional change. Suggested-by: Andrii Nakryiko Signed-off-by: Hao Luo Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20220310211655.3173786-1-haoluo@google.com --- include/linux/compiler_types.h | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index b9a8ae9440c7..1bc760ba400c 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -4,6 +4,13 @@ #ifndef __ASSEMBLY__ +#if defined(CONFIG_DEBUG_INFO_BTF) && defined(CONFIG_PAHOLE_HAS_BTF_TAG) && \ + __has_attribute(btf_type_tag) +# define BTF_TYPE_TAG(value) __attribute__((btf_type_tag(#value))) +#else +# define BTF_TYPE_TAG(value) /* nothing */ +#endif + #ifdef __CHECKER__ /* address spaces */ # define __kernel __attribute__((address_space(0))) @@ -31,19 +38,11 @@ static inline void __chk_io_ptr(const volatile void __iomem *ptr) { } # define __kernel # ifdef STRUCTLEAK_PLUGIN # define __user __attribute__((user)) -# elif defined(CONFIG_DEBUG_INFO_BTF) && defined(CONFIG_PAHOLE_HAS_BTF_TAG) && \ - __has_attribute(btf_type_tag) -# define __user __attribute__((btf_type_tag("user"))) # else -# define __user +# define __user BTF_TYPE_TAG(user) # endif # define __iomem -# if defined(CONFIG_DEBUG_INFO_BTF) && defined(CONFIG_PAHOLE_HAS_BTF_TAG) && \ - __has_attribute(btf_type_tag) -# define __percpu __attribute__((btf_type_tag("percpu"))) -# else -# define __percpu -# endif +# define __percpu BTF_TYPE_TAG(percpu) # define __rcu # define __chk_user_ptr(x) (void)0 # define __chk_io_ptr(x) (void)0 -- cgit v1.2.3 From 1926407a4ab0e59d5a27bed7b82029b356d80fa0 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Wed, 9 Mar 2022 23:20:33 +0100 Subject: net: openvswitch: fix uAPI incompatibility with existing user space Few years ago OVS user space made a strange choice in the commit [1] to define types only valid for the user space inside the copy of a kernel uAPI header. '#ifndef __KERNEL__' and another attribute was added later. This leads to the inevitable clash between user space and kernel types when the kernel uAPI is extended. The issue was unveiled with the addition of a new type for IPv6 extension header in kernel uAPI. When kernel provides the OVS_KEY_ATTR_IPV6_EXTHDRS attribute to the older user space application, application tries to parse it as OVS_KEY_ATTR_PACKET_TYPE and discards the whole netlink message as malformed. Since OVS_KEY_ATTR_IPV6_EXTHDRS is supplied along with every IPv6 packet that goes to the user space, IPv6 support is fully broken. Fixing that by bringing these user space attributes to the kernel uAPI to avoid the clash. Strictly speaking this is not the problem of the kernel uAPI, but changing it is the only way to avoid breakage of the older user space applications at this point. These 2 types are explicitly rejected now since they should not be passed to the kernel. Additionally, OVS_KEY_ATTR_TUNNEL_INFO moved out from the '#ifdef __KERNEL__' as there is no good reason to hide it from the userspace. And it's also explicitly rejected now, because it's for in-kernel use only. Comments with warnings were added to avoid the problem coming back. (1 << type) converted to (1ULL << type) to avoid integer overflow on OVS_KEY_ATTR_IPV6_EXTHDRS, since it equals 32 now. [1] beb75a40fdc2 ("userspace: Switching of L3 packets in L2 pipeline") Fixes: 28a3f0601727 ("net: openvswitch: IPv6: Add IPv6 extension header support") Link: https://lore.kernel.org/netdev/3adf00c7-fe65-3ef4-b6d7-6d8a0cad8a5f@nvidia.com Link: https://github.com/openvswitch/ovs/commit/beb75a40fdc295bfd6521b0068b4cd12f6de507c Reported-by: Roi Dayan Signed-off-by: Ilya Maximets Acked-by: Nicolas Dichtel Acked-by: Aaron Conole Link: https://lore.kernel.org/r/20220309222033.3018976-1-i.maximets@ovn.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/openvswitch.h | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 9d1710f20505..ce3e1738d427 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -351,11 +351,21 @@ enum ovs_key_attr { OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4, /* struct ovs_key_ct_tuple_ipv4 */ OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6, /* struct ovs_key_ct_tuple_ipv6 */ OVS_KEY_ATTR_NSH, /* Nested set of ovs_nsh_key_* */ - OVS_KEY_ATTR_IPV6_EXTHDRS, /* struct ovs_key_ipv6_exthdr */ -#ifdef __KERNEL__ - OVS_KEY_ATTR_TUNNEL_INFO, /* struct ip_tunnel_info */ -#endif + /* User space decided to squat on types 29 and 30. They are defined + * below, but should not be sent to the kernel. + * + * WARNING: No new types should be added unless they are defined + * for both kernel and user space (no 'ifdef's). It's hard + * to keep compatibility otherwise. + */ + OVS_KEY_ATTR_PACKET_TYPE, /* be32 packet type */ + OVS_KEY_ATTR_ND_EXTENSIONS, /* IPv6 Neighbor Discovery extensions */ + + OVS_KEY_ATTR_TUNNEL_INFO, /* struct ip_tunnel_info. + * For in-kernel use only. + */ + OVS_KEY_ATTR_IPV6_EXTHDRS, /* struct ovs_key_ipv6_exthdr */ __OVS_KEY_ATTR_MAX }; -- cgit v1.2.3 From 271907ee2f29cd1078fd219f0778fd824fb1971c Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Mon, 17 Jan 2022 15:14:44 +0200 Subject: net/mlx5: Query the maximum MCIA register read size from firmware The MCIA register supports either 12 or 32 dwords, use the correct value by querying the capability from the MCAM register. Signed-off-by: Gal Pressman Reviewed-by: Maxim Mikityanskiy Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 4 +++- include/linux/mlx5/port.h | 1 - 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 318fae4b3560..745107ff681d 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -9691,7 +9691,9 @@ struct mlx5_ifc_pcam_reg_bits { }; struct mlx5_ifc_mcam_enhanced_features_bits { - u8 reserved_at_0[0x6a]; + u8 reserved_at_0[0x5d]; + u8 mcia_32dwords[0x1]; + u8 reserved_at_5e[0xc]; u8 reset_state[0x1]; u8 ptpcyc2realtime_modify[0x1]; u8 reserved_at_6c[0x2]; diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h index 77ea4f9c5265..402413b3e914 100644 --- a/include/linux/mlx5/port.h +++ b/include/linux/mlx5/port.h @@ -56,7 +56,6 @@ enum mlx5_an_status { MLX5_AN_LINK_DOWN = 4, }; -#define MLX5_EEPROM_MAX_BYTES 32 #define MLX5_EEPROM_IDENTIFIER_BYTE_MASK 0x000000ff #define MLX5_I2C_ADDR_LOW 0x50 #define MLX5_I2C_ADDR_HIGH 0x51 -- cgit v1.2.3 From fcb610a86c53dfcfbb2aa62e704481112752f367 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Mon, 17 Jan 2022 15:53:06 +0200 Subject: net/mlx5: Parse module mapping using mlx5_ifc The assumption that the first byte in the module mapping dword is the module number shouldn't be hard-coded in the driver, but come from mlx5_ifc structs. While at it, fix the incorrect width for the 'rx_lane' and 'tx_lane' fields. Signed-off-by: Gal Pressman Reviewed-by: Maxim Mikityanskiy Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 8 ++++---- include/linux/mlx5/port.h | 1 - 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 745107ff681d..91b7f730ed91 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -9888,10 +9888,10 @@ struct mlx5_ifc_pcmr_reg_bits { }; struct mlx5_ifc_lane_2_module_mapping_bits { - u8 reserved_at_0[0x6]; - u8 rx_lane[0x2]; - u8 reserved_at_8[0x6]; - u8 tx_lane[0x2]; + u8 reserved_at_0[0x4]; + u8 rx_lane[0x4]; + u8 reserved_at_8[0x4]; + u8 tx_lane[0x4]; u8 reserved_at_10[0x8]; u8 module[0x8]; }; diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h index 402413b3e914..28a928b0684b 100644 --- a/include/linux/mlx5/port.h +++ b/include/linux/mlx5/port.h @@ -56,7 +56,6 @@ enum mlx5_an_status { MLX5_AN_LINK_DOWN = 4, }; -#define MLX5_EEPROM_IDENTIFIER_BYTE_MASK 0x000000ff #define MLX5_I2C_ADDR_LOW 0x50 #define MLX5_I2C_ADDR_HIGH 0x51 #define MLX5_EEPROM_PAGE_LENGTH 256 -- cgit v1.2.3 From 2916b7a9c7c25ecf9be2f37e567a277e861f8e3f Mon Sep 17 00:00:00 2001 From: Veerendranath Jakkam Date: Tue, 22 Feb 2022 20:36:39 +0530 Subject: nl80211: fix typo of NL80211_IF_TYPE_OCB in documentation It should be NL80211_IFTYPE_OCB instead. Signed-off-by: Veerendranath Jakkam Link: https://lore.kernel.org/r/1645542399-4680-1-git-send-email-quic_vjakkam@quicinc.com Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 98ed52663d6b..0568a79097b8 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3259,7 +3259,7 @@ enum nl80211_attrs { * and therefore can't be created in the normal ways, use the * %NL80211_CMD_START_P2P_DEVICE and %NL80211_CMD_STOP_P2P_DEVICE * commands to create and destroy one - * @NL80211_IF_TYPE_OCB: Outside Context of a BSS + * @NL80211_IFTYPE_OCB: Outside Context of a BSS * This mode corresponds to the MIB variable dot11OCBActivated=true * @NL80211_IFTYPE_NAN: NAN device interface type (not a netdev) * @NL80211_IFTYPE_MAX: highest interface type number currently defined -- cgit v1.2.3 From 3af722cb735d212554027ec81e2aa2e6bf1ee34d Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 8 Mar 2022 17:12:10 +0100 Subject: powerpc/net: Implement powerpc specific csum_shift() to remove branch Today's implementation of csum_shift() leads to branching based on parity of 'offset' 000002f8 : 2f8: 70 a5 00 01 andi. r5,r5,1 2fc: 41 a2 00 08 beq 304 300: 54 84 c0 3e rotlwi r4,r4,24 304: 7c 63 20 14 addc r3,r3,r4 308: 7c 63 01 94 addze r3,r3 30c: 4e 80 00 20 blr Use first bit of 'offset' directly as input of the rotation instead of branching. 000002f8 : 2f8: 54 a5 1f 38 rlwinm r5,r5,3,28,28 2fc: 20 a5 00 20 subfic r5,r5,32 300: 5c 84 28 3e rotlw r4,r4,r5 304: 7c 63 20 14 addc r3,r3,r4 308: 7c 63 01 94 addze r3,r3 30c: 4e 80 00 20 blr And change to left shift instead of right shift to skip one more instruction. This has no impact on the final sum. 000002f8 : 2f8: 54 a5 1f 38 rlwinm r5,r5,3,28,28 2fc: 5c 84 28 3e rotlw r4,r4,r5 300: 7c 63 20 14 addc r3,r3,r4 304: 7c 63 01 94 addze r3,r3 308: 4e 80 00 20 blr Seems like only powerpc benefits from a branchless implementation. Other main architectures like ARM or X86 get better code with the generic implementation and its branch. Signed-off-by: Christophe Leroy Signed-off-by: David S. Miller --- include/net/checksum.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/checksum.h b/include/net/checksum.h index 79c67f14c448..6bc783b7a06c 100644 --- a/include/net/checksum.h +++ b/include/net/checksum.h @@ -80,6 +80,7 @@ static __always_inline __sum16 csum16_sub(__sum16 csum, __be16 addend) return csum16_add(csum, ~addend); } +#ifndef HAVE_ARCH_CSUM_SHIFT static __always_inline __wsum csum_shift(__wsum sum, int offset) { /* rotate sum to align it with a 16b boundary */ @@ -87,6 +88,7 @@ static __always_inline __wsum csum_shift(__wsum sum, int offset) return (__force __wsum)ror32((__force u32)sum, 8); return sum; } +#endif static __always_inline __wsum csum_block_add(__wsum csum, __wsum csum2, int offset) -- cgit v1.2.3 From 7d352ccf1e9935b5222ca84e8baeb07a0c8f94b9 Mon Sep 17 00:00:00 2001 From: Youghandhar Chintala Date: Tue, 8 Mar 2022 17:23:24 +0530 Subject: mac80211: Add support to trigger sta disconnect on hardware restart Currently in case of target hardware restart, we just reconfig and re-enable the security keys and enable the network queues to start data traffic back from where it was interrupted. Many ath10k wifi chipsets have sequence numbers for the data packets assigned by firmware and the mac sequence number will restart from zero after target hardware restart leading to mismatch in the sequence number expected by the remote peer vs the sequence number of the frame sent by the target firmware. This mismatch in sequence number will cause out-of-order packets on the remote peer and all the frames sent by the device are dropped until we reach the sequence number which was sent before we restarted the target hardware In order to fix this, we trigger a sta disconnect, in case of target hw restart. After this there will be a fresh connection and thereby avoiding the dropping of frames by remote peer. The right fix would be to pull the entire data path into the host which is not feasible or would need lots of complex changes and will still be inefficient. Tested on ath10k using WCN3990, QCA6174 Signed-off-by: Youghandhar Chintala Link: https://lore.kernel.org/r/20220308115325.5246-2-youghand@codeaurora.org Signed-off-by: Johannes Berg --- include/net/mac80211.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index f118b8fe667a..b8e8c82b53aa 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -6070,6 +6070,16 @@ void ieee80211_disconnect(struct ieee80211_vif *vif, bool reconnect); */ void ieee80211_resume_disconnect(struct ieee80211_vif *vif); +/** + * ieee80211_hw_restart_disconnect - disconnect from AP after + * hardware restart + * @vif: &struct ieee80211_vif pointer from the add_interface callback. + * + * Instructs mac80211 to disconnect from the AP after + * hardware restart. + */ +void ieee80211_hw_restart_disconnect(struct ieee80211_vif *vif); + /** * ieee80211_cqm_rssi_notify - inform a configured connection quality monitoring * rssi threshold triggered -- cgit v1.2.3 From b20dc3c684580ddc07eb48ee3c3dc7597cd5eebf Mon Sep 17 00:00:00 2001 From: Wojciech Drewek Date: Fri, 4 Mar 2022 17:40:42 +0100 Subject: gtp: Allow to create GTP device without FDs Currently, when the user wants to create GTP device, he has to provide file handles to the sockets created in userspace (IFLA_GTP_FD0, IFLA_GTP_FD1). This behaviour is not ideal, considering the option of adding support for GTP device creation through ip link. Ip link application is not a good place to create such sockets. This patch allows to create GTP device without providing IFLA_GTP_FD0 and IFLA_GTP_FD1 arguments. If the user sets IFLA_GTP_CREATE_SOCKETS attribute, then GTP module takes care of creating UDP sockets by itself. Sockets are created with the commonly known UDP ports used for GTP protocol (GTP0_PORT and GTP1U_PORT). In this case we don't have to provide encap_destroy because no extra deinitialization is needed, everything is covered by udp_tunnel_sock_release. Note: GTP instance created with only this change applied, does not handle GTP Echo Requests. This is implemented in the following patch. Signed-off-by: Wojciech Drewek Signed-off-by: Tony Nguyen --- include/uapi/linux/if_link.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index ddca20357e7e..ebd2aa3ef809 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -887,6 +887,7 @@ enum { IFLA_GTP_FD1, IFLA_GTP_PDP_HASHSIZE, IFLA_GTP_ROLE, + IFLA_GTP_CREATE_SOCKETS, __IFLA_GTP_MAX, }; #define IFLA_GTP_MAX (__IFLA_GTP_MAX - 1) -- cgit v1.2.3 From 9af41cc33471ea1efa6f77e188f055cc77d0a5c5 Mon Sep 17 00:00:00 2001 From: Wojciech Drewek Date: Fri, 4 Mar 2022 17:40:43 +0100 Subject: gtp: Implement GTP echo response Adding GTP device through ip link creates the situation where there is no userspace daemon which would handle GTP messages (Echo Request for example). GTP-U instance which would not respond to echo requests would violate GTP specification. When GTP packet arrives with GTP_ECHO_REQ message type, GTP_ECHO_RSP is send to the sender. GTP_ECHO_RSP message should contain information element with GTPIE_RECOVERY tag and restart counter value. For GTPv1 restart counter is not used and should be equal to 0, for GTPv0 restart counter contains information provided from userspace(IFLA_GTP_RESTART_COUNT). Signed-off-by: Wojciech Drewek Suggested-by: Harald Welte Reviewed-by: Harald Welte Tested-by: Harald Welte Signed-off-by: Tony Nguyen --- include/net/gtp.h | 31 +++++++++++++++++++++++++++++++ include/uapi/linux/if_link.h | 1 + 2 files changed, 32 insertions(+) (limited to 'include') diff --git a/include/net/gtp.h b/include/net/gtp.h index 0e16ebb2a82d..0e12c37f2958 100644 --- a/include/net/gtp.h +++ b/include/net/gtp.h @@ -7,8 +7,13 @@ #define GTP0_PORT 3386 #define GTP1U_PORT 2152 +/* GTP messages types */ +#define GTP_ECHO_REQ 1 /* Echo Request */ +#define GTP_ECHO_RSP 2 /* Echo Response */ #define GTP_TPDU 255 +#define GTPIE_RECOVERY 14 + struct gtp0_header { /* According to GSM TS 09.60. */ __u8 flags; __u8 type; @@ -27,6 +32,32 @@ struct gtp1_header { /* According to 3GPP TS 29.060. */ __be32 tid; } __attribute__ ((packed)); +struct gtp1_header_long { /* According to 3GPP TS 29.060. */ + __u8 flags; + __u8 type; + __be16 length; + __be32 tid; + __be16 seq; + __u8 npdu; + __u8 next; +} __packed; + +/* GTP Information Element */ +struct gtp_ie { + __u8 tag; + __u8 val; +} __packed; + +struct gtp0_packet { + struct gtp0_header gtp0_h; + struct gtp_ie ie; +} __packed; + +struct gtp1u_packet { + struct gtp1_header_long gtp1u_h; + struct gtp_ie ie; +} __packed; + #define GTP1_F_NPDU 0x01 #define GTP1_F_SEQ 0x02 #define GTP1_F_EXTHDR 0x04 diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index ebd2aa3ef809..bd24c7dc10a2 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -888,6 +888,7 @@ enum { IFLA_GTP_PDP_HASHSIZE, IFLA_GTP_ROLE, IFLA_GTP_CREATE_SOCKETS, + IFLA_GTP_RESTART_COUNT, __IFLA_GTP_MAX, }; #define IFLA_GTP_MAX (__IFLA_GTP_MAX - 1) -- cgit v1.2.3 From d33bd757d362699cfce3c68b53cd12b947d196f4 Mon Sep 17 00:00:00 2001 From: Wojciech Drewek Date: Fri, 4 Mar 2022 17:40:44 +0100 Subject: gtp: Implement GTP echo request Adding GTP device through ip link creates the situation where GTP instance is not able to send GTP echo requests. Echo requests are used to check if GTP peer is still alive. With this patch, gtp_genl_ops are extended by new cmd (GTP_CMD_ECHOREQ) which allows to send echo request in the given version of GTP protocol (v0 or v1), from the given ms address to he given peer. TID is not inclued because in all path management messages it should be equal to 0. When GTP echo response is detected, multicast message is send to everyone in the gtp_genl_family. Message contains GTP version, ms address and peer address. Suggested-by: Harald Welte Signed-off-by: Wojciech Drewek Reviewed-by: Harald Welte Signed-off-by: Tony Nguyen --- include/uapi/linux/gtp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/gtp.h b/include/uapi/linux/gtp.h index 79f9191bbb24..2f61298a7b77 100644 --- a/include/uapi/linux/gtp.h +++ b/include/uapi/linux/gtp.h @@ -8,6 +8,7 @@ enum gtp_genl_cmds { GTP_CMD_NEWPDP, GTP_CMD_DELPDP, GTP_CMD_GETPDP, + GTP_CMD_ECHOREQ, GTP_CMD_MAX, }; -- cgit v1.2.3 From e3acda7ade0a36c5cbebc2b54d30b7f08a4ba29b Mon Sep 17 00:00:00 2001 From: Wojciech Drewek Date: Fri, 4 Mar 2022 17:40:45 +0100 Subject: net/sched: Allow flower to match on GTP options Options are as follows: PDU_TYPE:QFI and they refernce to the fields from the PDU Session Protocol. PDU Session data is conveyed in GTP-U Extension Header. GTP-U Extension Header is described in 3GPP TS 29.281. PDU Session Protocol is described in 3GPP TS 38.415. PDU_TYPE - indicates the type of the PDU Session Information (4 bits) QFI - QoS Flow Identifier (6 bits) # ip link add gtp_dev type gtp role sgsn # tc qdisc add dev gtp_dev ingress # tc filter add dev gtp_dev protocol ip parent ffff: \ flower \ enc_key_id 11 \ gtp_opts 1:8/ff:ff \ action mirred egress redirect dev eth0 Signed-off-by: Wojciech Drewek Signed-off-by: Tony Nguyen --- include/net/gtp.h | 5 +++++ include/uapi/linux/if_tunnel.h | 4 +++- include/uapi/linux/pkt_cls.h | 15 +++++++++++++++ 3 files changed, 23 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/gtp.h b/include/net/gtp.h index 0e12c37f2958..23c2aaae8a42 100644 --- a/include/net/gtp.h +++ b/include/net/gtp.h @@ -58,6 +58,11 @@ struct gtp1u_packet { struct gtp_ie ie; } __packed; +struct gtp_pdu_session_info { /* According to 3GPP TS 38.415. */ + u8 pdu_type; + u8 qfi; +}; + #define GTP1_F_NPDU 0x01 #define GTP1_F_SEQ 0x02 #define GTP1_F_EXTHDR 0x04 diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h index 7d9105533c7b..102119628ff5 100644 --- a/include/uapi/linux/if_tunnel.h +++ b/include/uapi/linux/if_tunnel.h @@ -176,8 +176,10 @@ enum { #define TUNNEL_VXLAN_OPT __cpu_to_be16(0x1000) #define TUNNEL_NOCACHE __cpu_to_be16(0x2000) #define TUNNEL_ERSPAN_OPT __cpu_to_be16(0x4000) +#define TUNNEL_GTP_OPT __cpu_to_be16(0x8000) #define TUNNEL_OPTIONS_PRESENT \ - (TUNNEL_GENEVE_OPT | TUNNEL_VXLAN_OPT | TUNNEL_ERSPAN_OPT) + (TUNNEL_GENEVE_OPT | TUNNEL_VXLAN_OPT | TUNNEL_ERSPAN_OPT | \ + TUNNEL_GTP_OPT) #endif /* _UAPI_IF_TUNNEL_H_ */ diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index ee38b35c3f57..404f97fb239c 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -616,6 +616,10 @@ enum { * TCA_FLOWER_KEY_ENC_OPT_ERSPAN_ * attributes */ + TCA_FLOWER_KEY_ENC_OPTS_GTP, /* Nested + * TCA_FLOWER_KEY_ENC_OPT_GTP_ + * attributes + */ __TCA_FLOWER_KEY_ENC_OPTS_MAX, }; @@ -654,6 +658,17 @@ enum { #define TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX \ (__TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX - 1) +enum { + TCA_FLOWER_KEY_ENC_OPT_GTP_UNSPEC, + TCA_FLOWER_KEY_ENC_OPT_GTP_PDU_TYPE, /* u8 */ + TCA_FLOWER_KEY_ENC_OPT_GTP_QFI, /* u8 */ + + __TCA_FLOWER_KEY_ENC_OPT_GTP_MAX, +}; + +#define TCA_FLOWER_KEY_ENC_OPT_GTP_MAX \ + (__TCA_FLOWER_KEY_ENC_OPT_GTP_MAX - 1) + enum { TCA_FLOWER_KEY_MPLS_OPTS_UNSPEC, TCA_FLOWER_KEY_MPLS_OPTS_LSE, -- cgit v1.2.3 From 81dd9849fa4911f76a14f354a048865894b9751e Mon Sep 17 00:00:00 2001 From: Wojciech Drewek Date: Fri, 4 Mar 2022 17:40:46 +0100 Subject: gtp: Add support for checking GTP device type Add a function that checks if a net device type is GTP. Signed-off-by: Wojciech Drewek Reviewed-by: Harald Welte Signed-off-by: Tony Nguyen --- include/net/gtp.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/net/gtp.h b/include/net/gtp.h index 23c2aaae8a42..c1d6169df331 100644 --- a/include/net/gtp.h +++ b/include/net/gtp.h @@ -63,6 +63,12 @@ struct gtp_pdu_session_info { /* According to 3GPP TS 38.415. */ u8 qfi; }; +static inline bool netif_is_gtp(const struct net_device *dev) +{ + return dev->rtnl_link_ops && + !strcmp(dev->rtnl_link_ops->kind, "gtp"); +} + #define GTP1_F_NPDU 0x01 #define GTP1_F_SEQ 0x02 #define GTP1_F_EXTHDR 0x04 -- cgit v1.2.3 From d3826a95222c44a527f76b011bb5af8c924632e9 Mon Sep 17 00:00:00 2001 From: Dirk van der Merwe Date: Fri, 11 Mar 2022 11:43:06 +0100 Subject: nfp: add support for NFP3800/NFP3803 PCIe devices Enable binding the nfp driver to NFP3800 and NFP3803 devices. The PCIE_SRAM offset is different for the NFP3800 device, which also only supports a single explicit group. Changes to Dirk's work: * 48-bit dma addressing is not ready yet. Keep 40-bit dma addressing for NFP3800. Signed-off-by: Dirk van der Merwe Signed-off-by: Jakub Kicinski Signed-off-by: Fei Qin Signed-off-by: Simon Horman Signed-off-by: Jakub Kicinski --- include/linux/pci_ids.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index c7e6f2043c7d..5462c29f0538 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2531,9 +2531,11 @@ #define PCI_VENDOR_ID_HUAWEI 0x19e5 #define PCI_VENDOR_ID_NETRONOME 0x19ee +#define PCI_DEVICE_ID_NETRONOME_NFP3800 0x3800 #define PCI_DEVICE_ID_NETRONOME_NFP4000 0x4000 #define PCI_DEVICE_ID_NETRONOME_NFP5000 0x5000 #define PCI_DEVICE_ID_NETRONOME_NFP6000 0x6000 +#define PCI_DEVICE_ID_NETRONOME_NFP3800_VF 0x3803 #define PCI_DEVICE_ID_NETRONOME_NFP6000_VF 0x6003 #define PCI_VENDOR_ID_QMI 0x1a32 -- cgit v1.2.3 From 625788b5844511cf4c30cffa7fa0bc3a69cebc82 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Mar 2022 21:14:20 -0800 Subject: net: add per-cpu storage and net->core_stats Before adding yet another possibly contended atomic_long_t, it is time to add per-cpu storage for existing ones: dev->tx_dropped, dev->rx_dropped, and dev->rx_nohandler Because many devices do not have to increment such counters, allocate the per-cpu storage on demand, so that dev_get_stats() does not have to spend considerable time folding zero counters. Note that some drivers have abused these counters which were supposed to be only used by core networking stack. v4: should use per_cpu_ptr() in dev_get_stats() (Jakub) v3: added a READ_ONCE() in netdev_core_stats_alloc() (Paolo) v2: add a missing include (reported by kernel test robot ) Change in netdev_core_stats_alloc() (Jakub) Signed-off-by: Eric Dumazet Cc: jeffreyji Reviewed-by: Brian Vazquez Reviewed-by: Jakub Kicinski Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20220311051420.2608812-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 46 +++++++++++++++++++++++++++++++++++++--------- include/net/bonding.h | 2 +- 2 files changed, 38 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index acd3cf69b61f..0d994710b335 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -194,6 +195,14 @@ struct net_device_stats { unsigned long tx_compressed; }; +/* per-cpu stats, allocated on demand. + * Try to fit them in a single cache line, for dev_get_stats() sake. + */ +struct net_device_core_stats { + local_t rx_dropped; + local_t tx_dropped; + local_t rx_nohandler; +} __aligned(4 * sizeof(local_t)); #include #include @@ -1735,12 +1744,8 @@ enum netdev_ml_priv_type { * @stats: Statistics struct, which was left as a legacy, use * rtnl_link_stats64 instead * - * @rx_dropped: Dropped packets by core network, - * do not use this in drivers - * @tx_dropped: Dropped packets by core network, + * @core_stats: core networking counters, * do not use this in drivers - * @rx_nohandler: nohandler dropped packets by core network on - * inactive devices, do not use this in drivers * @carrier_up_count: Number of times the carrier has been up * @carrier_down_count: Number of times the carrier has been down * @@ -2023,9 +2028,7 @@ struct net_device { struct net_device_stats stats; /* not used by modern drivers */ - atomic_long_t rx_dropped; - atomic_long_t tx_dropped; - atomic_long_t rx_nohandler; + struct net_device_core_stats __percpu *core_stats; /* Stats to monitor link on/off, flapping */ atomic_t carrier_up_count; @@ -3839,13 +3842,38 @@ static __always_inline bool __is_skb_forwardable(const struct net_device *dev, return false; } +struct net_device_core_stats *netdev_core_stats_alloc(struct net_device *dev); + +static inline struct net_device_core_stats *dev_core_stats(struct net_device *dev) +{ + /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */ + struct net_device_core_stats __percpu *p = READ_ONCE(dev->core_stats); + + if (likely(p)) + return this_cpu_ptr(p); + + return netdev_core_stats_alloc(dev); +} + +#define DEV_CORE_STATS_INC(FIELD) \ +static inline void dev_core_stats_##FIELD##_inc(struct net_device *dev) \ +{ \ + struct net_device_core_stats *p = dev_core_stats(dev); \ + \ + if (p) \ + local_inc(&p->FIELD); \ +} +DEV_CORE_STATS_INC(rx_dropped) +DEV_CORE_STATS_INC(tx_dropped) +DEV_CORE_STATS_INC(rx_nohandler) + static __always_inline int ____dev_forward_skb(struct net_device *dev, struct sk_buff *skb, const bool check_mtu) { if (skb_orphan_frags(skb, GFP_ATOMIC) || unlikely(!__is_skb_forwardable(dev, skb, check_mtu))) { - atomic_long_inc(&dev->rx_dropped); + dev_core_stats_rx_dropped_inc(dev); kfree_skb(skb); return NET_RX_DROP; } diff --git a/include/net/bonding.h b/include/net/bonding.h index d0dfe727e0b1..b14f4c0b4e9e 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -770,7 +770,7 @@ extern const struct sysfs_ops slave_sysfs_ops; static inline netdev_tx_t bond_tx_drop(struct net_device *dev, struct sk_buff *skb) { - atomic_long_inc(&dev->tx_dropped); + dev_core_stats_tx_dropped_inc(dev); dev_kfree_skb_any(skb); return NET_XMIT_DROP; } -- cgit v1.2.3 From 1f4a5983d623d6dbda4cc7587a2d9d798e0d4035 Mon Sep 17 00:00:00 2001 From: Ziyang Xuan Date: Fri, 11 Mar 2022 17:04:03 +0800 Subject: net: macvlan: add net device refcount tracker Add net device refcount tracker to macvlan. Signed-off-by: Ziyang Xuan Signed-off-by: David S. Miller --- include/linux/if_macvlan.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/if_macvlan.h b/include/linux/if_macvlan.h index 10c94a3936ca..b42294739063 100644 --- a/include/linux/if_macvlan.h +++ b/include/linux/if_macvlan.h @@ -21,6 +21,7 @@ struct macvlan_dev { struct hlist_node hlist; struct macvlan_port *port; struct net_device *lowerdev; + netdevice_tracker dev_tracker; void *accel_priv; struct vlan_pcpu_stats __percpu *pcpu_stats; -- cgit v1.2.3 From fbd9a2ceba5c74bbfa19cf257ae4b4b2c820860d Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 11 Mar 2022 16:03:42 +0100 Subject: net: Add lockdep asserts to ____napi_schedule(). ____napi_schedule() needs to be invoked with disabled interrupts due to __raise_softirq_irqoff (in order not to corrupt the per-CPU list). ____napi_schedule() needs also to be invoked from an interrupt context so that the raised-softirq is processed while the interrupt context is left. Add lockdep asserts for both conditions. While this is the second time the irq/softirq check is needed, provide a generic lockdep_assert_softirq_will_run() which is used by both caller. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: David S. Miller --- include/linux/lockdep.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 467b94257105..0cc65d216701 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -329,6 +329,12 @@ extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); #define lockdep_assert_none_held_once() \ lockdep_assert_once(!current->lockdep_depth) +/* + * Ensure that softirq is handled within the callchain and not delayed and + * handled by chance. + */ +#define lockdep_assert_softirq_will_run() \ + lockdep_assert_once(hardirq_count() | softirq_count()) #define lockdep_recursing(tsk) ((tsk)->lockdep_recursion) @@ -414,6 +420,7 @@ extern int lockdep_is_held(const void *); #define lockdep_assert_held_read(l) do { (void)(l); } while (0) #define lockdep_assert_held_once(l) do { (void)(l); } while (0) #define lockdep_assert_none_held_once() do { } while (0) +#define lockdep_assert_softirq_will_run() do { } while (0) #define lockdep_recursing(tsk) (0) -- cgit v1.2.3 From d538eca85c2aa6ecb5036e6ff47efc93d9f294da Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 11 Mar 2022 23:15:18 +0200 Subject: net: dsa: report and change port default priority using dcbnl The port-based default QoS class is assigned to packets that lack a VLAN PCP (or the port is configured to not trust the VLAN PCP), an IP DSCP (or the port is configured to not trust IP DSCP), and packets on which no tc-skbedit action has matched. Similar to other drivers, this can be exposed to user space using the DCB Application Priority Table. IEEE 802.1Q-2018 specifies in Table D-8 - Sel field values that when the Selector is 1, the Protocol ID value of 0 denotes the "Default application priority. For use when application priority is not otherwise specified." The way in which the dcbnl integration in DSA has been designed has to do with its requirements. Andrew Lunn explains that SOHO switches are expected to come with some sort of pre-configured QoS profile, and that it is desirable for this to come pre-loaded into the DSA slave interfaces' DCB application priority table. In the dcbnl design, this is possible because calls to dcb_ieee_setapp() can be initiated by anyone including being self-initiated by this device driver. However, what makes this challenging to implement in DSA is that the DSA core manages the net_devices (effectively hiding them from drivers), while drivers manage the hardware. The DSA core has no knowledge of what individual drivers' QoS policies are. DSA could export to drivers a wrapper over dcb_ieee_setapp() and these could call that function to pre-populate the app priority table, however drivers don't have a good moment in time to do this. The dsa_switch_ops :: setup() method gets called before the net_devices are created (dsa_slave_create), and so is dsa_switch_ops :: port_setup(). What remains is dsa_switch_ops :: port_enable(), but this gets called upon each ndo_open. If we add app table entries on every open, we'd need to remove them on close, to avoid duplicate entry errors. But if we delete app priority entries on close, what we delete may not be the initial, driver pre-populated entries, but rather user-added entries. So it is clear that letting drivers choose the timing of the dcb_ieee_setapp() call is inappropriate. The alternative which was chosen is to introduce hardware-specific ops in dsa_switch_ops, and effectively hide dcbnl details from drivers as well. For pre-populating the application table, dsa_slave_dcbnl_init() will call ds->ops->port_get_default_prio() which is supposed to read from hardware. If the operation succeeds, DSA creates a default-prio app table entry. The method is called as soon as the slave_dev is registered, but before we release the rtnl_mutex. This is done such that user space sees the app table entries as soon as it sees the interface being registered. The fact that we populate slave_dev->dcbnl_ops with a non-NULL pointer changes behavior in dcb_doit() from net/dcb/dcbnl.c, which used to return -EOPNOTSUPP for any dcbnl operation where netdev->dcbnl_ops is NULL. Because there are still dcbnl-unaware DSA drivers even if they have dcbnl_ops populated, the way to restore the behavior is to make all dcbnl_ops return -EOPNOTSUPP on absence of the hardware-specific dsa_switch_ops method. The dcbnl framework absurdly allows there to be more than one app table entry for the same selector and protocol (in other words, more than one port-based default priority). In the iproute2 dcb program, there is a "replace" syntactical sugar command which performs an "add" and a "del" to hide this away. But we choose the largest configured priority when we call ds->ops->port_set_default_prio(), using __fls(). When there is no default-prio app table entry left, the port-default priority is restored to 0. Link: https://patchwork.kernel.org/project/netdevbpf/patch/20210113154139.1803705-2-olteanv@gmail.com/ Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/net/dsa.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 9d16505fc0e2..1220af73151b 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -892,6 +892,13 @@ struct dsa_switch_ops { int (*get_ts_info)(struct dsa_switch *ds, int port, struct ethtool_ts_info *ts); + /* + * DCB ops + */ + int (*port_get_default_prio)(struct dsa_switch *ds, int port); + int (*port_set_default_prio)(struct dsa_switch *ds, int port, + u8 prio); + /* * Suspend and resume */ -- cgit v1.2.3 From 47d75f7822064d024ec9207c0fc1777f983783b7 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 11 Mar 2022 23:15:19 +0200 Subject: net: dsa: report and change port dscp priority using dcbnl Similar to the port-based default priority, IEEE 802.1Q-2018 allows the Application Priority Table to define QoS classes (0 to 7) per IP DSCP value (0 to 63). In the absence of an app table entry for a packet with DSCP value X, QoS classification for that packet falls back to other methods (VLAN PCP or port-based default). The presence of an app table for DSCP value X with priority Y makes the hardware classify the packet to QoS class Y. As opposed to the default-prio where DSA exposes only a "set" in dsa_switch_ops (because the port-based default is the fallback, it always exists, either implicitly or explicitly), for DSCP priorities we expose an "add" and a "del". The addition of a DSCP entry means trusting that DSCP priority, the deletion means ignoring it. Drivers that already trust (at least some) DSCP values can describe their configuration in dsa_switch_ops :: port_get_dscp_prio(), which is called for each DSCP value from 0 to 63. Again, there can be more than one dcbnl app table entry for the same DSCP value, DSA chooses the one with the largest configured priority. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/net/dsa.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 1220af73151b..9bfe984fcdbf 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -898,6 +898,11 @@ struct dsa_switch_ops { int (*port_get_default_prio)(struct dsa_switch *ds, int port); int (*port_set_default_prio)(struct dsa_switch *ds, int port, u8 prio); + int (*port_get_dscp_prio)(struct dsa_switch *ds, int port, u8 dscp); + int (*port_add_dscp_prio)(struct dsa_switch *ds, int port, u8 dscp, + u8 prio); + int (*port_del_dscp_prio)(struct dsa_switch *ds, int port, u8 dscp, + u8 prio); /* * Suspend and resume -- cgit v1.2.3 From 978777d0fb06663523281aa50a5040c3aa31fbe7 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 11 Mar 2022 23:15:20 +0200 Subject: net: dsa: felix: configure default-prio and dscp priorities Follow the established programming model for this driver and provide shims in the felix DSA driver which call the implementations from the ocelot switch lib. The ocelot switchdev driver wasn't integrated with dcbnl due to lack of hardware availability. The switch doesn't have any fancy QoS classification enabled by default. The provided getters will create a default-prio app table entry of 0, and no dscp entry. However, the getters have been made to actually retrieve the hardware configuration rather than static values, to be future proof in case DSA will need this information from more call paths. For default-prio, there is a single field per port, in ANA_PORT_QOS_CFG, called QOS_DEFAULT_VAL. DSCP classification is enabled per-port, again via ANA_PORT_QOS_CFG (field QOS_DSCP_ENA), and individual DSCP values are configured as trusted or not through register ANA_DSCP_CFG (replicated 64 times). An untrusted DSCP value falls back to other QoS classification methods. If trusted, the selected ANA_DSCP_CFG register also holds the QoS class in the QOS_DSCP_VAL field. The hardware also supports DSCP remapping (DSCP value X is translated to DSCP value Y before the QoS class is determined based on the app table entry for Y) and DSCP packet rewriting. The dcbnl framework, for being so flexible in other useless areas, doesn't appear to support this. So this functionality has been left out. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/soc/mscc/ocelot.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index ee3c59639d70..4d51e2a7120f 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -869,6 +869,11 @@ int ocelot_port_pre_bridge_flags(struct ocelot *ocelot, int port, struct switchdev_brport_flags val); void ocelot_port_bridge_flags(struct ocelot *ocelot, int port, struct switchdev_brport_flags val); +int ocelot_port_get_default_prio(struct ocelot *ocelot, int port); +int ocelot_port_set_default_prio(struct ocelot *ocelot, int port, u8 prio); +int ocelot_port_get_dscp_prio(struct ocelot *ocelot, int port, u8 dscp); +int ocelot_port_add_dscp_prio(struct ocelot *ocelot, int port, u8 dscp, u8 prio); +int ocelot_port_del_dscp_prio(struct ocelot *ocelot, int port, u8 dscp, u8 prio); int ocelot_port_bridge_join(struct ocelot *ocelot, int port, struct net_device *bridge, int bridge_num, struct netlink_ext_ack *extack); -- cgit v1.2.3 From fc93db153b0187a9047f00bfd5cce75108530593 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 12 Mar 2022 13:45:05 -0800 Subject: net: disable preemption in dev_core_stats_XXX_inc() helpers syzbot was kind enough to remind us that dev->{tx_dropped|rx_dropped} could be increased in process context. BUG: using smp_processor_id() in preemptible [00000000] code: syz-executor413/3593 caller is netdev_core_stats_alloc+0x98/0x110 net/core/dev.c:10298 CPU: 1 PID: 3593 Comm: syz-executor413 Not tainted 5.17.0-rc7-syzkaller-02426-g97aeb877de7f #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 check_preemption_disabled+0x16b/0x170 lib/smp_processor_id.c:49 netdev_core_stats_alloc+0x98/0x110 net/core/dev.c:10298 dev_core_stats include/linux/netdevice.h:3855 [inline] dev_core_stats_rx_dropped_inc include/linux/netdevice.h:3866 [inline] tun_get_user+0x3455/0x3ab0 drivers/net/tun.c:1800 tun_chr_write_iter+0xe1/0x200 drivers/net/tun.c:2015 call_write_iter include/linux/fs.h:2074 [inline] new_sync_write+0x431/0x660 fs/read_write.c:503 vfs_write+0x7cd/0xae0 fs/read_write.c:590 ksys_write+0x12d/0x250 fs/read_write.c:643 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f2cf4f887e3 Code: 5d 41 5c 41 5d 41 5e e9 9b fd ff ff 66 2e 0f 1f 84 00 00 00 00 00 90 64 8b 04 25 18 00 00 00 85 c0 75 14 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 55 c3 0f 1f 40 00 48 83 ec 28 48 89 54 24 18 RSP: 002b:00007ffd50dd5fd8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 00007ffd50dd6000 RCX: 00007f2cf4f887e3 RDX: 000000000000002a RSI: 0000000000000000 RDI: 00000000000000c8 RBP: 0000000000000003 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 00007ffd50dd5ff0 R14: 00007ffd50dd5fe8 R15: 00007ffd50dd5fe4 Fixes: 625788b58445 ("net: add per-cpu storage and net->core_stats") Signed-off-by: Eric Dumazet Cc: jeffreyji Cc: Brian Vazquez Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20220312214505.3294762-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0d994710b335..8cbe96ce0a2c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3858,10 +3858,14 @@ static inline struct net_device_core_stats *dev_core_stats(struct net_device *de #define DEV_CORE_STATS_INC(FIELD) \ static inline void dev_core_stats_##FIELD##_inc(struct net_device *dev) \ { \ - struct net_device_core_stats *p = dev_core_stats(dev); \ + struct net_device_core_stats *p; \ + \ + preempt_disable(); \ + p = dev_core_stats(dev); \ \ if (p) \ local_inc(&p->FIELD); \ + preempt_enable(); \ } DEV_CORE_STATS_INC(rx_dropped) DEV_CORE_STATS_INC(tx_dropped) -- cgit v1.2.3 From 2b3171c6fe0af24b5506e061525e08917a2f744a Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 24 Feb 2022 12:54:58 +0100 Subject: mac80211: MBSSID beacon handling in AP mode Add new fields in struct beacon_data to store all MBSSID elements. Generate a beacon template which includes all MBSSID elements. Move CSA offset to reflect the MBSSID element length. Co-developed-by: Aloka Dixit Signed-off-by: Aloka Dixit Co-developed-by: John Crispin Signed-off-by: John Crispin Signed-off-by: Lorenzo Bianconi Tested-by: Money Wang Link: https://lore.kernel.org/r/5322db3c303f431adaf191ab31c45e151dde5465.1645702516.git.lorenzo@kernel.org [small cleanups] Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index b8e8c82b53aa..382ebb862ea8 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -4948,12 +4948,14 @@ void ieee80211_report_low_ack(struct ieee80211_sta *sta, u32 num_packets); * @cntdwn_counter_offs: array of IEEE80211_MAX_CNTDWN_COUNTERS_NUM offsets * to countdown counters. This array can contain zero values which * should be ignored. + * @mbssid_off: position of the multiple bssid element */ struct ieee80211_mutable_offsets { u16 tim_offset; u16 tim_length; u16 cntdwn_counter_offs[IEEE80211_MAX_CNTDWN_COUNTERS_NUM]; + u16 mbssid_off; }; /** -- cgit v1.2.3 From 938d3480b92fa5e454b7734294f12a7b75126f09 Mon Sep 17 00:00:00 2001 From: Wang Yufen Date: Fri, 4 Mar 2022 16:11:42 +0800 Subject: bpf, sockmap: Fix memleak in sk_psock_queue_msg If tcp_bpf_sendmsg is running during a tear down operation we may enqueue data on the ingress msg queue while tear down is trying to free it. sk1 (redirect sk2) sk2 ------------------- --------------- tcp_bpf_sendmsg() tcp_bpf_send_verdict() tcp_bpf_sendmsg_redir() bpf_tcp_ingress() sock_map_close() lock_sock() lock_sock() ... blocking sk_psock_stop sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); release_sock(sk); lock_sock() sk_mem_charge() get_page() sk_psock_queue_msg() sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED); drop_sk_msg() release_sock() While drop_sk_msg(), the msg has charged memory form sk by sk_mem_charge and has sg pages need to put. To fix we use sk_msg_free() and then kfee() msg. This issue can cause the following info: WARNING: CPU: 0 PID: 9202 at net/core/stream.c:205 sk_stream_kill_queues+0xc8/0xe0 Call Trace: inet_csk_destroy_sock+0x55/0x110 tcp_rcv_state_process+0xe5f/0xe90 ? sk_filter_trim_cap+0x10d/0x230 ? tcp_v4_do_rcv+0x161/0x250 tcp_v4_do_rcv+0x161/0x250 tcp_v4_rcv+0xc3a/0xce0 ip_protocol_deliver_rcu+0x3d/0x230 ip_local_deliver_finish+0x54/0x60 ip_local_deliver+0xfd/0x110 ? ip_protocol_deliver_rcu+0x230/0x230 ip_rcv+0xd6/0x100 ? ip_local_deliver+0x110/0x110 __netif_receive_skb_one_core+0x85/0xa0 process_backlog+0xa4/0x160 __napi_poll+0x29/0x1b0 net_rx_action+0x287/0x300 __do_softirq+0xff/0x2fc do_softirq+0x79/0x90 WARNING: CPU: 0 PID: 531 at net/ipv4/af_inet.c:154 inet_sock_destruct+0x175/0x1b0 Call Trace: __sk_destruct+0x24/0x1f0 sk_psock_destroy+0x19b/0x1c0 process_one_work+0x1b3/0x3c0 ? process_one_work+0x3c0/0x3c0 worker_thread+0x30/0x350 ? process_one_work+0x3c0/0x3c0 kthread+0xe6/0x110 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x22/0x30 Fixes: 9635720b7c88 ("bpf, sockmap: Fix memleak on ingress msg enqueue") Signed-off-by: Wang Yufen Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20220304081145.2037182-2-wangyufen@huawei.com --- include/linux/skmsg.h | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index fdb5375f0562..c5a2d6f50f25 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -304,21 +304,16 @@ static inline void sock_drop(struct sock *sk, struct sk_buff *skb) kfree_skb(skb); } -static inline void drop_sk_msg(struct sk_psock *psock, struct sk_msg *msg) -{ - if (msg->skb) - sock_drop(psock->sk, msg->skb); - kfree(msg); -} - static inline void sk_psock_queue_msg(struct sk_psock *psock, struct sk_msg *msg) { spin_lock_bh(&psock->ingress_lock); if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) list_add_tail(&msg->list, &psock->ingress_msg); - else - drop_sk_msg(psock, msg); + else { + sk_msg_free(psock->sk, msg); + kfree(msg); + } spin_unlock_bh(&psock->ingress_lock); } -- cgit v1.2.3 From 40867d74c374b235e14d839f3a77f26684feefe5 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 14 Mar 2022 14:45:51 -0600 Subject: net: Add l3mdev index to flow struct and avoid oif reset for port devices The fundamental premise of VRF and l3mdev core code is binding a socket to a device (l3mdev or netdev with an L3 domain) to indicate L3 scope. Legacy code resets flowi_oif to the l3mdev losing any original port device binding. Ben (among others) has demonstrated use cases where the original port device binding is important and needs to be retained. This patch handles that by adding a new entry to the common flow struct that can indicate the l3mdev index for later rule and table matching avoiding the need to reset flowi_oif. In addition to allowing more use cases that require port device binds, this patch brings a few datapath simplications: 1. l3mdev_fib_rule_match is only called when walking fib rules and always after l3mdev_update_flow. That allows an optimization to bail early for non-VRF type uses cases when flowi_l3mdev is not set. Also, only that index needs to be checked for the FIB table id. 2. l3mdev_update_flow can be called with flowi_oif set to a l3mdev (e.g., VRF) device. By resetting flowi_oif only for this case the FLOWI_FLAG_SKIP_NH_OIF flag is not longer needed and can be removed, removing several checks in the datapath. The flowi_iif path can be simplified to only be called if the it is not loopback (loopback can not be assigned to an L3 domain) and the l3mdev index is not already set. 3. Avoid another device lookup in the output path when the fib lookup returns a reject failure. Note: 2 functional tests for local traffic with reject fib rules are updated to reflect the new direct failure at FIB lookup time for ping rather than the failure on packet path. The current code fails like this: HINT: Fails since address on vrf device is out of device scope COMMAND: ip netns exec ns-A ping -c1 -w1 -I eth1 172.16.3.1 ping: Warning: source address might be selected on device other than: eth1 PING 172.16.3.1 (172.16.3.1) from 172.16.3.1 eth1: 56(84) bytes of data. --- 172.16.3.1 ping statistics --- 1 packets transmitted, 0 received, 100% packet loss, time 0ms where the test now directly fails: HINT: Fails since address on vrf device is out of device scope COMMAND: ip netns exec ns-A ping -c1 -w1 -I eth1 172.16.3.1 ping: connect: No route to host Signed-off-by: David Ahern Tested-by: Ben Greear Link: https://lore.kernel.org/r/20220314204551.16369-1-dsahern@kernel.org Signed-off-by: Jakub Kicinski --- include/net/flow.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/flow.h b/include/net/flow.h index 58beb16a49b8..987bd511d652 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -29,6 +29,7 @@ struct flowi_tunnel { struct flowi_common { int flowic_oif; int flowic_iif; + int flowic_l3mdev; __u32 flowic_mark; __u8 flowic_tos; __u8 flowic_scope; @@ -36,7 +37,6 @@ struct flowi_common { __u8 flowic_flags; #define FLOWI_FLAG_ANYSRC 0x01 #define FLOWI_FLAG_KNOWN_NH 0x02 -#define FLOWI_FLAG_SKIP_NH_OIF 0x04 __u32 flowic_secid; kuid_t flowic_uid; struct flowi_tunnel flowic_tun_key; @@ -70,6 +70,7 @@ struct flowi4 { struct flowi_common __fl_common; #define flowi4_oif __fl_common.flowic_oif #define flowi4_iif __fl_common.flowic_iif +#define flowi4_l3mdev __fl_common.flowic_l3mdev #define flowi4_mark __fl_common.flowic_mark #define flowi4_tos __fl_common.flowic_tos #define flowi4_scope __fl_common.flowic_scope @@ -102,6 +103,7 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif, { fl4->flowi4_oif = oif; fl4->flowi4_iif = LOOPBACK_IFINDEX; + fl4->flowi4_l3mdev = 0; fl4->flowi4_mark = mark; fl4->flowi4_tos = tos; fl4->flowi4_scope = scope; @@ -132,6 +134,7 @@ struct flowi6 { struct flowi_common __fl_common; #define flowi6_oif __fl_common.flowic_oif #define flowi6_iif __fl_common.flowic_iif +#define flowi6_l3mdev __fl_common.flowic_l3mdev #define flowi6_mark __fl_common.flowic_mark #define flowi6_scope __fl_common.flowic_scope #define flowi6_proto __fl_common.flowic_proto @@ -177,6 +180,7 @@ struct flowi { } u; #define flowi_oif u.__fl_common.flowic_oif #define flowi_iif u.__fl_common.flowic_iif +#define flowi_l3mdev u.__fl_common.flowic_l3mdev #define flowi_mark u.__fl_common.flowic_mark #define flowi_tos u.__fl_common.flowic_tos #define flowi_scope u.__fl_common.flowic_scope -- cgit v1.2.3 From 0492d857636e1c52cd71594a723c4b26a7b31978 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 16 Mar 2022 11:19:43 +0100 Subject: netfilter: flowtable: Fix QinQ and pppoe support for inet table nf_flow_offload_inet_hook() does not check for 802.1q and PPPoE. Fetch inner ethertype from these encapsulation protocols. Fixes: 72efd585f714 ("netfilter: flowtable: add pppoe support") Fixes: 4cd91f7c290f ("netfilter: flowtable: add vlan support") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index bd59e950f4d6..64daafd1fc41 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -10,6 +10,8 @@ #include #include #include +#include +#include struct nf_flowtable; struct nf_flow_rule; @@ -317,4 +319,20 @@ int nf_flow_rule_route_ipv6(struct net *net, const struct flow_offload *flow, int nf_flow_table_offload_init(void); void nf_flow_table_offload_exit(void); +static inline __be16 nf_flow_pppoe_proto(const struct sk_buff *skb) +{ + __be16 proto; + + proto = *((__be16 *)(skb_mac_header(skb) + ETH_HLEN + + sizeof(struct pppoe_hdr))); + switch (proto) { + case htons(PPP_IP): + return htons(ETH_P_IP); + case htons(PPP_IPV6): + return htons(ETH_P_IPV6); + } + + return 0; +} + #endif /* _NF_FLOW_TABLE_H */ -- cgit v1.2.3 From 2cb7b4890d6e7f20560dc251e7f8d3cc68b0d554 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 14 Mar 2022 23:00:04 -0700 Subject: devlink: expose instance locking and add locked port registering It should be familiar and beneficial to expose devlink instance lock to the drivers. This way drivers can block devlink from calling them during critical sections without breakneck locking. Add port helpers, port splitting callbacks will be the first target. Use 'devl_' prefix for "explicitly locked" API. Initial RFC used '__devlink' but that's too much typing. devl_lock_is_held() is not defined without lockdep, which is the same behavior as lockdep_is_held() itself. Reviewed-by: Leon Romanovsky Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 8d5349d2fb68..9de0d091aee9 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1479,6 +1479,17 @@ void *devlink_priv(struct devlink *devlink); struct devlink *priv_to_devlink(void *priv); struct device *devlink_to_dev(const struct devlink *devlink); +/* Devlink instance explicit locking */ +void devl_lock(struct devlink *devlink); +void devl_unlock(struct devlink *devlink); +void devl_assert_locked(struct devlink *devlink); +bool devl_lock_is_held(struct devlink *devlink); + +int devl_port_register(struct devlink *devlink, + struct devlink_port *devlink_port, + unsigned int port_index); +void devl_port_unregister(struct devlink_port *devlink_port); + struct ib_device; struct net *devlink_net(const struct devlink *devlink); -- cgit v1.2.3 From 706217c1ceb516c96283a1557a31fe003d0c8052 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 14 Mar 2022 23:00:09 -0700 Subject: devlink: pass devlink_port to port_split / port_unsplit callbacks Now that devlink ports are protected by the instance lock it seems natural to pass devlink_port as an argument to the port_split / port_unsplit callbacks. This should save the drivers from doing a lookup. In theory drivers may have supported unsplitting ports which were not registered prior to this change. Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 9de0d091aee9..fd89a17adea1 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1197,9 +1197,9 @@ struct devlink_ops { struct netlink_ext_ack *extack); int (*port_type_set)(struct devlink_port *devlink_port, enum devlink_port_type port_type); - int (*port_split)(struct devlink *devlink, unsigned int port_index, + int (*port_split)(struct devlink *devlink, struct devlink_port *port, unsigned int count, struct netlink_ext_ack *extack); - int (*port_unsplit)(struct devlink *devlink, unsigned int port_index, + int (*port_unsplit)(struct devlink *devlink, struct devlink_port *port, struct netlink_ext_ack *extack); int (*sb_pool_get)(struct devlink *devlink, unsigned int sb_index, u16 pool_index, -- cgit v1.2.3 From d2a3b7c5becc3992f8e7d2b9bf5eacceeedb9a48 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Wed, 9 Mar 2022 20:33:20 +0800 Subject: bpf: Fix net.core.bpf_jit_harden race It is the bpf_jit_harden counterpart to commit 60b58afc96c9 ("bpf: fix net.core.bpf_jit_enable race"). bpf_jit_harden will be tested twice for each subprog if there are subprogs in bpf program and constant blinding may increase the length of program, so when running "./test_progs -t subprogs" and toggling bpf_jit_harden between 0 and 2, jit_subprogs may fail because constant blinding increases the length of subprog instructions during extra passs. So cache the value of bpf_jit_blinding_enabled() during program allocation, and use the cached value during constant blinding, subprog JITing and args tracking of tail call. Signed-off-by: Hou Tao Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220309123321.2400262-4-houtao1@huawei.com --- include/linux/filter.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 05ed9bd31b45..ed0c0ff42ad5 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -566,6 +566,7 @@ struct bpf_prog { gpl_compatible:1, /* Is filter GPL compatible? */ cb_access:1, /* Is control block accessed? */ dst_needed:1, /* Do we need dst entry? */ + blinding_requested:1, /* needs constant blinding */ blinded:1, /* Was blinded */ is_func:1, /* program is a bpf function */ kprobe_override:1, /* Do we override a kprobe? */ -- cgit v1.2.3 From ab95465cde2337108252cdf01f064abdc1a67f6c Mon Sep 17 00:00:00 2001 From: Maor Dickman Date: Tue, 15 Mar 2022 13:02:09 +0200 Subject: net/sched: add vlan push_eth and pop_eth action to the hardware IR Add vlan push_eth and pop_eth action to the hardware intermediate representation model which would subsequently allow it to be used by drivers for offload. Signed-off-by: Maor Dickman Reviewed-by: Roi Dayan Signed-off-by: Jakub Kicinski --- include/net/flow_offload.h | 6 ++++++ include/net/tc_act/tc_vlan.h | 10 ++++++++++ 2 files changed, 16 insertions(+) (limited to 'include') diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 92267d23083e..021778a7e1af 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -150,6 +150,8 @@ enum flow_action_id { FLOW_ACTION_PPPOE_PUSH, FLOW_ACTION_JUMP, FLOW_ACTION_PIPE, + FLOW_ACTION_VLAN_PUSH_ETH, + FLOW_ACTION_VLAN_POP_ETH, NUM_FLOW_ACTIONS, }; @@ -211,6 +213,10 @@ struct flow_action_entry { __be16 proto; u8 prio; } vlan; + struct { /* FLOW_ACTION_VLAN_PUSH_ETH */ + unsigned char dst[ETH_ALEN]; + unsigned char src[ETH_ALEN]; + } vlan_push_eth; struct { /* FLOW_ACTION_MANGLE */ /* FLOW_ACTION_ADD */ enum flow_action_mangle_base htype; diff --git a/include/net/tc_act/tc_vlan.h b/include/net/tc_act/tc_vlan.h index f94b8bc26f9e..a97600f742de 100644 --- a/include/net/tc_act/tc_vlan.h +++ b/include/net/tc_act/tc_vlan.h @@ -78,4 +78,14 @@ static inline u8 tcf_vlan_push_prio(const struct tc_action *a) return tcfv_push_prio; } + +static inline void tcf_vlan_push_eth(unsigned char *src, unsigned char *dest, + const struct tc_action *a) +{ + rcu_read_lock(); + memcpy(dest, rcu_dereference(to_vlan(a)->vlan_p)->tcfv_push_dst, ETH_ALEN); + memcpy(dest, rcu_dereference(to_vlan(a)->vlan_p)->tcfv_push_src, ETH_ALEN); + rcu_read_unlock(); +} + #endif /* __NET_TC_VLAN_H */ -- cgit v1.2.3 From 435fe1c0c1f74b682dba85641406abf4337aade6 Mon Sep 17 00:00:00 2001 From: Eyal Birger Date: Wed, 16 Mar 2022 08:15:57 +0200 Subject: net: geneve: support IPv4/IPv6 as inner protocol This patch adds support for encapsulating IPv4/IPv6 within GENEVE. In order to use this, a new IFLA_GENEVE_INNER_PROTO_INHERIT flag needs to be provided at device creation. This property cannot be changed for the time being. In case IP traffic is received on a non-tun device the drop count is increased. Signed-off-by: Eyal Birger Link: https://lore.kernel.org/r/20220316061557.431872-1-eyal.birger@gmail.com Signed-off-by: Paolo Abeni --- include/uapi/linux/if_link.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index bd24c7dc10a2..cc284c048e69 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -842,6 +842,7 @@ enum { IFLA_GENEVE_LABEL, IFLA_GENEVE_TTL_INHERIT, IFLA_GENEVE_DF, + IFLA_GENEVE_INNER_PROTO_INHERIT, __IFLA_GENEVE_MAX }; #define IFLA_GENEVE_MAX (__IFLA_GENEVE_MAX - 1) -- cgit v1.2.3 From 4206fe40b2c01fb5988980cff6ea958b16578026 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Tue, 27 Apr 2021 16:30:32 +0300 Subject: net/mlx5: Remove unused exported contiguous coherent buffer allocation API All WQ types moved to using the fragmented allocation API for coherent memory. Contiguous API is not used anymore. Remove it, reduce the number of exported functions. Signed-off-by: Tariq Toukan Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 00a914b0716e..a386aec1eb65 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1009,9 +1009,6 @@ void mlx5_start_health_poll(struct mlx5_core_dev *dev); void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health); void mlx5_drain_health_wq(struct mlx5_core_dev *dev); void mlx5_trigger_health_work(struct mlx5_core_dev *dev); -int mlx5_buf_alloc(struct mlx5_core_dev *dev, - int size, struct mlx5_frag_buf *buf); -void mlx5_buf_free(struct mlx5_core_dev *dev, struct mlx5_frag_buf *buf); int mlx5_frag_buf_alloc_node(struct mlx5_core_dev *dev, int size, struct mlx5_frag_buf *buf, int node); void mlx5_frag_buf_free(struct mlx5_core_dev *dev, struct mlx5_frag_buf *buf); -- cgit v1.2.3 From 770c9a3a01af178a90368a78c75eb91707c7233c Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Thu, 20 May 2021 15:34:57 +0300 Subject: net/mlx5: Remove unused fill page array API function mlx5_fill_page_array API function is not used. Remove it, reduce the number of exported functions. Signed-off-by: Tariq Toukan Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index a386aec1eb65..96cd740d94a3 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1036,7 +1036,6 @@ int mlx5_reclaim_startup_pages(struct mlx5_core_dev *dev); void mlx5_register_debugfs(void); void mlx5_unregister_debugfs(void); -void mlx5_fill_page_array(struct mlx5_frag_buf *buf, __be64 *pas); void mlx5_fill_page_frag_array_perm(struct mlx5_frag_buf *buf, __be64 *pas, u8 perm); void mlx5_fill_page_frag_array(struct mlx5_frag_buf *frag_buf, __be64 *pas); int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn); -- cgit v1.2.3 From 5142239a22219921a7863cf00c9ab853c00689d8 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Fri, 11 Mar 2022 10:14:18 +0100 Subject: net: veth: Account total xdp_frame len running ndo_xdp_xmit Even if this is a theoretical issue since it is not possible to perform XDP_REDIRECT on a non-linear xdp_frame, veth driver does not account paged area in ndo_xdp_xmit function pointer. Introduce xdp_get_frame_len utility routine to get the xdp_frame full length and account total frame size running XDP_REDIRECT of a non-linear xdp frame into a veth device. Signed-off-by: Lorenzo Bianconi Signed-off-by: Daniel Borkmann Acked-by: Toke Hoiland-Jorgensen Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/54f9fd3bb65d190daf2c0bbae2f852ff16cfbaa0.1646989407.git.lorenzo@kernel.org --- include/net/xdp.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/net/xdp.h b/include/net/xdp.h index b7721c3e4d1f..04c852c7a77f 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -343,6 +343,20 @@ out: __xdp_release_frame(xdpf->data, mem); } +static __always_inline unsigned int xdp_get_frame_len(struct xdp_frame *xdpf) +{ + struct skb_shared_info *sinfo; + unsigned int len = xdpf->len; + + if (likely(!xdp_frame_has_frags(xdpf))) + goto out; + + sinfo = xdp_get_shared_info_from_frame(xdpf); + len += sinfo->xdp_frags_size; +out: + return len; +} + int __xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev, u32 queue_index, unsigned int napi_id, u32 frag_size); -- cgit v1.2.3 From ec7328b59176227216c461601c6bd0e922232a9b Mon Sep 17 00:00:00 2001 From: Tobias Waldekranz Date: Wed, 16 Mar 2022 16:08:43 +0100 Subject: net: bridge: mst: Multiple Spanning Tree (MST) mode Allow the user to switch from the current per-VLAN STP mode to an MST mode. Up to this point, per-VLAN STP states where always isolated from each other. This is in contrast to the MSTP standard (802.1Q-2018, Clause 13.5), where VLANs are grouped into MST instances (MSTIs), and the state is managed on a per-MSTI level, rather that at the per-VLAN level. Perhaps due to the prevalence of the standard, many switching ASICs are built after the same model. Therefore, add a corresponding MST mode to the bridge, which we can later add offloading support for in a straight-forward way. For now, all VLANs are fixed to MSTI 0, also called the Common Spanning Tree (CST). That is, all VLANs will follow the port-global state. Upcoming changes will make this actually useful by allowing VLANs to be mapped to arbitrary MSTIs and allow individual MSTI states to be changed. Signed-off-by: Tobias Waldekranz Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- include/uapi/linux/if_bridge.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 2711c3522010..30a242195ced 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -759,6 +759,7 @@ struct br_mcast_stats { enum br_boolopt_id { BR_BOOLOPT_NO_LL_LEARN, BR_BOOLOPT_MCAST_VLAN_SNOOPING, + BR_BOOLOPT_MST_ENABLE, BR_BOOLOPT_MAX }; -- cgit v1.2.3 From 8c678d60562f3e5f6d0a5f5465e27930ffedb8ca Mon Sep 17 00:00:00 2001 From: Tobias Waldekranz Date: Wed, 16 Mar 2022 16:08:44 +0100 Subject: net: bridge: mst: Allow changing a VLAN's MSTI Allow a VLAN to move out of the CST (MSTI 0), to an independent tree. The user manages the VID to MSTI mappings via a global VLAN setting. The proposed iproute2 interface would be: bridge vlan global set dev br0 vid msti Changing the state in non-zero MSTIs is still not supported, but will be addressed in upcoming changes. Signed-off-by: Tobias Waldekranz Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- include/uapi/linux/if_bridge.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 30a242195ced..f60244b747ae 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -564,6 +564,7 @@ enum { BRIDGE_VLANDB_GOPTS_MCAST_QUERIER, BRIDGE_VLANDB_GOPTS_MCAST_ROUTER_PORTS, BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_STATE, + BRIDGE_VLANDB_GOPTS_MSTI, __BRIDGE_VLANDB_GOPTS_MAX }; #define BRIDGE_VLANDB_GOPTS_MAX (__BRIDGE_VLANDB_GOPTS_MAX - 1) -- cgit v1.2.3 From 122c29486e1ff78033c45d0d31c710e7dc8945a5 Mon Sep 17 00:00:00 2001 From: Tobias Waldekranz Date: Wed, 16 Mar 2022 16:08:45 +0100 Subject: net: bridge: mst: Support setting and reporting MST port states Make it possible to change the port state in a given MSTI by extending the bridge port netlink interface (RTM_SETLINK on PF_BRIDGE).The proposed iproute2 interface would be: bridge mst set dev msti state Current states in all applicable MSTIs can also be dumped via a corresponding RTM_GETLINK. The proposed iproute interface looks like this: $ bridge mst port msti vb1 0 state forwarding 100 state disabled vb2 0 state forwarding 100 state forwarding The preexisting per-VLAN states are still valid in the MST mode (although they are read-only), and can be queried as usual if one is interested in knowing a particular VLAN's state without having to care about the VID to MSTI mapping (in this example VLAN 20 and 30 are bound to MSTI 100): $ bridge -d vlan port vlan-id vb1 10 state forwarding mcast_router 1 20 state disabled mcast_router 1 30 state disabled mcast_router 1 40 state forwarding mcast_router 1 vb2 10 state forwarding mcast_router 1 20 state forwarding mcast_router 1 30 state forwarding mcast_router 1 40 state forwarding mcast_router 1 Signed-off-by: Tobias Waldekranz Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- include/uapi/linux/if_bridge.h | 16 ++++++++++++++++ include/uapi/linux/rtnetlink.h | 1 + 2 files changed, 17 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index f60244b747ae..a86a7e7b811f 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -122,6 +122,7 @@ enum { IFLA_BRIDGE_VLAN_TUNNEL_INFO, IFLA_BRIDGE_MRP, IFLA_BRIDGE_CFM, + IFLA_BRIDGE_MST, __IFLA_BRIDGE_MAX, }; #define IFLA_BRIDGE_MAX (__IFLA_BRIDGE_MAX - 1) @@ -453,6 +454,21 @@ enum { #define IFLA_BRIDGE_CFM_CC_PEER_STATUS_MAX (__IFLA_BRIDGE_CFM_CC_PEER_STATUS_MAX - 1) +enum { + IFLA_BRIDGE_MST_UNSPEC, + IFLA_BRIDGE_MST_ENTRY, + __IFLA_BRIDGE_MST_MAX, +}; +#define IFLA_BRIDGE_MST_MAX (__IFLA_BRIDGE_MST_MAX - 1) + +enum { + IFLA_BRIDGE_MST_ENTRY_UNSPEC, + IFLA_BRIDGE_MST_ENTRY_MSTI, + IFLA_BRIDGE_MST_ENTRY_STATE, + __IFLA_BRIDGE_MST_ENTRY_MAX, +}; +#define IFLA_BRIDGE_MST_ENTRY_MAX (__IFLA_BRIDGE_MST_ENTRY_MAX - 1) + struct bridge_stp_xstats { __u64 transition_blk; __u64 transition_fwd; diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 51530aade46e..83849a37db5b 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -817,6 +817,7 @@ enum { #define RTEXT_FILTER_MRP (1 << 4) #define RTEXT_FILTER_CFM_CONFIG (1 << 5) #define RTEXT_FILTER_CFM_STATUS (1 << 6) +#define RTEXT_FILTER_MST (1 << 7) /* End of information exported to user level */ -- cgit v1.2.3 From 87c167bb94ee3fd49569d4aa2038b9b8840d906f Mon Sep 17 00:00:00 2001 From: Tobias Waldekranz Date: Wed, 16 Mar 2022 16:08:46 +0100 Subject: net: bridge: mst: Notify switchdev drivers of MST mode changes Trigger a switchdev event whenever the bridge's MST mode is enabled/disabled. This allows constituent ports to either perform any required hardware config, or refuse the change if it not supported. Signed-off-by: Tobias Waldekranz Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- include/net/switchdev.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 3e424d40fae3..85dd004dc9ad 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -27,6 +27,7 @@ enum switchdev_attr_id { SWITCHDEV_ATTR_ID_BRIDGE_VLAN_PROTOCOL, SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED, SWITCHDEV_ATTR_ID_BRIDGE_MROUTER, + SWITCHDEV_ATTR_ID_BRIDGE_MST, SWITCHDEV_ATTR_ID_MRP_PORT_ROLE, }; @@ -48,6 +49,7 @@ struct switchdev_attr { clock_t ageing_time; /* BRIDGE_AGEING_TIME */ bool vlan_filtering; /* BRIDGE_VLAN_FILTERING */ u16 vlan_protocol; /* BRIDGE_VLAN_PROTOCOL */ + bool mst; /* BRIDGE_MST */ bool mc_disabled; /* MC_DISABLED */ u8 mrp_port_role; /* MRP_PORT_ROLE */ } u; -- cgit v1.2.3 From 6284c723d9b9995cc27ab3c6368a9d95d67111ff Mon Sep 17 00:00:00 2001 From: Tobias Waldekranz Date: Wed, 16 Mar 2022 16:08:47 +0100 Subject: net: bridge: mst: Notify switchdev drivers of VLAN MSTI migrations Whenever a VLAN moves to a new MSTI, send a switchdev notification so that switchdevs can track a bridge's VID to MSTI mappings. Signed-off-by: Tobias Waldekranz Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- include/net/switchdev.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 85dd004dc9ad..53dfa0f7cf5b 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -29,6 +29,7 @@ enum switchdev_attr_id { SWITCHDEV_ATTR_ID_BRIDGE_MROUTER, SWITCHDEV_ATTR_ID_BRIDGE_MST, SWITCHDEV_ATTR_ID_MRP_PORT_ROLE, + SWITCHDEV_ATTR_ID_VLAN_MSTI, }; struct switchdev_brport_flags { @@ -36,6 +37,11 @@ struct switchdev_brport_flags { unsigned long mask; }; +struct switchdev_vlan_msti { + u16 vid; + u16 msti; +}; + struct switchdev_attr { struct net_device *orig_dev; enum switchdev_attr_id id; @@ -52,6 +58,7 @@ struct switchdev_attr { bool mst; /* BRIDGE_MST */ bool mc_disabled; /* MC_DISABLED */ u8 mrp_port_role; /* MRP_PORT_ROLE */ + struct switchdev_vlan_msti vlan_msti; /* VLAN_MSTI */ } u; }; -- cgit v1.2.3 From 7ae9147f4312903b97eae231c48571bbd95dc63f Mon Sep 17 00:00:00 2001 From: Tobias Waldekranz Date: Wed, 16 Mar 2022 16:08:48 +0100 Subject: net: bridge: mst: Notify switchdev drivers of MST state changes Generate a switchdev notification whenever an MST state changes. This notification is keyed by the VLANs MSTI rather than the VID, since multiple VLANs may share the same MST instance. Signed-off-by: Tobias Waldekranz Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- include/net/switchdev.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 53dfa0f7cf5b..aa0171d5786d 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -19,6 +19,7 @@ enum switchdev_attr_id { SWITCHDEV_ATTR_ID_UNDEFINED, SWITCHDEV_ATTR_ID_PORT_STP_STATE, + SWITCHDEV_ATTR_ID_PORT_MST_STATE, SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS, SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS, SWITCHDEV_ATTR_ID_PORT_MROUTER, @@ -32,6 +33,11 @@ enum switchdev_attr_id { SWITCHDEV_ATTR_ID_VLAN_MSTI, }; +struct switchdev_mst_state { + u16 msti; + u8 state; +}; + struct switchdev_brport_flags { unsigned long val; unsigned long mask; @@ -50,6 +56,7 @@ struct switchdev_attr { void (*complete)(struct net_device *dev, int err, void *priv); union { u8 stp_state; /* PORT_STP_STATE */ + struct switchdev_mst_state mst_state; /* PORT_MST_STATE */ struct switchdev_brport_flags brport_flags; /* PORT_BRIDGE_FLAGS */ bool mrouter; /* PORT_MROUTER */ clock_t ageing_time; /* BRIDGE_AGEING_TIME */ -- cgit v1.2.3 From cceac97afa090284b3ceecd93ea6b7b527116767 Mon Sep 17 00:00:00 2001 From: Tobias Waldekranz Date: Wed, 16 Mar 2022 16:08:49 +0100 Subject: net: bridge: mst: Add helper to map an MSTI to a VID set br_mst_get_info answers the question: "On this bridge, which VIDs are mapped to the given MSTI?" This is useful in switchdev drivers, which might have to fan-out operations, relating to an MSTI, per VLAN. An example: When a port's MST state changes from forwarding to blocking, a driver may choose to flush the dynamic FDB entries on that port to get faster reconvergence of the network, but this should only be done in the VLANs that are managed by the MSTI in question. Signed-off-by: Tobias Waldekranz Reviewed-by: Vladimir Oltean Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- include/linux/if_bridge.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 3aae023a9353..1cf0cc46d90d 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -119,6 +119,7 @@ int br_vlan_get_info(const struct net_device *dev, u16 vid, struct bridge_vlan_info *p_vinfo); int br_vlan_get_info_rcu(const struct net_device *dev, u16 vid, struct bridge_vlan_info *p_vinfo); +int br_mst_get_info(const struct net_device *dev, u16 msti, unsigned long *vids); #else static inline bool br_vlan_enabled(const struct net_device *dev) { @@ -151,6 +152,12 @@ static inline int br_vlan_get_info_rcu(const struct net_device *dev, u16 vid, { return -EINVAL; } + +static inline int br_mst_get_info(const struct net_device *dev, u16 msti, + unsigned long *vids) +{ + return -EINVAL; +} #endif #if IS_ENABLED(CONFIG_BRIDGE) -- cgit v1.2.3 From 48d57b2e5f439246c4e12ed7705a5e3241294b03 Mon Sep 17 00:00:00 2001 From: Tobias Waldekranz Date: Wed, 16 Mar 2022 16:08:50 +0100 Subject: net: bridge: mst: Add helper to check if MST is enabled This is useful for switchdev drivers that might want to refuse to join a bridge where MST is enabled, if the hardware can't support it. Signed-off-by: Tobias Waldekranz Reviewed-by: Vladimir Oltean Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- include/linux/if_bridge.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 1cf0cc46d90d..4efd5540279a 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -119,6 +119,7 @@ int br_vlan_get_info(const struct net_device *dev, u16 vid, struct bridge_vlan_info *p_vinfo); int br_vlan_get_info_rcu(const struct net_device *dev, u16 vid, struct bridge_vlan_info *p_vinfo); +bool br_mst_enabled(const struct net_device *dev); int br_mst_get_info(const struct net_device *dev, u16 msti, unsigned long *vids); #else static inline bool br_vlan_enabled(const struct net_device *dev) @@ -153,6 +154,11 @@ static inline int br_vlan_get_info_rcu(const struct net_device *dev, u16 vid, return -EINVAL; } +static inline bool br_mst_enabled(const struct net_device *dev) +{ + return false; +} + static inline int br_mst_get_info(const struct net_device *dev, u16 msti, unsigned long *vids) { -- cgit v1.2.3 From f54fd0e163068ced4d0d27db64cd3cbda95b74e4 Mon Sep 17 00:00:00 2001 From: Tobias Waldekranz Date: Wed, 16 Mar 2022 16:08:51 +0100 Subject: net: bridge: mst: Add helper to query a port's MST state This is useful for switchdev drivers who are offloading MST states into hardware. As an example, a driver may wish to flush the FDB for a port when it transitions from forwarding to blocking - which means that the previous state must be discoverable. Signed-off-by: Tobias Waldekranz Reviewed-by: Vladimir Oltean Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- include/linux/if_bridge.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 4efd5540279a..d62ef428e3aa 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -121,6 +121,7 @@ int br_vlan_get_info_rcu(const struct net_device *dev, u16 vid, struct bridge_vlan_info *p_vinfo); bool br_mst_enabled(const struct net_device *dev); int br_mst_get_info(const struct net_device *dev, u16 msti, unsigned long *vids); +int br_mst_get_state(const struct net_device *dev, u16 msti, u8 *state); #else static inline bool br_vlan_enabled(const struct net_device *dev) { @@ -164,6 +165,11 @@ static inline int br_mst_get_info(const struct net_device *dev, u16 msti, { return -EINVAL; } +static inline int br_mst_get_state(const struct net_device *dev, u16 msti, + u8 *state) +{ + return -EINVAL; +} #endif #if IS_ENABLED(CONFIG_BRIDGE) -- cgit v1.2.3 From 8e6598a7b0fa834a563392d30017eac1b0102fcd Mon Sep 17 00:00:00 2001 From: Tobias Waldekranz Date: Wed, 16 Mar 2022 16:08:53 +0100 Subject: net: dsa: Pass VLAN MSTI migration notifications to driver Add the usual trampoline functionality from the generic DSA layer down to the drivers for VLAN MSTI migrations. Signed-off-by: Tobias Waldekranz Reviewed-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 9bfe984fcdbf..644fda2293a2 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -976,6 +976,9 @@ struct dsa_switch_ops { struct netlink_ext_ack *extack); int (*port_vlan_del)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan); + int (*vlan_msti_set)(struct dsa_switch *ds, struct dsa_bridge bridge, + const struct switchdev_vlan_msti *msti); + /* * Forwarding database */ -- cgit v1.2.3 From 7414af30b7d80f117bed5ad6769e6ffb433573ba Mon Sep 17 00:00:00 2001 From: Tobias Waldekranz Date: Wed, 16 Mar 2022 16:08:54 +0100 Subject: net: dsa: Handle MST state changes Add the usual trampoline functionality from the generic DSA layer down to the drivers for MST state changes. When a state changes to disabled/blocking/listening, make sure to fast age any dynamic entries in the affected VLANs (those controlled by the MSTI in question). Signed-off-by: Tobias Waldekranz Reviewed-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 644fda2293a2..06cdefd3b9dd 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -957,7 +957,10 @@ struct dsa_switch_ops { struct dsa_bridge bridge); void (*port_stp_state_set)(struct dsa_switch *ds, int port, u8 state); + int (*port_mst_state_set)(struct dsa_switch *ds, int port, + const struct switchdev_mst_state *state); void (*port_fast_age)(struct dsa_switch *ds, int port); + int (*port_vlan_fast_age)(struct dsa_switch *ds, int port, u16 vid); int (*port_pre_bridge_flags)(struct dsa_switch *ds, int port, struct switchdev_brport_flags flags, struct netlink_ext_ack *extack); -- cgit v1.2.3 From ccb6ed426f10ac4f742efa7d897c266aa10ac64a Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 16 Mar 2022 22:41:40 +0200 Subject: net: mscc: ocelot: add port mirroring support using tc-matchall Ocelot switches perform port-based ingress mirroring if ANA:PORT:PORT_CFG field SRC_MIRROR_ENA is set, and egress mirroring if the port is in ANA:ANA:EMIRRORPORTS. Both ingress-mirrored and egress-mirrored frames are copied to the port mask from ANA:ANA:MIRRORPORTS. So the choice of limiting to a single mirror port via ocelot_mirror_get() and ocelot_mirror_put() may seem bizarre, but the hardware model doesn't map very well to the user space model. If the user wants to mirror the ingress of swp1 towards swp2 and the ingress of swp3 towards swp4, we'd have to program ANA:ANA:MIRRORPORTS with BIT(2) | BIT(4), and that would make swp1 be mirrored towards swp4 too, and swp3 towards swp2. But there are no tc-matchall rules to describe those actions. Now, we could offload a matchall rule with multiple mirred actions, one per desired mirror port, and force the user to stick to the multi-action rule format for subsequent matchall filters. But both DSA and ocelot have the flow_offload_has_one_action() check for the matchall offload, plus the fact that it will get cumbersome to cross-check matchall mirrors with flower mirrors (which will be added in the next patch). As a result, we limit the configuration to a single mirror port, with the possibility of lifting the restriction in the future. Frames injected from the CPU don't get egress-mirrored, since they are sent with the BYPASS bit in the injection frame header, and this bypasses the analyzer module (effectively also the mirroring logic). I don't know what to do/say about this. Functionality was tested with: tc qdisc add dev swp3 clsact tc filter add dev swp3 ingress \ matchall skip_sw \ action mirred egress mirror dev swp1 and pinging through swp3, while seeing that the ICMP replies are mirrored towards swp1. Signed-off-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- include/soc/mscc/ocelot.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 4d51e2a7120f..9b4e6c78d0f4 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -642,6 +642,11 @@ struct ocelot_lag_fdb { struct list_head list; }; +struct ocelot_mirror { + refcount_t refcount; + int to; +}; + struct ocelot_port { struct ocelot *ocelot; @@ -723,6 +728,7 @@ struct ocelot { struct ocelot_vcap_block block[3]; struct ocelot_vcap_policer vcap_pol; struct vcap_props *vcap; + struct ocelot_mirror *mirror; struct ocelot_psfp_list psfp; @@ -908,6 +914,9 @@ int ocelot_get_max_mtu(struct ocelot *ocelot, int port); int ocelot_port_policer_add(struct ocelot *ocelot, int port, struct ocelot_policer *pol); int ocelot_port_policer_del(struct ocelot *ocelot, int port); +int ocelot_port_mirror_add(struct ocelot *ocelot, int from, int to, + bool ingress, struct netlink_ext_ack *extack); +void ocelot_port_mirror_del(struct ocelot *ocelot, int from, bool ingress); int ocelot_cls_flower_replace(struct ocelot *ocelot, int port, struct flow_cls_offload *f, bool ingress); int ocelot_cls_flower_destroy(struct ocelot *ocelot, int port, -- cgit v1.2.3 From f2a0e216bee5d95e2c2d916a8815a659cd3703c2 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 16 Mar 2022 22:41:42 +0200 Subject: net: mscc: ocelot: offload per-flow mirroring using tc-mirred and VCAP IS2 Per-flow mirroring with the VCAP IS2 TCAM (in itself handled as an offload for tc-flower) is done by setting the MIRROR_ENA bit from the action vector of the filter. The packet is mirrored to the port mask configured in the ANA:ANA:MIRRORPORTS register (the same port mask as the destinations for port-based mirroring). Functionality was tested with: tc qdisc add dev swp3 clsact tc filter add dev swp3 ingress protocol ip \ flower skip_sw ip_proto icmp \ action mirred egress mirror dev swp1 and pinging through swp3, while seeing that the ICMP replies are mirrored towards swp1. Signed-off-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- include/soc/mscc/ocelot_vcap.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/soc/mscc/ocelot_vcap.h b/include/soc/mscc/ocelot_vcap.h index deb2ad9eb0a5..7b2bf9b1fe69 100644 --- a/include/soc/mscc/ocelot_vcap.h +++ b/include/soc/mscc/ocelot_vcap.h @@ -654,6 +654,7 @@ struct ocelot_vcap_action { enum ocelot_mask_mode mask_mode; unsigned long port_mask; bool police_ena; + bool mirror_ena; struct ocelot_policer pol; u32 pol_ix; }; @@ -697,6 +698,7 @@ struct ocelot_vcap_filter { unsigned long ingress_port_mask; /* For VCAP ES0 */ struct ocelot_vcap_port ingress_port; + /* For VCAP IS2 mirrors and ES0 */ struct ocelot_vcap_port egress_port; enum ocelot_vcap_bit dmac_mc; -- cgit v1.2.3 From 0148bb50b8fd51baf357de8b237c0c6011506540 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 16 Mar 2022 22:41:43 +0200 Subject: net: dsa: pass extack to dsa_switch_ops :: port_mirror_add() Drivers might have error messages to propagate to user space, most common being that they support a single mirror port. Propagate the netlink extack so that they can inform user space in a verbal way of their limitations. Signed-off-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 06cdefd3b9dd..934958fda962 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -1028,7 +1028,7 @@ struct dsa_switch_ops { struct flow_cls_offload *cls, bool ingress); int (*port_mirror_add)(struct dsa_switch *ds, int port, struct dsa_mall_mirror_tc_entry *mirror, - bool ingress); + bool ingress, struct netlink_ext_ack *extack); void (*port_mirror_del)(struct dsa_switch *ds, int port, struct dsa_mall_mirror_tc_entry *mirror); int (*port_policer_add)(struct dsa_switch *ds, int port, -- cgit v1.2.3 From 4f554e955614f19425cee86de4669351741a6280 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 15 Mar 2022 23:00:26 +0900 Subject: ftrace: Add ftrace_set_filter_ips function Adding ftrace_set_filter_ips function to be able to set filter on multiple ip addresses at once. With the kprobe multi attach interface we have cases where we need to initialize ftrace_ops object with thousands of functions, so having single function diving into ftrace_hash_move_and_update_ops with ftrace_lock is faster. The functions ips are passed as unsigned long array with count. Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt (Google) Tested-by: Steven Rostedt (Google) Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/164735282673.1084943.18310504594134769804.stgit@devnote2 --- include/linux/ftrace.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 9999e29187de..60847cbce0da 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -512,6 +512,8 @@ struct dyn_ftrace { int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip, int remove, int reset); +int ftrace_set_filter_ips(struct ftrace_ops *ops, unsigned long *ips, + unsigned int cnt, int remove, int reset); int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, int len, int reset); int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, @@ -802,6 +804,7 @@ static inline unsigned long ftrace_location(unsigned long ip) #define ftrace_regex_open(ops, flag, inod, file) ({ -ENODEV; }) #define ftrace_set_early_filter(ops, buf, enable) do { } while (0) #define ftrace_set_filter_ip(ops, ip, remove, reset) ({ -ENODEV; }) +#define ftrace_set_filter_ips(ops, ips, cnt, remove, reset) ({ -ENODEV; }) #define ftrace_set_filter(ops, buf, len, reset) ({ -ENODEV; }) #define ftrace_set_notrace(ops, buf, len, reset) ({ -ENODEV; }) #define ftrace_free_filter(ops) do { } while (0) -- cgit v1.2.3 From cad9931f64dc7f5dbdec12cae9f30063360f9855 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 15 Mar 2022 23:00:38 +0900 Subject: fprobe: Add ftrace based probe APIs The fprobe is a wrapper API for ftrace function tracer. Unlike kprobes, this probes only supports the function entry, but this can probe multiple functions by one fprobe. The usage is similar, user will set their callback to fprobe::entry_handler and call register_fprobe*() with probed functions. There are 3 registration interfaces, - register_fprobe() takes filtering patterns of the functin names. - register_fprobe_ips() takes an array of ftrace-location addresses. - register_fprobe_syms() takes an array of function names. The registered fprobes can be unregistered with unregister_fprobe(). e.g. struct fprobe fp = { .entry_handler = user_handler }; const char *targets[] = { "func1", "func2", "func3"}; ... ret = register_fprobe_syms(&fp, targets, ARRAY_SIZE(targets)); ... unregister_fprobe(&fp); Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (Google) Tested-by: Steven Rostedt (Google) Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/164735283857.1084943.1154436951479395551.stgit@devnote2 --- include/linux/fprobe.h | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 include/linux/fprobe.h (limited to 'include') diff --git a/include/linux/fprobe.h b/include/linux/fprobe.h new file mode 100644 index 000000000000..2ba099aff041 --- /dev/null +++ b/include/linux/fprobe.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Simple ftrace probe wrapper */ +#ifndef _LINUX_FPROBE_H +#define _LINUX_FPROBE_H + +#include +#include + +/** + * struct fprobe - ftrace based probe. + * @ops: The ftrace_ops. + * @nmissed: The counter for missing events. + * @flags: The status flag. + * @entry_handler: The callback function for function entry. + */ +struct fprobe { +#ifdef CONFIG_FUNCTION_TRACER + /* + * If CONFIG_FUNCTION_TRACER is not set, CONFIG_FPROBE is disabled too. + * But user of fprobe may keep embedding the struct fprobe on their own + * code. To avoid build error, this will keep the fprobe data structure + * defined here, but remove ftrace_ops data structure. + */ + struct ftrace_ops ops; +#endif + unsigned long nmissed; + unsigned int flags; + void (*entry_handler)(struct fprobe *fp, unsigned long entry_ip, struct pt_regs *regs); +}; + +#define FPROBE_FL_DISABLED 1 + +static inline bool fprobe_disabled(struct fprobe *fp) +{ + return (fp) ? fp->flags & FPROBE_FL_DISABLED : false; +} + +#ifdef CONFIG_FPROBE +int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter); +int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num); +int register_fprobe_syms(struct fprobe *fp, const char **syms, int num); +int unregister_fprobe(struct fprobe *fp); +#else +static inline int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter) +{ + return -EOPNOTSUPP; +} +static inline int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num) +{ + return -EOPNOTSUPP; +} +static inline int register_fprobe_syms(struct fprobe *fp, const char **syms, int num) +{ + return -EOPNOTSUPP; +} +static inline int unregister_fprobe(struct fprobe *fp) +{ + return -EOPNOTSUPP; +} +#endif + +/** + * disable_fprobe() - Disable fprobe + * @fp: The fprobe to be disabled. + * + * This will soft-disable @fp. Note that this doesn't remove the ftrace + * hooks from the function entry. + */ +static inline void disable_fprobe(struct fprobe *fp) +{ + if (fp) + fp->flags |= FPROBE_FL_DISABLED; +} + +/** + * enable_fprobe() - Enable fprobe + * @fp: The fprobe to be enabled. + * + * This will soft-enable @fp. + */ +static inline void enable_fprobe(struct fprobe *fp) +{ + if (fp) + fp->flags &= ~FPROBE_FL_DISABLED; +} + +#endif -- cgit v1.2.3 From 54ecbe6f1ed5138c895bdff55608cf502755b20e Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 15 Mar 2022 23:00:50 +0900 Subject: rethook: Add a generic return hook Add a return hook framework which hooks the function return. Most of the logic came from the kretprobe, but this is independent from kretprobe. Note that this is expected to be used with other function entry hooking feature, like ftrace, fprobe, adn kprobes. Eventually this will replace the kretprobe (e.g. kprobe + rethook = kretprobe), but at this moment, this is just an additional hook. Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (Google) Tested-by: Steven Rostedt (Google) Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/164735285066.1084943.9259661137330166643.stgit@devnote2 --- include/linux/rethook.h | 100 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/sched.h | 3 ++ 2 files changed, 103 insertions(+) create mode 100644 include/linux/rethook.h (limited to 'include') diff --git a/include/linux/rethook.h b/include/linux/rethook.h new file mode 100644 index 000000000000..c8ac1e5afcd1 --- /dev/null +++ b/include/linux/rethook.h @@ -0,0 +1,100 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Return hooking with list-based shadow stack. + */ +#ifndef _LINUX_RETHOOK_H +#define _LINUX_RETHOOK_H + +#include +#include +#include +#include +#include +#include + +struct rethook_node; + +typedef void (*rethook_handler_t) (struct rethook_node *, void *, struct pt_regs *); + +/** + * struct rethook - The rethook management data structure. + * @data: The user-defined data storage. + * @handler: The user-defined return hook handler. + * @pool: The pool of struct rethook_node. + * @ref: The reference counter. + * @rcu: The rcu_head for deferred freeing. + * + * Don't embed to another data structure, because this is a self-destructive + * data structure when all rethook_node are freed. + */ +struct rethook { + void *data; + rethook_handler_t handler; + struct freelist_head pool; + refcount_t ref; + struct rcu_head rcu; +}; + +/** + * struct rethook_node - The rethook shadow-stack entry node. + * @freelist: The freelist, linked to struct rethook::pool. + * @rcu: The rcu_head for deferred freeing. + * @llist: The llist, linked to a struct task_struct::rethooks. + * @rethook: The pointer to the struct rethook. + * @ret_addr: The storage for the real return address. + * @frame: The storage for the frame pointer. + * + * You can embed this to your extended data structure to store any data + * on each entry of the shadow stack. + */ +struct rethook_node { + union { + struct freelist_node freelist; + struct rcu_head rcu; + }; + struct llist_node llist; + struct rethook *rethook; + unsigned long ret_addr; + unsigned long frame; +}; + +struct rethook *rethook_alloc(void *data, rethook_handler_t handler); +void rethook_free(struct rethook *rh); +void rethook_add_node(struct rethook *rh, struct rethook_node *node); +struct rethook_node *rethook_try_get(struct rethook *rh); +void rethook_recycle(struct rethook_node *node); +void rethook_hook(struct rethook_node *node, struct pt_regs *regs, bool mcount); +unsigned long rethook_find_ret_addr(struct task_struct *tsk, unsigned long frame, + struct llist_node **cur); + +/* Arch dependent code must implement arch_* and trampoline code */ +void arch_rethook_prepare(struct rethook_node *node, struct pt_regs *regs, bool mcount); +void arch_rethook_trampoline(void); + +/** + * is_rethook_trampoline() - Check whether the address is rethook trampoline + * @addr: The address to be checked + * + * Return true if the @addr is the rethook trampoline address. + */ +static inline bool is_rethook_trampoline(unsigned long addr) +{ + return addr == (unsigned long)dereference_symbol_descriptor(arch_rethook_trampoline); +} + +/* If the architecture needs to fixup the return address, implement it. */ +void arch_rethook_fixup_return(struct pt_regs *regs, + unsigned long correct_ret_addr); + +/* Generic trampoline handler, arch code must prepare asm stub */ +unsigned long rethook_trampoline_handler(struct pt_regs *regs, + unsigned long frame); + +#ifdef CONFIG_RETHOOK +void rethook_flush_task(struct task_struct *tk); +#else +#define rethook_flush_task(tsk) do { } while (0) +#endif + +#endif + diff --git a/include/linux/sched.h b/include/linux/sched.h index 75ba8aa60248..7034f53404e3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1481,6 +1481,9 @@ struct task_struct { #ifdef CONFIG_KRETPROBES struct llist_head kretprobe_instances; #endif +#ifdef CONFIG_RETHOOK + struct llist_head rethooks; +#endif #ifdef CONFIG_ARCH_HAS_PARANOID_L1D_FLUSH /* -- cgit v1.2.3 From 5b0ab78998e32564a011b14c4c7f9c81e2d42b9d Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 15 Mar 2022 23:01:48 +0900 Subject: fprobe: Add exit_handler support Add exit_handler to fprobe. fprobe + rethook allows us to hook the kernel function return. The rethook will be enabled only if the fprobe::exit_handler is set. Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (Google) Tested-by: Steven Rostedt (Google) Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/164735290790.1084943.10601965782208052202.stgit@devnote2 --- include/linux/fprobe.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/fprobe.h b/include/linux/fprobe.h index 2ba099aff041..8eefec2b485e 100644 --- a/include/linux/fprobe.h +++ b/include/linux/fprobe.h @@ -5,13 +5,16 @@ #include #include +#include /** * struct fprobe - ftrace based probe. * @ops: The ftrace_ops. * @nmissed: The counter for missing events. * @flags: The status flag. + * @rethook: The rethook data structure. (internal data) * @entry_handler: The callback function for function entry. + * @exit_handler: The callback function for function exit. */ struct fprobe { #ifdef CONFIG_FUNCTION_TRACER @@ -25,7 +28,10 @@ struct fprobe { #endif unsigned long nmissed; unsigned int flags; + struct rethook *rethook; + void (*entry_handler)(struct fprobe *fp, unsigned long entry_ip, struct pt_regs *regs); + void (*exit_handler)(struct fprobe *fp, unsigned long entry_ip, struct pt_regs *regs); }; #define FPROBE_FL_DISABLED 1 -- cgit v1.2.3 From ab51e15d535e07be9839e0df056a4ebe9c5bac83 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 15 Mar 2022 23:02:11 +0900 Subject: fprobe: Introduce FPROBE_FL_KPROBE_SHARED flag for fprobe Introduce FPROBE_FL_KPROBE_SHARED flag for sharing fprobe callback with kprobes safely from the viewpoint of recursion. Since the recursion safety of the fprobe (and ftrace) is a bit different from the kprobes, this may cause an issue if user wants to run the same code from the fprobe and the kprobes. The kprobes has per-cpu 'current_kprobe' variable which protects the kprobe handler from recursion in any case. On the other hand, the fprobe uses only ftrace_test_recursion_trylock(), which will allow interrupt context calls another (or same) fprobe during the fprobe user handler is running. This is not a matter in cases if the common callback shared among the kprobes and the fprobe has its own recursion detection, or it can handle the recursion in the different contexts (normal/interrupt/NMI.) But if it relies on the 'current_kprobe' recursion lock, it has to check kprobe_running() and use kprobe_busy_*() APIs. Fprobe has FPROBE_FL_KPROBE_SHARED flag to do this. If your common callback code will be shared with kprobes, please set FPROBE_FL_KPROBE_SHARED *before* registering the fprobe, like; fprobe.flags = FPROBE_FL_KPROBE_SHARED; register_fprobe(&fprobe, "func*", NULL); This will protect your common callback from the nested call. Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (Google) Tested-by: Steven Rostedt (Google) Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/164735293127.1084943.15687374237275817599.stgit@devnote2 --- include/linux/fprobe.h | 12 ++++++++++++ include/linux/kprobes.h | 3 +++ 2 files changed, 15 insertions(+) (limited to 'include') diff --git a/include/linux/fprobe.h b/include/linux/fprobe.h index 8eefec2b485e..1c2bde0ead73 100644 --- a/include/linux/fprobe.h +++ b/include/linux/fprobe.h @@ -34,13 +34,25 @@ struct fprobe { void (*exit_handler)(struct fprobe *fp, unsigned long entry_ip, struct pt_regs *regs); }; +/* This fprobe is soft-disabled. */ #define FPROBE_FL_DISABLED 1 +/* + * This fprobe handler will be shared with kprobes. + * This flag must be set before registering. + */ +#define FPROBE_FL_KPROBE_SHARED 2 + static inline bool fprobe_disabled(struct fprobe *fp) { return (fp) ? fp->flags & FPROBE_FL_DISABLED : false; } +static inline bool fprobe_shared_with_kprobes(struct fprobe *fp) +{ + return (fp) ? fp->flags & FPROBE_FL_KPROBE_SHARED : false; +} + #ifdef CONFIG_FPROBE int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter); int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num); diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 19b884353b15..5f1859836deb 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -427,6 +427,9 @@ static inline struct kprobe *kprobe_running(void) { return NULL; } +#define kprobe_busy_begin() do {} while (0) +#define kprobe_busy_end() do {} while (0) + static inline int register_kprobe(struct kprobe *p) { return -EOPNOTSUPP; -- cgit v1.2.3 From a0019cd7d41a191253859349535d76ee28ab7d96 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 16 Mar 2022 13:24:07 +0100 Subject: lib/sort: Add priv pointer to swap function Adding support to have priv pointer in swap callback function. Following the initial change on cmp callback functions [1] and adding SWAP_WRAPPER macro to identify sort call of sort_r. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Reviewed-by: Masami Hiramatsu Link: https://lore.kernel.org/bpf/20220316122419.933957-2-jolsa@kernel.org [1] 4333fb96ca10 ("media: lib/sort.c: implement sort() variant taking context argument") --- include/linux/sort.h | 2 +- include/linux/types.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sort.h b/include/linux/sort.h index b5898725fe9d..e163287ac6c1 100644 --- a/include/linux/sort.h +++ b/include/linux/sort.h @@ -6,7 +6,7 @@ void sort_r(void *base, size_t num, size_t size, cmp_r_func_t cmp_func, - swap_func_t swap_func, + swap_r_func_t swap_func, const void *priv); void sort(void *base, size_t num, size_t size, diff --git a/include/linux/types.h b/include/linux/types.h index ac825ad90e44..ea8cf60a8a79 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -226,6 +226,7 @@ struct callback_head { typedef void (*rcu_callback_t)(struct rcu_head *head); typedef void (*call_rcu_func_t)(struct rcu_head *head, rcu_callback_t func); +typedef void (*swap_r_func_t)(void *a, void *b, int size, const void *priv); typedef void (*swap_func_t)(void *a, void *b, int size); typedef int (*cmp_r_func_t)(const void *a, const void *b, const void *priv); -- cgit v1.2.3 From 0dcac272540613d41c05e89679e4ddb978b612f1 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 16 Mar 2022 13:24:09 +0100 Subject: bpf: Add multi kprobe link Adding new link type BPF_LINK_TYPE_KPROBE_MULTI that attaches kprobe program through fprobe API. The fprobe API allows to attach probe on multiple functions at once very fast, because it works on top of ftrace. On the other hand this limits the probe point to the function entry or return. The kprobe program gets the same pt_regs input ctx as when it's attached through the perf API. Adding new attach type BPF_TRACE_KPROBE_MULTI that allows attachment kprobe to multiple function with new link. User provides array of addresses or symbols with count to attach the kprobe program to. The new link_create uapi interface looks like: struct { __u32 flags; __u32 cnt; __aligned_u64 syms; __aligned_u64 addrs; } kprobe_multi; The flags field allows single BPF_TRACE_KPROBE_MULTI bit to create return multi kprobe. Signed-off-by: Masami Hiramatsu Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220316122419.933957-4-jolsa@kernel.org --- include/linux/bpf_types.h | 1 + include/linux/trace_events.h | 7 +++++++ include/uapi/linux/bpf.h | 13 +++++++++++++ 3 files changed, 21 insertions(+) (limited to 'include') diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 48a91c51c015..3e24ad0c4b3c 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -140,3 +140,4 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_XDP, xdp) #ifdef CONFIG_PERF_EVENTS BPF_LINK_TYPE(BPF_LINK_TYPE_PERF_EVENT, perf) #endif +BPF_LINK_TYPE(BPF_LINK_TYPE_KPROBE_MULTI, kprobe_multi) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index dcea51fb60e2..8f0e9e7cb493 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -15,6 +15,7 @@ struct array_buffer; struct tracer; struct dentry; struct bpf_prog; +union bpf_attr; const char *trace_print_flags_seq(struct trace_seq *p, const char *delim, unsigned long flags, @@ -738,6 +739,7 @@ void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp); int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, u32 *fd_type, const char **buf, u64 *probe_offset, u64 *probe_addr); +int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); #else static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) { @@ -779,6 +781,11 @@ static inline int bpf_get_perf_event_info(const struct perf_event *event, { return -EOPNOTSUPP; } +static inline int +bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + return -EOPNOTSUPP; +} #endif enum { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 99fab54ae9c0..d77f47af7752 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -997,6 +997,7 @@ enum bpf_attach_type { BPF_SK_REUSEPORT_SELECT, BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, BPF_PERF_EVENT, + BPF_TRACE_KPROBE_MULTI, __MAX_BPF_ATTACH_TYPE }; @@ -1011,6 +1012,7 @@ enum bpf_link_type { BPF_LINK_TYPE_NETNS = 5, BPF_LINK_TYPE_XDP = 6, BPF_LINK_TYPE_PERF_EVENT = 7, + BPF_LINK_TYPE_KPROBE_MULTI = 8, MAX_BPF_LINK_TYPE, }; @@ -1118,6 +1120,11 @@ enum bpf_link_type { */ #define BPF_F_XDP_HAS_FRAGS (1U << 5) +/* link_create.kprobe_multi.flags used in LINK_CREATE command for + * BPF_TRACE_KPROBE_MULTI attach type to create return probe. + */ +#define BPF_F_KPROBE_MULTI_RETURN (1U << 0) + /* When BPF ldimm64's insn[0].src_reg != 0 then this can have * the following extensions: * @@ -1475,6 +1482,12 @@ union bpf_attr { */ __u64 bpf_cookie; } perf_event; + struct { + __u32 flags; + __u32 cnt; + __aligned_u64 syms; + __aligned_u64 addrs; + } kprobe_multi; }; } link_create; -- cgit v1.2.3 From ca74823c6e16dd42b7cf60d9fdde80e2a81a67bb Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 16 Mar 2022 13:24:12 +0100 Subject: bpf: Add cookie support to programs attached with kprobe multi link Adding support to call bpf_get_attach_cookie helper from kprobe programs attached with kprobe multi link. The cookie is provided by array of u64 values, where each value is paired with provided function address or symbol with the same array index. When cookie array is provided it's sorted together with addresses (check bpf_kprobe_multi_cookie_swap). This way we can find cookie based on the address in bpf_get_attach_cookie helper. Suggested-by: Andrii Nakryiko Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220316122419.933957-7-jolsa@kernel.org --- include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d77f47af7752..7604e7d5438f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1487,6 +1487,7 @@ union bpf_attr { __u32 cnt; __aligned_u64 syms; __aligned_u64 addrs; + __aligned_u64 cookies; } kprobe_multi; }; } link_create; -- cgit v1.2.3 From b58b1f563ab78955d37e9e43e02790a85c66ac05 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 14 Mar 2022 11:38:22 +0100 Subject: xfrm: rework default policy structure This is a follow up of commit f8d858e607b2 ("xfrm: make user policy API complete"). The goal is to align userland API to the internal structures. Signed-off-by: Nicolas Dichtel Reviewed-by: Antony Antony Signed-off-by: Steffen Klassert --- include/net/netns/xfrm.h | 6 +----- include/net/xfrm.h | 48 ++++++++++++++++++------------------------------ 2 files changed, 19 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h index 947733a639a6..bd7c3be4af5d 100644 --- a/include/net/netns/xfrm.h +++ b/include/net/netns/xfrm.h @@ -66,11 +66,7 @@ struct netns_xfrm { int sysctl_larval_drop; u32 sysctl_acq_expires; - u8 policy_default; -#define XFRM_POL_DEFAULT_IN 1 -#define XFRM_POL_DEFAULT_OUT 2 -#define XFRM_POL_DEFAULT_FWD 4 -#define XFRM_POL_DEFAULT_MASK 7 + u8 policy_default[XFRM_POLICY_MAX]; #ifdef CONFIG_SYSCTL struct ctl_table_header *sysctl_hdr; diff --git a/include/net/xfrm.h b/include/net/xfrm.h index fdb41e8bb626..1541ca0cb13c 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1081,25 +1081,18 @@ xfrm_state_addr_cmp(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x, un } #ifdef CONFIG_XFRM -static inline bool -xfrm_default_allow(struct net *net, int dir) -{ - u8 def = net->xfrm.policy_default; - - switch (dir) { - case XFRM_POLICY_IN: - return def & XFRM_POL_DEFAULT_IN ? false : true; - case XFRM_POLICY_OUT: - return def & XFRM_POL_DEFAULT_OUT ? false : true; - case XFRM_POLICY_FWD: - return def & XFRM_POL_DEFAULT_FWD ? false : true; - } - return false; -} - int __xfrm_policy_check(struct sock *, int dir, struct sk_buff *skb, unsigned short family); +static inline bool __xfrm_check_nopolicy(struct net *net, struct sk_buff *skb, + int dir) +{ + if (!net->xfrm.policy_count[dir] && !secpath_exists(skb)) + return net->xfrm.policy_default[dir] == XFRM_USERPOLICY_ACCEPT; + + return false; +} + static inline int __xfrm_policy_check2(struct sock *sk, int dir, struct sk_buff *skb, unsigned int family, int reverse) @@ -1110,13 +1103,9 @@ static inline int __xfrm_policy_check2(struct sock *sk, int dir, if (sk && sk->sk_policy[XFRM_POLICY_IN]) return __xfrm_policy_check(sk, ndir, skb, family); - if (xfrm_default_allow(net, dir)) - return (!net->xfrm.policy_count[dir] && !secpath_exists(skb)) || - (skb_dst(skb) && (skb_dst(skb)->flags & DST_NOPOLICY)) || - __xfrm_policy_check(sk, ndir, skb, family); - else - return (skb_dst(skb) && (skb_dst(skb)->flags & DST_NOPOLICY)) || - __xfrm_policy_check(sk, ndir, skb, family); + return __xfrm_check_nopolicy(net, skb, dir) || + (skb_dst(skb) && (skb_dst(skb)->flags & DST_NOPOLICY)) || + __xfrm_policy_check(sk, ndir, skb, family); } static inline int xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, unsigned short family) @@ -1168,13 +1157,12 @@ static inline int xfrm_route_forward(struct sk_buff *skb, unsigned short family) { struct net *net = dev_net(skb->dev); - if (xfrm_default_allow(net, XFRM_POLICY_OUT)) - return !net->xfrm.policy_count[XFRM_POLICY_OUT] || - (skb_dst(skb)->flags & DST_NOXFRM) || - __xfrm_route_forward(skb, family); - else - return (skb_dst(skb)->flags & DST_NOXFRM) || - __xfrm_route_forward(skb, family); + if (!net->xfrm.policy_count[XFRM_POLICY_OUT] && + net->xfrm.policy_default[XFRM_POLICY_OUT] == XFRM_USERPOLICY_ACCEPT) + return true; + + return (skb_dst(skb)->flags & DST_NOXFRM) || + __xfrm_route_forward(skb, family); } static inline int xfrm4_route_forward(struct sk_buff *skb) -- cgit v1.2.3 From 54f586a9153201c6cff55e1f561990c78bd99aa7 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 16 Mar 2022 21:27:51 +0100 Subject: rfkill: make new event layout opt-in Again new complaints surfaced that we had broken the ABI here, although previously all the userspace tools had agreed that it was their mistake and fixed it. Yet now there are cases (e.g. RHEL) that want to run old userspace with newer kernels, and thus are broken. Since this is a bit of a whack-a-mole thing, change the whole extensibility scheme of rfkill to no longer just rely on the message lengths, but instead require userspace to opt in via a new ioctl to a given maximum event size that it is willing to understand. By default, set that to RFKILL_EVENT_SIZE_V1 (8), so that the behaviour for userspace not calling the ioctl will look as if it's just running on an older kernel. Fixes: 14486c82612a ("rfkill: add a reason to the HW rfkill state") Cc: stable@vger.kernel.org # 5.11+ Signed-off-by: Johannes Berg Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20220316212749.16491491b270.Ifcb1950998330a596f29a2a162e00b7546a1d6d0@changeid --- include/uapi/linux/rfkill.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/rfkill.h b/include/uapi/linux/rfkill.h index 9b77cfc42efa..283c5a7b3f2c 100644 --- a/include/uapi/linux/rfkill.h +++ b/include/uapi/linux/rfkill.h @@ -159,8 +159,16 @@ struct rfkill_event_ext { * old behaviour for all userspace, unless it explicitly opts in to the * rules outlined here by using the new &struct rfkill_event_ext. * - * Userspace using &struct rfkill_event_ext must adhere to the following - * rules + * Additionally, some other userspace (bluez, g-s-d) was reading with a + * large size but as streaming reads rather than message-based, or with + * too strict checks for the returned size. So eventually, we completely + * reverted this, and extended messages need to be opted in to by using + * an ioctl: + * + * ioctl(fd, RFKILL_IOCTL_MAX_SIZE, sizeof(struct rfkill_event_ext)); + * + * Userspace using &struct rfkill_event_ext and the ioctl must adhere to + * the following rules: * * 1. accept short writes, optionally using them to detect that it's * running on an older kernel; @@ -175,6 +183,8 @@ struct rfkill_event_ext { #define RFKILL_IOC_MAGIC 'R' #define RFKILL_IOC_NOINPUT 1 #define RFKILL_IOCTL_NOINPUT _IO(RFKILL_IOC_MAGIC, RFKILL_IOC_NOINPUT) +#define RFKILL_IOC_MAX_SIZE 2 +#define RFKILL_IOCTL_MAX_SIZE _IOW(RFKILL_IOC_MAGIC, RFKILL_IOC_EXT_SIZE, __u32) /* and that's all userspace gets */ -- cgit v1.2.3 From 0eaecfb2e4814d51ab172df3823e35d7c488b6d2 Mon Sep 17 00:00:00 2001 From: Ismael Ferreras Morezuelas Date: Mon, 7 Mar 2022 21:04:44 +0100 Subject: Bluetooth: hci_sync: Add a new quirk to skip HCI_FLT_CLEAR_ALL Some controllers have problems with being sent a command to clear all filtering. While the HCI code does not unconditionally send a clear-all anymore at BR/EDR setup (after the state machine refactor), there might be more ways of hitting these codepaths in the future as the kernel develops. Cc: stable@vger.kernel.org Cc: Hans de Goede Signed-off-by: Ismael Ferreras Morezuelas Reviewed-by: Hans de Goede Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 35c073d44ec5..5cb095b09a94 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -255,6 +255,16 @@ enum { * during the hdev->setup vendor callback. */ HCI_QUIRK_BROKEN_READ_TRANSMIT_POWER, + + /* When this quirk is set, HCI_OP_SET_EVENT_FLT requests with + * HCI_FLT_CLEAR_ALL are ignored and event filtering is + * completely avoided. A subset of the CSR controller + * clones struggle with this and instantly lock up. + * + * Note that devices using this must (separately) disable + * runtime suspend, because event filtering takes place there. + */ + HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL, }; /* HCI device flags */ -- cgit v1.2.3 From da8912176fb0ff9fd60e14fa653108d96422b896 Mon Sep 17 00:00:00 2001 From: Gavin Li Date: Mon, 14 Mar 2022 15:42:52 -0700 Subject: Bluetooth: fix incorrect nonblock bitmask in bt_sock_wait_ready() Callers pass msg->msg_flags as flags, which contains MSG_DONTWAIT instead of O_NONBLOCK. Signed-off-by: Gavin Li Signed-off-by: Marcel Holtmann --- include/net/bluetooth/bluetooth.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 2aa5e95808a5..6b48d9e2aab9 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -345,7 +345,7 @@ int bt_sock_stream_recvmsg(struct socket *sock, struct msghdr *msg, __poll_t bt_sock_poll(struct file *file, struct socket *sock, poll_table *wait); int bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int bt_sock_wait_state(struct sock *sk, int state, unsigned long timeo); -int bt_sock_wait_ready(struct sock *sk, unsigned long flags); +int bt_sock_wait_ready(struct sock *sk, unsigned int msg_flags); void bt_accept_enqueue(struct sock *parent, struct sock *sk, bool bh); void bt_accept_unlink(struct sock *sk); -- cgit v1.2.3 From 046e1537a3cf0adc68fe865b5dc9a7e731cc63b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8D=C3=B1igo=20Huguet?= Date: Tue, 15 Mar 2022 10:18:32 +0100 Subject: net: set default rss queues num to physical cores / 2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Network drivers can call to netif_get_num_default_rss_queues to get the default number of receive queues to use. Right now, this default number is min(8, num_online_cpus()). Instead, as suggested by Jakub, use the number of physical cores divided by 2 as a way to avoid wasting CPU resources and to avoid using both CPU threads, but still allowing to scale for high-end processors with many cores. As an exception, select 2 queues for processors with 2 cores, because otherwise it won't take any advantage of RSS despite being SMP capable. Tested: Processor Intel Xeon E5-2620 (2 sockets, 6 cores/socket, 2 threads/core). NIC Broadcom NetXtreme II BCM57810 (10GBps). Ran some tests with `perf stat iperf3 -R`, with parallelisms of 1, 8 and 24, getting the following results: - Number of queues: 6 (instead of 8) - Network throughput: not affected - CPU usage: utilized 0.05-0.12 CPUs more than before (having 24 CPUs this is only 0.2-0.5% higher) - Reduced the number of context switches by 7-50%, being more noticeable when using a higher number of parallel threads. Suggested-by: Jakub Kicinski Signed-off-by: Íñigo Huguet Link: https://lore.kernel.org/r/20220315091832.13873-1-ihuguet@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8cbe96ce0a2c..e01a8ce7181f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3664,7 +3664,6 @@ static inline unsigned int get_netdev_rx_queue_index( } #endif -#define DEFAULT_MAX_NUM_RSS_QUEUES (8) int netif_get_num_default_rss_queues(void); enum skb_free_reason { -- cgit v1.2.3 From 31d0bb9763efad30377505f3467f958d1ebe1e3d Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Wed, 2 Mar 2022 22:02:55 +0100 Subject: netfilter: conntrack: Add and use nf_ct_set_auto_assign_helper_warned() The function sets the pernet boolean to avoid the spurious warning from nf_ct_lookup_helper() when assigning conntrack helpers via nftables. Fixes: 1a64edf54f55 ("netfilter: nft_ct: add helper set support") Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_helper.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h index 37f0fbefb060..9939c366f720 100644 --- a/include/net/netfilter/nf_conntrack_helper.h +++ b/include/net/netfilter/nf_conntrack_helper.h @@ -177,4 +177,5 @@ void nf_nat_helper_unregister(struct nf_conntrack_nat_helper *nat); int nf_nat_helper_try_module_get(const char *name, u16 l3num, u8 protonum); void nf_nat_helper_put(struct nf_conntrack_helper *helper); +void nf_ct_set_auto_assign_helper_warned(struct net *net); #endif /*_NF_CONNTRACK_HELPER_H*/ -- cgit v1.2.3 From b2d306542ff935a4edf7a88ba8145c108193442a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 14 Mar 2022 18:23:00 +0100 Subject: netfilter: nf_tables: do not reduce read-only expressions Skip register tracking for expressions that perform read-only operations on the registers. Define and use a cookie pointer NFT_REDUCE_READONLY to avoid defining stubs for these expressions. This patch re-enables register tracking which was disabled in ed5f85d42290 ("netfilter: nf_tables: disable register tracking"). Follow up patches add remaining register tracking for existing expressions. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index c4c0861deac1..edabfb9e97ce 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1633,4 +1633,12 @@ static inline struct nftables_pernet *nft_pernet(const struct net *net) return net_generic(net, nf_tables_net_id); } +#define __NFT_REDUCE_READONLY 1UL +#define NFT_REDUCE_READONLY (void *)__NFT_REDUCE_READONLY + +static inline bool nft_reduce_is_readonly(const struct nft_expr *expr) +{ + return expr->ops->reduce == NFT_REDUCE_READONLY; +} + #endif /* _NET_NF_TABLES_H */ -- cgit v1.2.3 From 34cc9e52884a16c62acbfb309863fb60e4c24f55 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 14 Mar 2022 18:23:01 +0100 Subject: netfilter: nf_tables: cancel tracking for clobbered destination registers Output of expressions might be larger than one single register, this might clobber existing data. Reset tracking for all destination registers that required to store the expression output. This patch adds three new helper functions: - nft_reg_track_update: cancel previous register tracking and update it. - nft_reg_track_cancel: cancel any previous register tracking info. - __nft_reg_track_cancel: cancel only one single register tracking info. Partial register clobbering detection is also supported by checking the .num_reg field which describes the number of register that are used. This patch updates the following expressions: - meta_bridge - bitwise - byteorder - meta - payload to use these helper functions. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 14 ++++++++++++++ include/net/netfilter/nft_meta.h | 1 + 2 files changed, 15 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index edabfb9e97ce..20af9d3557b9 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -126,6 +126,7 @@ struct nft_regs_track { struct { const struct nft_expr *selector; const struct nft_expr *bitwise; + u8 num_reg; } regs[NFT_REG32_NUM]; const struct nft_expr *cur; @@ -1641,4 +1642,17 @@ static inline bool nft_reduce_is_readonly(const struct nft_expr *expr) return expr->ops->reduce == NFT_REDUCE_READONLY; } +void nft_reg_track_update(struct nft_regs_track *track, + const struct nft_expr *expr, u8 dreg, u8 len); +void nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg, u8 len); +void __nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg); + +static inline bool nft_reg_track_cmp(struct nft_regs_track *track, + const struct nft_expr *expr, u8 dreg) +{ + return track->regs[dreg].selector && + track->regs[dreg].selector->ops == expr->ops && + track->regs[dreg].num_reg == 0; +} + #endif /* _NET_NF_TABLES_H */ diff --git a/include/net/netfilter/nft_meta.h b/include/net/netfilter/nft_meta.h index 2dce55c736f4..246fd023dcf4 100644 --- a/include/net/netfilter/nft_meta.h +++ b/include/net/netfilter/nft_meta.h @@ -6,6 +6,7 @@ struct nft_meta { enum nft_meta_keys key:8; + u8 len; union { u8 dreg; u8 sreg; -- cgit v1.2.3 From aaa7b20bd4d637fd4ef0d72b6c828c061b9bc5f7 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 14 Mar 2022 18:23:04 +0100 Subject: netfilter: nft_meta: extend reduce support to bridge family its enough to export the meta get reduce helper and then call it from nft_meta_bridge too. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nft_meta.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nft_meta.h b/include/net/netfilter/nft_meta.h index 246fd023dcf4..9b51cc67de54 100644 --- a/include/net/netfilter/nft_meta.h +++ b/include/net/netfilter/nft_meta.h @@ -44,4 +44,6 @@ int nft_meta_set_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **data); +bool nft_meta_get_reduce(struct nft_regs_track *track, + const struct nft_expr *expr); #endif -- cgit v1.2.3 From 3c1eb413a45b6c6327fed394705081ec6202b31a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 14 Mar 2022 18:23:12 +0100 Subject: netfilter: nft_fib: add reduce support The fib expression stores to a register, so we can't add empty stub. Check that the register that is being written is in fact redundant. In most cases, this is expected to cancel tracking as re-use is unlikely. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nft_fib.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nft_fib.h b/include/net/netfilter/nft_fib.h index 237f3757637e..eed099eae672 100644 --- a/include/net/netfilter/nft_fib.h +++ b/include/net/netfilter/nft_fib.h @@ -37,4 +37,7 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, void nft_fib_store_result(void *reg, const struct nft_fib *priv, const struct net_device *dev); + +bool nft_fib_reduce(struct nft_regs_track *track, + const struct nft_expr *expr); #endif -- cgit v1.2.3 From b00fa38a9c1cba044a32a601b49a55a18ed719d1 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Thu, 17 Mar 2022 21:55:52 -0700 Subject: bpf: Enable non-atomic allocations in local storage Currently, local storage memory can only be allocated atomically (GFP_ATOMIC). This restriction is too strict for sleepable bpf programs. In this patch, the verifier detects whether the program is sleepable, and passes the corresponding GFP_KERNEL or GFP_ATOMIC flag as a 5th argument to bpf_task/sk/inode_storage_get. This flag will propagate down to the local storage functions that allocate memory. Please note that bpf_task/sk/inode_storage_update_elem functions are invoked by userspace applications through syscalls. Preemption is disabled before bpf_task/sk/inode_storage_update_elem is called, which means they will always have to allocate memory atomically. Signed-off-by: Joanne Koong Signed-off-by: Alexei Starovoitov Acked-by: KP Singh Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20220318045553.3091807-2-joannekoong@fb.com --- include/linux/bpf_local_storage.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 37b3906af8b1..493e63258497 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -154,16 +154,17 @@ void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem); struct bpf_local_storage_elem * bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value, - bool charge_mem); + bool charge_mem, gfp_t gfp_flags); int bpf_local_storage_alloc(void *owner, struct bpf_local_storage_map *smap, - struct bpf_local_storage_elem *first_selem); + struct bpf_local_storage_elem *first_selem, + gfp_t gfp_flags); struct bpf_local_storage_data * bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, - void *value, u64 map_flags); + void *value, u64 map_flags, gfp_t gfp_flags); void bpf_local_storage_free_rcu(struct rcu_head *rcu); -- cgit v1.2.3 From ee2a098851bfbe8bcdd964c0121f4246f00ff41e Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 14 Mar 2022 11:20:41 -0700 Subject: bpf: Adjust BPF stack helper functions to accommodate skip > 0 Let's say that the caller has storage for num_elem stack frames. Then, the BPF stack helper functions walk the stack for only num_elem frames. This means that if skip > 0, one keeps only 'num_elem - skip' frames. This is because it sets init_nr in the perf_callchain_entry to the end of the buffer to save num_elem entries only. I believe it was because the perf callchain code unwound the stack frames until it reached the global max size (sysctl_perf_event_max_stack). However it now has perf_callchain_entry_ctx.max_stack to limit the iteration locally. This simplifies the code to handle init_nr in the BPF callstack entries and removes the confusion with the perf_event's __PERF_SAMPLE_CALLCHAIN_EARLY which sets init_nr to 0. Also change the comment on bpf_get_stack() in the header file to be more explicit what the return value means. Fixes: c195651e565a ("bpf: add bpf_get_stack helper") Signed-off-by: Namhyung Kim Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/30a7b5d5-6726-1cc2-eaee-8da2828a9a9c@oracle.com Link: https://lore.kernel.org/bpf/20220314182042.71025-1-namhyung@kernel.org Based-on-patch-by: Eugene Loh --- include/uapi/linux/bpf.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7604e7d5438f..d14b10b85e51 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3009,8 +3009,8 @@ union bpf_attr { * * # sysctl kernel.perf_event_max_stack= * Return - * A non-negative value equal to or less than *size* on success, - * or a negative error in case of failure. + * The non-negative copied *buf* length equal to or less than + * *size* on success, or a negative error in case of failure. * * long bpf_skb_load_bytes_relative(const void *skb, u32 offset, void *to, u32 len, u32 start_header) * Description @@ -4316,8 +4316,8 @@ union bpf_attr { * * # sysctl kernel.perf_event_max_stack= * Return - * A non-negative value equal to or less than *size* on success, - * or a negative error in case of failure. + * The non-negative copied *buf* length equal to or less than + * *size* on success, or a negative error in case of failure. * * long bpf_load_hdr_opt(struct bpf_sock_ops *skops, void *searchby_res, u32 len, u64 flags) * Description -- cgit v1.2.3 From 8879b32a3a809a183e6e2998725a0f33c4e42d41 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 18 Mar 2022 12:23:41 -0700 Subject: devlink: add explicitly locked flavor of the rate node APIs We'll need an explicitly locked rate node API for netdevsim to switch eswitch mode setting to locked. Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/devlink.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index fd89a17adea1..a30180c0988a 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1490,6 +1490,10 @@ int devl_port_register(struct devlink *devlink, unsigned int port_index); void devl_port_unregister(struct devlink_port *devlink_port); +int devl_rate_leaf_create(struct devlink_port *port, void *priv); +void devl_rate_leaf_destroy(struct devlink_port *devlink_port); +void devl_rate_nodes_destroy(struct devlink *devlink); + struct ib_device; struct net *devlink_net(const struct devlink *devlink); -- cgit v1.2.3 From 351bdbb6419ca988802882badadc321d384d0254 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 21 Mar 2022 10:22:37 +0100 Subject: net: Revert the softirq will run annotation in ____napi_schedule(). The lockdep annotation lockdep_assert_softirq_will_run() expects that either hard or soft interrupts are disabled because both guaranty that the "raised" soft-interrupts will be processed once the context is left. This triggers in flush_smp_call_function_from_idle() but it this case it explicitly calls do_softirq() in case of pending softirqs. Revert the "softirq will run" annotation in ____napi_schedule() and move the check back to __netif_rx() as it was. Keep the IRQ-off assert in ____napi_schedule() because this is always required. Fixes: fbd9a2ceba5c7 ("net: Add lockdep asserts to ____napi_schedule().") Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Jason A. Donenfeld Link: https://lore.kernel.org/r/YjhD3ZKWysyw8rc6@linutronix.de Signed-off-by: Jakub Kicinski --- include/linux/lockdep.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 0cc65d216701..467b94257105 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -329,12 +329,6 @@ extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); #define lockdep_assert_none_held_once() \ lockdep_assert_once(!current->lockdep_depth) -/* - * Ensure that softirq is handled within the callchain and not delayed and - * handled by chance. - */ -#define lockdep_assert_softirq_will_run() \ - lockdep_assert_once(hardirq_count() | softirq_count()) #define lockdep_recursing(tsk) ((tsk)->lockdep_recursion) @@ -420,7 +414,6 @@ extern int lockdep_is_held(const void *); #define lockdep_assert_held_read(l) do { (void)(l); } while (0) #define lockdep_assert_held_once(l) do { (void)(l); } while (0) #define lockdep_assert_none_held_once() do { } while (0) -#define lockdep_assert_softirq_will_run() do { } while (0) #define lockdep_recursing(tsk) (0) -- cgit v1.2.3 From 4a0cb83ba6e0cd73a50fa4f84736846bf0029f2b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 21 Mar 2022 22:10:53 -0700 Subject: netdevice: add missing dm_private kdoc Building htmldocs complains: include/linux/netdevice.h:2295: warning: Function parameter or member 'dm_private' not described in 'net_device' Fixes: b26ef81c46ed ("drop_monitor: remove quadratic behavior") Signed-off-by: Jakub Kicinski Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20220322051053.1883186-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e01a8ce7181f..cd7a597c55b1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1899,6 +1899,8 @@ enum netdev_ml_priv_type { * @garp_port: GARP * @mrp_port: MRP * + * @dm_private: Drop monitor private + * * @dev: Class/net/name entry * @sysfs_groups: Space for optional device, statistics and wireless * sysfs groups -- cgit v1.2.3 From 2af7e566a8616c278e1d7287ce86cd3900bed943 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 22 Mar 2022 10:22:24 -0700 Subject: net/mlx5e: Fix build warning, detected write beyond size of field When merged with Linus tree, the cited patch below will cause the following build warning: In function 'fortify_memset_chk', inlined from 'mlx5e_xmit_xdp_frame' at drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c:438:3: include/linux/fortify-string.h:242:25: error: call to '__write_overflow_field' declared with attribute warning: detected write beyond size of field (1st parameter); maybe use struct_group()? [-Werror=attribute-warning] 242 | __write_overflow_field(p_size_field, size); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Fix that by grouping the fields to memeset in struct_group() to avoid the false alarm. Fixes: 9ded70fa1d81 ("net/mlx5e: Don't prefill WQEs in XDP SQ in the multi buffer mode") Reported-by: Stephen Rothwell Suggested-by: Stephen Rothwell Signed-off-by: Saeed Mahameed Link: https://lore.kernel.org/r/20220322172224.31849-1-saeed@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/mlx5/qp.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 61e48d459b23..8bda3ba5b109 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -202,6 +202,9 @@ struct mlx5_wqe_fmr_seg { struct mlx5_wqe_ctrl_seg { __be32 opmod_idx_opcode; __be32 qpn_ds; + + struct_group(trailer, + u8 signature; u8 rsvd[2]; u8 fm_ce_se; @@ -211,6 +214,8 @@ struct mlx5_wqe_ctrl_seg { __be32 umr_mkey; __be32 tis_tir_num; }; + + ); /* end of trailer group */ }; #define MLX5_WQE_CTRL_DS_MASK 0x3f -- cgit v1.2.3 From 054d5575cd6ed2792611a7cbb8c88663cc873780 Mon Sep 17 00:00:00 2001 From: Louis Peens Date: Wed, 23 Mar 2022 11:25:06 +0200 Subject: net/sched: fix incorrect vlan_push_eth dest field Seems like a potential copy-paste bug slipped in here, the second memcpy should of course be populating src and not dest. Fixes: ab95465cde23 ("net/sched: add vlan push_eth and pop_eth action to the hardware IR") Signed-off-by: Louis Peens Link: https://lore.kernel.org/r/20220323092506.21639-1-louis.peens@corigine.com Signed-off-by: Jakub Kicinski --- include/net/tc_act/tc_vlan.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/tc_act/tc_vlan.h b/include/net/tc_act/tc_vlan.h index a97600f742de..904eddfc1826 100644 --- a/include/net/tc_act/tc_vlan.h +++ b/include/net/tc_act/tc_vlan.h @@ -84,7 +84,7 @@ static inline void tcf_vlan_push_eth(unsigned char *src, unsigned char *dest, { rcu_read_lock(); memcpy(dest, rcu_dereference(to_vlan(a)->vlan_p)->tcfv_push_dst, ETH_ALEN); - memcpy(dest, rcu_dereference(to_vlan(a)->vlan_p)->tcfv_push_src, ETH_ALEN); + memcpy(src, rcu_dereference(to_vlan(a)->vlan_p)->tcfv_push_src, ETH_ALEN); rcu_read_unlock(); } -- cgit v1.2.3