From 74523c06ae20b83c5508a98af62393ac34913362 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 6 Nov 2023 20:57:23 -0800 Subject: bpf: Add __bpf_dynptr_data* for in kernel use Different types of bpf dynptr have different internal data storage. Specifically, SKB and XDP type of dynptr may have non-continuous data. Therefore, it is not always safe to directly access dynptr->data. Add __bpf_dynptr_data and __bpf_dynptr_data_rw to replace direct access to dynptr->data. Update bpf_verify_pkcs7_signature to use __bpf_dynptr_data instead of dynptr->data. Signed-off-by: Song Liu Signed-off-by: Andrii Nakryiko Acked-by: Vadim Fedorenko Link: https://lore.kernel.org/bpf/20231107045725.2278852-2-song@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b4825d3cdb29..eb84caf133df 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1222,6 +1222,8 @@ enum bpf_dynptr_type { int bpf_dynptr_check_size(u32 size); u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr); +const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len); +void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u32 len); #ifdef CONFIG_BPF_JIT int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr); -- cgit v1.2.3 From 790ce3cfefb1b768dccd4eee324ddef0f0ce3db4 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Tue, 7 Nov 2023 00:56:37 -0800 Subject: bpf: Move GRAPH_{ROOT,NODE}_MASK macros into btf_field_type enum This refactoring patch removes the unused BPF_GRAPH_NODE_OR_ROOT btf_field_type and moves BPF_GRAPH_{NODE,ROOT} macros into the btf_field_type enum. Further patches in the series will use BPF_GRAPH_NODE, so let's move this useful definition out of btf.c. Signed-off-by: Dave Marchevsky Link: https://lore.kernel.org/r/20231107085639.3016113-5-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index eb84caf133df..4001d11be151 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -186,8 +186,8 @@ enum btf_field_type { BPF_LIST_NODE = (1 << 6), BPF_RB_ROOT = (1 << 7), BPF_RB_NODE = (1 << 8), - BPF_GRAPH_NODE_OR_ROOT = BPF_LIST_NODE | BPF_LIST_HEAD | - BPF_RB_NODE | BPF_RB_ROOT, + BPF_GRAPH_NODE = BPF_RB_NODE | BPF_LIST_NODE, + BPF_GRAPH_ROOT = BPF_RB_ROOT | BPF_LIST_HEAD, BPF_REFCOUNT = (1 << 9), }; -- cgit v1.2.3 From 155addf0814a92d08fce26a11b27e3315cdba977 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 3 Nov 2023 19:49:00 -0700 Subject: bpf: Use named fields for certain bpf uapi structs Martin and Vadim reported a verifier failure with bpf_dynptr usage. The issue is mentioned but Vadim workarounded the issue with source change ([1]). The below describes what is the issue and why there is a verification failure. int BPF_PROG(skb_crypto_setup) { struct bpf_dynptr algo, key; ... bpf_dynptr_from_mem(..., ..., 0, &algo); ... } The bpf program is using vmlinux.h, so we have the following definition in vmlinux.h: struct bpf_dynptr { long: 64; long: 64; }; Note that in uapi header bpf.h, we have struct bpf_dynptr { long: 64; long: 64; } __attribute__((aligned(8))); So we lost alignment information for struct bpf_dynptr by using vmlinux.h. Let us take a look at a simple program below: $ cat align.c typedef unsigned long long __u64; struct bpf_dynptr_no_align { __u64 :64; __u64 :64; }; struct bpf_dynptr_yes_align { __u64 :64; __u64 :64; } __attribute__((aligned(8))); void bar(void *, void *); int foo() { struct bpf_dynptr_no_align a; struct bpf_dynptr_yes_align b; bar(&a, &b); return 0; } $ clang --target=bpf -O2 -S -emit-llvm align.c Look at the generated IR file align.ll: ... %a = alloca %struct.bpf_dynptr_no_align, align 1 %b = alloca %struct.bpf_dynptr_yes_align, align 8 ... The compiler dictates the alignment for struct bpf_dynptr_no_align is 1 and the alignment for struct bpf_dynptr_yes_align is 8. So theoretically compiler could allocate variable %a with alignment 1 although in reallity the compiler may choose a different alignment by considering other local variables. In [1], the verification failure happens because variable 'algo' is allocated on the stack with alignment 4 (fp-28). But the verifer wants its alignment to be 8. To fix the issue, the RFC patch ([1]) tried to add '__attribute__((aligned(8)))' to struct bpf_dynptr plus other similar structs. Andrii suggested that we could directly modify uapi struct with named fields like struct 'bpf_iter_num': struct bpf_iter_num { /* opaque iterator state; having __u64 here allows to preserve correct * alignment requirements in vmlinux.h, generated from BTF */ __u64 __opaque[1]; } __attribute__((aligned(8))); Indeed, adding named fields for those affected structs in this patch can preserve alignment when bpf program references them in vmlinux.h. With this patch, the verification failure in [1] can also be resolved. [1] https://lore.kernel.org/bpf/1b100f73-7625-4c1f-3ae5-50ecf84d3ff0@linux.dev/ [2] https://lore.kernel.org/bpf/20231103055218.2395034-1-yonghong.song@linux.dev/ Cc: Vadim Fedorenko Cc: Martin KaFai Lau Suggested-by: Andrii Nakryiko Signed-off-by: Yonghong Song Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231104024900.1539182-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0f6cdf52b1da..095ca7238ac2 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -7151,40 +7151,31 @@ struct bpf_spin_lock { }; struct bpf_timer { - __u64 :64; - __u64 :64; + __u64 __opaque[2]; } __attribute__((aligned(8))); struct bpf_dynptr { - __u64 :64; - __u64 :64; + __u64 __opaque[2]; } __attribute__((aligned(8))); struct bpf_list_head { - __u64 :64; - __u64 :64; + __u64 __opaque[2]; } __attribute__((aligned(8))); struct bpf_list_node { - __u64 :64; - __u64 :64; - __u64 :64; + __u64 __opaque[3]; } __attribute__((aligned(8))); struct bpf_rb_root { - __u64 :64; - __u64 :64; + __u64 __opaque[2]; } __attribute__((aligned(8))); struct bpf_rb_node { - __u64 :64; - __u64 :64; - __u64 :64; - __u64 :64; + __u64 __opaque[4]; } __attribute__((aligned(8))); struct bpf_refcount { - __u32 :32; + __u32 __opaque[1]; } __attribute__((aligned(4))); struct bpf_sysctl { -- cgit v1.2.3 From 689b097a06bafb461ec162fc3b3ecc9765cea67b Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Mon, 6 Nov 2023 03:18:02 +0000 Subject: compiler-gcc: Suppress -Wmissing-prototypes warning for all supported GCC The kernel supports a minimum GCC version of 5.1.0 for building. However, the "__diag_ignore_all" directive only suppresses the "-Wmissing-prototypes" warning for GCC versions >= 8.0.0. As a result, when building the kernel with older GCC versions, warnings may be triggered. The example below illustrates the warnings reported by the kernel test robot using GCC 7.5.0: compiler: gcc-7 (Ubuntu 7.5.0-6ubuntu2) 7.5.0 All warnings (new ones prefixed by >>): kernel/bpf/helpers.c:1893:19: warning: no previous prototype for 'bpf_obj_new_impl' [-Wmissing-prototypes] __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign) ^~~~~~~~~~~~~~~~ kernel/bpf/helpers.c:1907:19: warning: no previous prototype for 'bpf_percpu_obj_new_impl' [-Wmissing-prototypes] __bpf_kfunc void *bpf_percpu_obj_new_impl(u64 local_type_id__k, void *meta__ign) [...] To address this, we should also suppress the "-Wmissing-prototypes" warning for older GCC versions. "#pragma GCC diagnostic push" is supported as of GCC 4.6, and both "-Wmissing-prototypes" and "-Wmissing-declarations" are supported for all the GCC versions that we currently support. Therefore, it is reasonable to suppress these warnings for all supported GCC versions. With this adjustment, it's important to note that after implementing "__diag_ignore_all", it will effectively suppress warnings for all the supported GCC versions. In the future, if you wish to suppress warnings that are only supported on higher GCC versions, it is advisable to explicitly use "__diag_ignore" to specify the GCC version you are targeting. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202311031651.A7crZEur-lkp@intel.com/ Suggested-by: Arnd Bergmann Signed-off-by: Yafang Shao Cc: Kumar Kartikeya Dwivedi Cc: Arnd Bergmann Acked-by: Arnd Bergmann Link: https://lore.kernel.org/r/20231106031802.4188-1-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/compiler-gcc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 2ceba3fe4ec1..aebb65bf95a7 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -136,7 +136,7 @@ #endif #define __diag_ignore_all(option, comment) \ - __diag_GCC(8, ignore, option) + __diag(__diag_GCC_ignore option) /* * Prior to 9.1, -Wno-alloc-size-larger-than (and therefore the "alloc_size" -- cgit v1.2.3 From b8e3a87a627b575896e448021e5c2f8a3bc19931 Mon Sep 17 00:00:00 2001 From: Jordan Rome Date: Wed, 8 Nov 2023 03:23:34 -0800 Subject: bpf: Add crosstask check to __bpf_get_stack Currently get_perf_callchain only supports user stack walking for the current task. Passing the correct *crosstask* param will return 0 frames if the task passed to __bpf_get_stack isn't the current one instead of a single incorrect frame/address. This change passes the correct *crosstask* param but also does a preemptive check in __bpf_get_stack if the task is current and returns -EOPNOTSUPP if it is not. This issue was found using bpf_get_task_stack inside a BPF iterator ("iter/task"), which iterates over all tasks. bpf_get_task_stack works fine for fetching kernel stacks but because get_perf_callchain relies on the caller to know if the requested *task* is the current one (via *crosstask*) it was failing in a confusing way. It might be possible to get user stacks for all tasks utilizing something like access_process_vm but that requires the bpf program calling bpf_get_task_stack to be sleepable and would therefore be a breaking change. Fixes: fa28dcb82a38 ("bpf: Introduce helper bpf_get_task_stack()") Signed-off-by: Jordan Rome Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20231108112334.3433136-1-jordalgo@meta.com --- include/uapi/linux/bpf.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 095ca7238ac2..7cf8bcf9f6a2 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4517,6 +4517,8 @@ union bpf_attr { * long bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags) * Description * Return a user or a kernel stack in bpf program provided buffer. + * Note: the user stack will only be populated if the *task* is + * the current task; all other tasks will return -EOPNOTSUPP. * To achieve this, the helper needs *task*, which is a valid * pointer to **struct task_struct**. To store the stacktrace, the * bpf program provides *buf* with a nonnegative *size*. @@ -4528,6 +4530,7 @@ union bpf_attr { * * **BPF_F_USER_STACK** * Collect a user space stack instead of a kernel stack. + * The *task* must be the current task. * **BPF_F_USER_BUILD_ID** * Collect buildid+offset instead of ips for user stack, * only valid if **BPF_F_USER_STACK** is also specified. -- cgit v1.2.3 From 07afe1ba288c04280622fa002ed385f1ac0b6fe6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20L=C3=BCssing?= Date: Thu, 7 Sep 2023 03:09:08 +0200 Subject: batman-adv: mcast: implement multicast packet reception and forwarding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement functionality to receive and forward a new TVLV capable multicast packet type. The new batman-adv multicast packet type allows to contain several originator destination addresses within a TVLV. Routers on the way will potentially split the batman-adv multicast packet and adjust its tracker TVLV contents. Routing decisions are still based on the selected BATMAN IV or BATMAN V routing algorithm. So this new batman-adv multicast packet type retains the same loop-free properties. Also a new OGM multicast TVLV flag is introduced to signal to other nodes that we are capable of handling a batman-adv multicast packet and multicast tracker TVLV. And that all of our hard interfaces have an MTU of at least 1280 bytes (IPv6 minimum MTU), as a simple solution for now to avoid MTU issues while forwarding. Signed-off-by: Linus Lüssing Signed-off-by: Simon Wunderlich --- include/uapi/linux/batadv_packet.h | 45 +++++++++++++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/batadv_packet.h b/include/uapi/linux/batadv_packet.h index 9204e4494b25..6e25753015df 100644 --- a/include/uapi/linux/batadv_packet.h +++ b/include/uapi/linux/batadv_packet.h @@ -116,6 +116,9 @@ enum batadv_icmp_packettype { * only need routable IPv4 multicast packets we signed up for explicitly * @BATADV_MCAST_WANT_NO_RTR6: we have no IPv6 multicast router and therefore * only need routable IPv6 multicast packets we signed up for explicitly + * @BATADV_MCAST_HAVE_MC_PTYPE_CAPA: we can parse, receive and forward + * batman-adv multicast packets with a multicast tracker TVLV. And all our + * hard interfaces have an MTU of at least 1280 bytes. */ enum batadv_mcast_flags { BATADV_MCAST_WANT_ALL_UNSNOOPABLES = 1UL << 0, @@ -123,6 +126,7 @@ enum batadv_mcast_flags { BATADV_MCAST_WANT_ALL_IPV6 = 1UL << 2, BATADV_MCAST_WANT_NO_RTR4 = 1UL << 3, BATADV_MCAST_WANT_NO_RTR6 = 1UL << 4, + BATADV_MCAST_HAVE_MC_PTYPE_CAPA = 1UL << 5, }; /* tt data subtypes */ @@ -174,14 +178,16 @@ enum batadv_bla_claimframe { * @BATADV_TVLV_TT: translation table tvlv * @BATADV_TVLV_ROAM: roaming advertisement tvlv * @BATADV_TVLV_MCAST: multicast capability tvlv + * @BATADV_TVLV_MCAST_TRACKER: multicast tracker tvlv */ enum batadv_tvlv_type { - BATADV_TVLV_GW = 0x01, - BATADV_TVLV_DAT = 0x02, - BATADV_TVLV_NC = 0x03, - BATADV_TVLV_TT = 0x04, - BATADV_TVLV_ROAM = 0x05, - BATADV_TVLV_MCAST = 0x06, + BATADV_TVLV_GW = 0x01, + BATADV_TVLV_DAT = 0x02, + BATADV_TVLV_NC = 0x03, + BATADV_TVLV_TT = 0x04, + BATADV_TVLV_ROAM = 0x05, + BATADV_TVLV_MCAST = 0x06, + BATADV_TVLV_MCAST_TRACKER = 0x07, }; #pragma pack(2) @@ -487,6 +493,25 @@ struct batadv_bcast_packet { */ }; +/** + * struct batadv_mcast_packet - multicast packet for network payload + * @packet_type: batman-adv packet type, part of the general header + * @version: batman-adv protocol version, part of the general header + * @ttl: time to live for this packet, part of the general header + * @reserved: reserved byte for alignment + * @tvlv_len: length of the appended tvlv buffer (in bytes) + */ +struct batadv_mcast_packet { + __u8 packet_type; + __u8 version; + __u8 ttl; + __u8 reserved; + __be16 tvlv_len; + /* "4 bytes boundary + 2 bytes" long to make the payload after the + * following ethernet header again 4 bytes boundary aligned + */ +}; + /** * struct batadv_coded_packet - network coded packet * @packet_type: batman-adv packet type, part of the general header @@ -628,6 +653,14 @@ struct batadv_tvlv_mcast_data { __u8 reserved[3]; }; +/** + * struct batadv_tvlv_mcast_tracker - payload of a multicast tracker tvlv + * @num_dests: number of subsequent destination originator MAC addresses + */ +struct batadv_tvlv_mcast_tracker { + __be16 num_dests; +}; + #pragma pack() #endif /* _UAPI_LINUX_BATADV_PACKET_H_ */ -- cgit v1.2.3 From 4aea6a6d61cd6e3df9ed98345638abad1b1e5276 Mon Sep 17 00:00:00 2001 From: Rahul Rameshbabu Date: Thu, 30 Mar 2023 15:05:38 -0700 Subject: net/mlx5: Query maximum frequency adjustment of the PTP hardware clock Some mlx5 devices do not support the default advertised maximum frequency adjustment value for the PTP hardware clock that is set by the driver. These devices need to be queried when initializing the clock functionality in order to get the maximum supported frequency adjustment value. This value can be greater than the minimum supported frequency adjustment across mlx5 devices (50 million ppb). Signed-off-by: Rahul Rameshbabu Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 6f3631425f38..ce2e71cd6d2a 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -10103,7 +10103,10 @@ enum { struct mlx5_ifc_mtutc_reg_bits { u8 reserved_at_0[0x5]; u8 freq_adj_units[0x3]; - u8 reserved_at_8[0x14]; + u8 reserved_at_8[0x3]; + u8 log_max_freq_adjustment[0x5]; + + u8 reserved_at_10[0xc]; u8 operation[0x4]; u8 freq_adjustment[0x20]; -- cgit v1.2.3 From 67420501e8681ae18f9f0ea0a69cd2f432100e70 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 11 Nov 2023 17:05:57 -0800 Subject: bpf: generalize reg_set_min_max() to handle non-const register comparisons Generalize bounds adjustment logic of reg_set_min_max() to handle not just register vs constant case, but in general any register vs any register cases. For most of the operations it's trivial extension based on range vs range comparison logic, we just need to properly pick min/max of a range to compare against min/max of the other range. For BPF_JSET we keep the original capabilities, just make sure JSET is integrated in the common framework. This is manifested in the internal-only BPF_JSET + BPF_X "opcode" to allow for simpler and more uniform rev_opcode() handling. See the code for details. This allows to reuse the same code exactly both for TRUE and FALSE branches without explicitly handling both conditions with custom code. Note also that now we don't need a special handling of BPF_JEQ/BPF_JNE case none of the registers are constants. This is now just a normal generic case handled by reg_set_min_max(). To make tnum handling cleaner, tnum_with_subreg() helper is added, as that's a common operator when dealing with 32-bit subregister bounds. This keeps the overall logic much less noisy when it comes to tnums. Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Acked-by: Shung-Hsi Yu Link: https://lore.kernel.org/r/20231112010609.848406-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/tnum.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/tnum.h b/include/linux/tnum.h index 1c3948a1d6ad..3c13240077b8 100644 --- a/include/linux/tnum.h +++ b/include/linux/tnum.h @@ -106,6 +106,10 @@ int tnum_sbin(char *str, size_t size, struct tnum a); struct tnum tnum_subreg(struct tnum a); /* Returns the tnum with the lower 32-bit subreg cleared */ struct tnum tnum_clear_subreg(struct tnum a); +/* Returns the tnum with the lower 32-bit subreg in *reg* set to the lower + * 32-bit subreg in *subreg* + */ +struct tnum tnum_with_subreg(struct tnum reg, struct tnum subreg); /* Returns the tnum with the lower 32-bit subreg set to value */ struct tnum tnum_const_subreg(struct tnum a, u32 value); /* Returns true if 32-bit subreg @a is a known constant*/ -- cgit v1.2.3 From 5f99f312bd3bedb3b266b0d26376a8c500cdc97f Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 11 Nov 2023 17:06:00 -0800 Subject: bpf: add register bounds sanity checks and sanitization Add simple sanity checks that validate well-formed ranges (min <= max) across u64, s64, u32, and s32 ranges. Also for cases when the value is constant (either 64-bit or 32-bit), we validate that ranges and tnums are in agreement. These bounds checks are performed at the end of BPF_ALU/BPF_ALU64 operations, on conditional jumps, and for LDX instructions (where subreg zero/sign extension is probably the most important to check). This covers most of the interesting cases. Also, we validate the sanity of the return register when manually adjusting it for some special helpers. By default, sanity violation will trigger a warning in verifier log and resetting register bounds to "unbounded" ones. But to aid development and debugging, BPF_F_TEST_SANITY_STRICT flag is added, which will trigger hard failure of verification with -EFAULT on register bounds violations. This allows selftests to catch such issues. veristat will also gain a CLI option to enable this behavior. Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Acked-by: Shung-Hsi Yu Link: https://lore.kernel.org/r/20231112010609.848406-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + include/uapi/linux/bpf.h | 3 +++ 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 24213a99cc79..402b6bc44a1b 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -602,6 +602,7 @@ struct bpf_verifier_env { int stack_size; /* number of states to be processed */ bool strict_alignment; /* perform strict pointer alignment checks */ bool test_state_freq; /* test verifier with different pruning frequency */ + bool test_sanity_strict; /* fail verification on sanity violations */ struct bpf_verifier_state *cur_state; /* current verifier state */ struct bpf_verifier_state_list **explored_states; /* search pruning optimization */ struct bpf_verifier_state_list *free_list; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7cf8bcf9f6a2..8a5855fcee69 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1200,6 +1200,9 @@ enum bpf_perf_event_type { */ #define BPF_F_XDP_DEV_BOUND_ONLY (1U << 6) +/* The verifier internal test flag. Behavior is undefined */ +#define BPF_F_TEST_SANITY_STRICT (1U << 7) + /* link_create.kprobe_multi.flags used in LINK_CREATE command for * BPF_TRACE_KPROBE_MULTI attach type to create return probe. */ -- cgit v1.2.3 From 3185d57cfcd34fadbe28f4ed57a6cb5122277ece Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Tue, 14 Nov 2023 11:42:02 +0100 Subject: indirect_call_wrapper: Fix typo in INDIRECT_CALL_$NR kerneldoc Fix a small typo in the kerneldoc comment of the INDIRECT_CALL_$NR macro. Signed-off-by: Tobias Klauser Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20231114104202.4680-1-tklauser@distanz.ch Signed-off-by: Paolo Abeni --- include/linux/indirect_call_wrapper.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/indirect_call_wrapper.h b/include/linux/indirect_call_wrapper.h index c1c76a70a6ce..adb83a42a6b9 100644 --- a/include/linux/indirect_call_wrapper.h +++ b/include/linux/indirect_call_wrapper.h @@ -11,7 +11,7 @@ * @__VA_ARGS__: arguments for @f * * Avoid retpoline overhead for known builtin, checking @f vs each of them and - * eventually invoking directly the builtin function. The functions are check + * eventually invoking directly the builtin function. The functions are checked * in the given order. Fallback to the indirect call. */ #define INDIRECT_CALL_1(f, f1, ...) \ -- cgit v1.2.3 From c6e9dba3be5ef3b701b29b143609561915e5d0e9 Mon Sep 17 00:00:00 2001 From: Alce Lafranque Date: Tue, 14 Nov 2023 11:36:57 -0600 Subject: vxlan: add support for flowlabel inherit By default, VXLAN encapsulation over IPv6 sets the flow label to 0, with an option for a fixed value. This commits add the ability to inherit the flow label from the inner packet, like for other tunnel implementations. This enables devices using only L3 headers for ECMP to correctly balance VXLAN-encapsulated IPv6 packets. ``` $ ./ip/ip link add dummy1 type dummy $ ./ip/ip addr add 2001:db8::2/64 dev dummy1 $ ./ip/ip link set up dev dummy1 $ ./ip/ip link add vxlan1 type vxlan id 100 flowlabel inherit remote 2001:db8::1 local 2001:db8::2 $ ./ip/ip link set up dev vxlan1 $ ./ip/ip addr add 2001:db8:1::2/64 dev vxlan1 $ ./ip/ip link set arp off dev vxlan1 $ ping -q 2001:db8:1::1 & $ tshark -d udp.port==8472,vxlan -Vpni dummy1 -c1 [...] Internet Protocol Version 6, Src: 2001:db8::2, Dst: 2001:db8::1 0110 .... = Version: 6 .... 0000 0000 .... .... .... .... .... = Traffic Class: 0x00 (DSCP: CS0, ECN: Not-ECT) .... 0000 00.. .... .... .... .... .... = Differentiated Services Codepoint: Default (0) .... .... ..00 .... .... .... .... .... = Explicit Congestion Notification: Not ECN-Capable Transport (0) .... 1011 0001 1010 1111 1011 = Flow Label: 0xb1afb [...] Virtual eXtensible Local Area Network Flags: 0x0800, VXLAN Network ID (VNI) Group Policy ID: 0 VXLAN Network Identifier (VNI): 100 [...] Internet Protocol Version 6, Src: 2001:db8:1::2, Dst: 2001:db8:1::1 0110 .... = Version: 6 .... 0000 0000 .... .... .... .... .... = Traffic Class: 0x00 (DSCP: CS0, ECN: Not-ECT) .... 0000 00.. .... .... .... .... .... = Differentiated Services Codepoint: Default (0) .... .... ..00 .... .... .... .... .... = Explicit Congestion Notification: Not ECN-Capable Transport (0) .... 1011 0001 1010 1111 1011 = Flow Label: 0xb1afb ``` Signed-off-by: Alce Lafranque Co-developed-by: Vincent Bernat Signed-off-by: Vincent Bernat Reviewed-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 11 +++++++++++ include/net/vxlan.h | 33 +++++++++++++++++---------------- include/uapi/linux/if_link.h | 8 ++++++++ 3 files changed, 36 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index f346b4efbc30..2d746f4c9a0a 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -416,6 +416,17 @@ static inline u8 ip_tunnel_get_dsfield(const struct iphdr *iph, return 0; } +static inline __be32 ip_tunnel_get_flowlabel(const struct iphdr *iph, + const struct sk_buff *skb) +{ + __be16 payload_protocol = skb_protocol(skb, true); + + if (payload_protocol == htons(ETH_P_IPV6)) + return ip6_flowlabel((const struct ipv6hdr *)iph); + else + return 0; +} + static inline u8 ip_tunnel_get_ttl(const struct iphdr *iph, const struct sk_buff *skb) { diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 6a9f8a5f387c..33ba6fc151cf 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -210,22 +210,23 @@ struct vxlan_rdst { }; struct vxlan_config { - union vxlan_addr remote_ip; - union vxlan_addr saddr; - __be32 vni; - int remote_ifindex; - int mtu; - __be16 dst_port; - u16 port_min; - u16 port_max; - u8 tos; - u8 ttl; - __be32 label; - u32 flags; - unsigned long age_interval; - unsigned int addrmax; - bool no_share; - enum ifla_vxlan_df df; + union vxlan_addr remote_ip; + union vxlan_addr saddr; + __be32 vni; + int remote_ifindex; + int mtu; + __be16 dst_port; + u16 port_min; + u16 port_max; + u8 tos; + u8 ttl; + __be32 label; + enum ifla_vxlan_label_policy label_policy; + u32 flags; + unsigned long age_interval; + unsigned int addrmax; + bool no_share; + enum ifla_vxlan_df df; }; enum { diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 29ff80da2775..8181ef23a7a2 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -856,6 +856,7 @@ enum { IFLA_VXLAN_DF, IFLA_VXLAN_VNIFILTER, /* only applicable with COLLECT_METADATA mode */ IFLA_VXLAN_LOCALBYPASS, + IFLA_VXLAN_LABEL_POLICY, /* IPv6 flow label policy; ifla_vxlan_label_policy */ __IFLA_VXLAN_MAX }; #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) @@ -873,6 +874,13 @@ enum ifla_vxlan_df { VXLAN_DF_MAX = __VXLAN_DF_END - 1, }; +enum ifla_vxlan_label_policy { + VXLAN_LABEL_FIXED = 0, + VXLAN_LABEL_INHERIT = 1, + __VXLAN_LABEL_END, + VXLAN_LABEL_MAX = __VXLAN_LABEL_END - 1, +}; + /* GENEVE section */ enum { IFLA_GENEVE_UNSPEC, -- cgit v1.2.3 From 96fa96e198f9707285003075fbbce7db6a485112 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Wed, 15 Nov 2023 11:39:18 +0000 Subject: net: linkmode: add linkmode_fill() helper Add a linkmode_fill() helper, which will allow us to convert phylink's open coded bitmap_fill() operations. Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/linkmode.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/linkmode.h b/include/linux/linkmode.h index 7303b4bc2ce0..287f590ed56b 100644 --- a/include/linux/linkmode.h +++ b/include/linux/linkmode.h @@ -10,6 +10,11 @@ static inline void linkmode_zero(unsigned long *dst) bitmap_zero(dst, __ETHTOOL_LINK_MODE_MASK_NBITS); } +static inline void linkmode_fill(unsigned long *dst) +{ + bitmap_fill(dst, __ETHTOOL_LINK_MODE_MASK_NBITS); +} + static inline void linkmode_copy(unsigned long *dst, const unsigned long *src) { bitmap_copy(dst, src, __ETHTOOL_LINK_MODE_MASK_NBITS); -- cgit v1.2.3 From ff8867af01daa7ea770bebf5f91199b7434b74e5 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 17 Nov 2023 09:14:04 -0800 Subject: bpf: rename BPF_F_TEST_SANITY_STRICT to BPF_F_TEST_REG_INVARIANTS Rename verifier internal flag BPF_F_TEST_SANITY_STRICT to more neutral BPF_F_TEST_REG_INVARIANTS. This is a follow up to [0]. A few selftests and veristat need to be adjusted in the same patch as well. [0] https://patchwork.kernel.org/project/netdevbpf/patch/20231112010609.848406-5-andrii@kernel.org/ Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231117171404.225508-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 +- include/uapi/linux/bpf.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 402b6bc44a1b..52a4012b8255 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -602,7 +602,7 @@ struct bpf_verifier_env { int stack_size; /* number of states to be processed */ bool strict_alignment; /* perform strict pointer alignment checks */ bool test_state_freq; /* test verifier with different pruning frequency */ - bool test_sanity_strict; /* fail verification on sanity violations */ + bool test_reg_invariants; /* fail verification on register invariants violations */ struct bpf_verifier_state *cur_state; /* current verifier state */ struct bpf_verifier_state_list **explored_states; /* search pruning optimization */ struct bpf_verifier_state_list *free_list; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 8a5855fcee69..7a5498242eaa 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1201,7 +1201,7 @@ enum bpf_perf_event_type { #define BPF_F_XDP_DEV_BOUND_ONLY (1U << 6) /* The verifier internal test flag. Behavior is undefined */ -#define BPF_F_TEST_SANITY_STRICT (1U << 7) +#define BPF_F_TEST_REG_INVARIANTS (1U << 7) /* link_create.kprobe_multi.flags used in LINK_CREATE command for * BPF_TRACE_KPROBE_MULTI attach type to create return probe. -- cgit v1.2.3 From 446e2305827b76e8081057ce56bbd2703b4da8a9 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Tue, 14 Nov 2023 12:28:29 +0100 Subject: net: Convert PHYs hwtstamp callback to use kernel_hwtstamp_config The PHYs hwtstamp callback are still getting the timestamp config from ifreq and using copy_from/to_user. Get rid of these functions by using timestamp configuration in parameter. This also allow to move on to kernel_hwtstamp_config and be similar to net devices using the new ndo_hwstamp_get/set. This adds the possibility to manipulate the timestamp configuration from the kernel which was not possible with the copy_from/to_user. Signed-off-by: Kory Maincent Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/mii_timestamper.h | 4 +++- include/linux/phy.h | 6 ++++-- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mii_timestamper.h b/include/linux/mii_timestamper.h index fa940bbaf8ae..26b04f73f214 100644 --- a/include/linux/mii_timestamper.h +++ b/include/linux/mii_timestamper.h @@ -9,6 +9,7 @@ #include #include #include +#include struct phy_device; @@ -51,7 +52,8 @@ struct mii_timestamper { struct sk_buff *skb, int type); int (*hwtstamp)(struct mii_timestamper *mii_ts, - struct ifreq *ifreq); + struct kernel_hwtstamp_config *kernel_config, + struct netlink_ext_ack *extack); void (*link_state)(struct mii_timestamper *mii_ts, struct phy_device *phydev); diff --git a/include/linux/phy.h b/include/linux/phy.h index 3cc52826f18e..e5f1f41e399c 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1560,9 +1560,11 @@ static inline bool phy_has_txtstamp(struct phy_device *phydev) return phydev && phydev->mii_ts && phydev->mii_ts->txtstamp; } -static inline int phy_hwtstamp(struct phy_device *phydev, struct ifreq *ifr) +static inline int phy_hwtstamp(struct phy_device *phydev, + struct kernel_hwtstamp_config *cfg, + struct netlink_ext_ack *extack) { - return phydev->mii_ts->hwtstamp(phydev->mii_ts, ifr); + return phydev->mii_ts->hwtstamp(phydev->mii_ts, cfg, extack); } static inline bool phy_rxtstamp(struct phy_device *phydev, struct sk_buff *skb, -- cgit v1.2.3 From b8768dc4077712915f045ba1b198f521493c7914 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Tue, 14 Nov 2023 12:28:31 +0100 Subject: net: ethtool: Refactor identical get_ts_info implementations. The vlan, macvlan and the bonding drivers call their "real" device driver in order to report the time stamping capabilities. Provide a core ethtool helper function to avoid copy/paste in the stack. Signed-off-by: Richard Cochran Signed-off-by: Kory Maincent Reviewed-by: Florian Fainelli Reviewed-by: Jay Vosburgh Signed-off-by: David S. Miller --- include/linux/ethtool.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 689028257fcc..c2bb74143eda 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -1043,6 +1043,14 @@ static inline int ethtool_mm_frag_size_min_to_add(u32 val_min, u32 *val_add, return -EINVAL; } +/** + * ethtool_get_ts_info_by_layer - Obtains time stamping capabilities from the MAC or PHY layer. + * @dev: pointer to net_device structure + * @info: buffer to hold the result + * Returns zero on success, non-zero otherwise. + */ +int ethtool_get_ts_info_by_layer(struct net_device *dev, struct ethtool_ts_info *info); + /** * ethtool_sprintf - Write formatted string to ethtool string data * @data: Pointer to a pointer to the start of string to update -- cgit v1.2.3 From 011dd3b3f83f9c89605c640424e05845b84f2dad Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Tue, 14 Nov 2023 12:28:33 +0100 Subject: net: Make dev_set_hwtstamp_phylib accessible Make the dev_set_hwtstamp_phylib function accessible in prevision to use it from ethtool to reset the tstamp current configuration. Reviewed-by: Florian Fainelli Signed-off-by: Kory Maincent Signed-off-by: David S. Miller --- include/linux/netdevice.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a16c9cc063fe..2d840d7056f2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3942,6 +3942,9 @@ int generic_hwtstamp_get_lower(struct net_device *dev, int generic_hwtstamp_set_lower(struct net_device *dev, struct kernel_hwtstamp_config *kernel_cfg, struct netlink_ext_ack *extack); +int dev_set_hwtstamp_phylib(struct net_device *dev, + struct kernel_hwtstamp_config *cfg, + struct netlink_ext_ack *extack); int dev_ethtool(struct net *net, struct ifreq *ifr, void __user *userdata); unsigned int dev_get_flags(const struct net_device *); int __dev_change_flags(struct net_device *dev, unsigned int flags, -- cgit v1.2.3 From acec05fb78abb74fdab2195bfca9a6d38a732643 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Tue, 14 Nov 2023 12:28:35 +0100 Subject: net_tstamp: Add TIMESTAMPING SOFTWARE and HARDWARE mask Timestamping software or hardware flags are often used as a group, therefore adding these masks will easier future use. I did not use SOF_TIMESTAMPING_SYS_HARDWARE flag as it is deprecated and not use at all. Signed-off-by: Kory Maincent Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/uapi/linux/net_tstamp.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index a2c66b3d7f0f..df8091998c8d 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -48,6 +48,14 @@ enum { SOF_TIMESTAMPING_TX_SCHED | \ SOF_TIMESTAMPING_TX_ACK) +#define SOF_TIMESTAMPING_SOFTWARE_MASK (SOF_TIMESTAMPING_RX_SOFTWARE | \ + SOF_TIMESTAMPING_TX_SOFTWARE | \ + SOF_TIMESTAMPING_SOFTWARE) + +#define SOF_TIMESTAMPING_HARDWARE_MASK (SOF_TIMESTAMPING_RX_HARDWARE | \ + SOF_TIMESTAMPING_TX_HARDWARE | \ + SOF_TIMESTAMPING_RAW_HARDWARE) + /** * struct so_timestamping - SO_TIMESTAMPING parameter * -- cgit v1.2.3 From 11d55be06df0aedf19b05ab61c2d26b31a3c7e64 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Tue, 14 Nov 2023 12:28:36 +0100 Subject: net: ethtool: Add a command to expose current time stamping layer Time stamping on network packets may happen either in the MAC or in the PHY, but not both. In preparation for making the choice selectable, expose both the current layers via ethtool. In accordance with the kernel implementation as it stands, the current layer will always read as "phy" when a PHY time stamping device is present. Future patches will allow changing the current layer administratively. Signed-off-by: Kory Maincent Signed-off-by: David S. Miller --- include/uapi/linux/ethtool_netlink.h | 14 ++++++++++++++ include/uapi/linux/net_tstamp.h | 10 ++++++++++ 2 files changed, 24 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 73e2c10dc2cc..cb51136328cf 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -57,6 +57,7 @@ enum { ETHTOOL_MSG_PLCA_GET_STATUS, ETHTOOL_MSG_MM_GET, ETHTOOL_MSG_MM_SET, + ETHTOOL_MSG_TS_GET, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, @@ -109,6 +110,7 @@ enum { ETHTOOL_MSG_PLCA_NTF, ETHTOOL_MSG_MM_GET_REPLY, ETHTOOL_MSG_MM_NTF, + ETHTOOL_MSG_TS_GET_REPLY, /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, @@ -975,6 +977,18 @@ enum { ETHTOOL_A_MM_MAX = (__ETHTOOL_A_MM_CNT - 1) }; +/* TS LAYER */ + +enum { + ETHTOOL_A_TS_UNSPEC, + ETHTOOL_A_TS_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_TS_LAYER, /* u32 */ + + /* add new constants above here */ + __ETHTOOL_A_TS_CNT, + ETHTOOL_A_TS_MAX = (__ETHTOOL_A_TS_CNT - 1) +}; + /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index df8091998c8d..4551fb3d7720 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -13,6 +13,16 @@ #include #include /* for SO_TIMESTAMPING */ +/* Layer of the TIMESTAMPING provider */ +enum timestamping_layer { + NO_TIMESTAMPING, + SOFTWARE_TIMESTAMPING, + MAC_TIMESTAMPING, + PHY_TIMESTAMPING, + + __TIMESTAMPING_COUNT, +}; + /* SO_TIMESTAMPING flags */ enum { SOF_TIMESTAMPING_TX_HARDWARE = (1<<0), -- cgit v1.2.3 From d905f9c753295ee5a30af265f4b724f10050e7d3 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Tue, 14 Nov 2023 12:28:38 +0100 Subject: net: ethtool: Add a command to list available time stamping layers Introduce a new netlink message that lists all available time stamping layers on a given interface. Signed-off-by: Kory Maincent Signed-off-by: David S. Miller --- include/uapi/linux/ethtool_netlink.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index cb51136328cf..62b885d44d06 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -58,6 +58,7 @@ enum { ETHTOOL_MSG_MM_GET, ETHTOOL_MSG_MM_SET, ETHTOOL_MSG_TS_GET, + ETHTOOL_MSG_TS_LIST_GET, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, @@ -111,6 +112,7 @@ enum { ETHTOOL_MSG_MM_GET_REPLY, ETHTOOL_MSG_MM_NTF, ETHTOOL_MSG_TS_GET_REPLY, + ETHTOOL_MSG_TS_LIST_GET_REPLY, /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, @@ -989,6 +991,18 @@ enum { ETHTOOL_A_TS_MAX = (__ETHTOOL_A_TS_CNT - 1) }; +/* TS LIST LAYER */ + +enum { + ETHTOOL_A_TS_LIST_UNSPEC, + ETHTOOL_A_TS_LIST_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_TS_LIST_LAYER, /* array, u32 */ + + /* add new constants above here */ + __ETHTOOL_A_TS_LIST_CNT, + ETHTOOL_A_TS_LIST_MAX = (__ETHTOOL_A_TS_LIST_CNT - 1) +}; + /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 -- cgit v1.2.3 From 51bdf3165f012827644c474a6d905baa3de3f1ea Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Tue, 14 Nov 2023 12:28:40 +0100 Subject: net: Replace hwtstamp_source by timestamping layer Replace hwtstamp_source which is only used by the kernel_hwtstamp_config structure by the more widely use timestamp_layer structure. This is done to prepare the support of selectable timestamping source. Signed-off-by: Kory Maincent Signed-off-by: David S. Miller --- include/linux/net_tstamp.h | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/net_tstamp.h b/include/linux/net_tstamp.h index eb01c37e71e0..bb289c2ad376 100644 --- a/include/linux/net_tstamp.h +++ b/include/linux/net_tstamp.h @@ -5,11 +5,6 @@ #include -enum hwtstamp_source { - HWTSTAMP_SOURCE_NETDEV, - HWTSTAMP_SOURCE_PHYLIB, -}; - /** * struct kernel_hwtstamp_config - Kernel copy of struct hwtstamp_config * @@ -20,8 +15,8 @@ enum hwtstamp_source { * a legacy implementation of a lower driver * @copied_to_user: request was passed to a legacy implementation which already * copied the ioctl request back to user space - * @source: indication whether timestamps should come from the netdev or from - * an attached phylib PHY + * @source: indication whether timestamps should come from software, the netdev + * or from an attached phylib PHY * * Prefer using this structure for in-kernel processing of hardware * timestamping configuration, over the inextensible struct hwtstamp_config @@ -33,7 +28,7 @@ struct kernel_hwtstamp_config { int rx_filter; struct ifreq *ifr; bool copied_to_user; - enum hwtstamp_source source; + enum timestamping_layer source; }; static inline void hwtstamp_config_to_kernel(struct kernel_hwtstamp_config *kernel_cfg, -- cgit v1.2.3 From 0f7f463d4821a4f52fa5c0a961389e651d50c384 Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Tue, 14 Nov 2023 12:28:41 +0100 Subject: net: Change the API of PHY default timestamp to MAC Change the API to select MAC default time stamping instead of the PHY. Indeed the PHY is closer to the wire therefore theoretically it has less delay than the MAC timestamping but the reality is different. Due to lower time stamping clock frequency, latency in the MDIO bus and no PHC hardware synchronization between different PHY, the PHY PTP is often less precise than the MAC. The exception is for PHY designed specially for PTP case but these devices are not very widespread. For not breaking the compatibility I introduce a default_timestamp flag in phy_device that is set by the phy driver to know we are using the old API behavior. The phy_set_timestamp function is called at each call of phy_attach_direct. In case of MAC driver using phylink this function is called when the interface is turned up. Then if the interface goes down and up again the last choice of timestamp will be overwritten by the default choice. A solution could be to cache the timestamp status but it can bring other issues. In case of SFP, if we change the module, it doesn't make sense to blindly re-set the timestamp back to PHY, if the new module has a PHY with mediocre timestamping capabilities. Signed-off-by: Kory Maincent Signed-off-by: David S. Miller --- include/linux/netdevice.h | 5 +++++ include/linux/phy.h | 4 ++++ 2 files changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2d840d7056f2..f020d2790c12 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -47,6 +47,7 @@ #include #include #include +#include #include #include #include @@ -2074,6 +2075,8 @@ enum netdev_ml_priv_type { * * @dpll_pin: Pointer to the SyncE source pin of a DPLL subsystem, * where the clock is recovered. + * @ts_layer: Tracks which network device + * performs packet time stamping. * * FIXME: cleanup struct net_device such that network protocol info * moves out. @@ -2435,6 +2438,8 @@ struct net_device { #if IS_ENABLED(CONFIG_DPLL) struct dpll_pin *dpll_pin; #endif + + enum timestamping_layer ts_layer; }; #define to_net_dev(d) container_of(d, struct net_device, dev) diff --git a/include/linux/phy.h b/include/linux/phy.h index e5f1f41e399c..317def2a7843 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -604,6 +604,8 @@ struct macsec_ops; * handling shall be postponed until PHY has resumed * @irq_rerun: Flag indicating interrupts occurred while PHY was suspended, * requiring a rerun of the interrupt handler after resume + * @default_timestamp: Flag indicating whether we are using the phy + * timestamp as the default one * @interface: enum phy_interface_t value * @skb: Netlink message for cable diagnostics * @nest: Netlink nest used for cable diagnostics @@ -667,6 +669,8 @@ struct phy_device { unsigned irq_suspended:1; unsigned irq_rerun:1; + unsigned default_timestamp:1; + int rate_matching; enum phy_state state; -- cgit v1.2.3 From 152c75e1d00200edc4da1beb67dd099a462ea86b Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Tue, 14 Nov 2023 12:28:43 +0100 Subject: net: ethtool: ts: Let the active time stamping layer be selectable Now that the current timestamp is saved in a variable lets add the ETHTOOL_MSG_TS_SET ethtool netlink socket to make it selectable. Signed-off-by: Kory Maincent Signed-off-by: David S. Miller --- include/uapi/linux/ethtool_netlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 62b885d44d06..df6c4fcc62c1 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -59,6 +59,7 @@ enum { ETHTOOL_MSG_MM_SET, ETHTOOL_MSG_TS_GET, ETHTOOL_MSG_TS_LIST_GET, + ETHTOOL_MSG_TS_SET, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, -- cgit v1.2.3 From db840d389bad60ce6f3aadc1079da13e7e993a16 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 17 Nov 2023 19:46:16 -0800 Subject: bpf: move verbose_linfo() into kernel/bpf/log.c verifier.c is huge. Let's try to move out parts that are logging-related into log.c, as we previously did with bpf_log() and other related stuff. This patch moves line info verbose output routines: it's pretty self-contained and isolated code, so there is no problem with this. Acked-by: Eduard Zingerman Acked-by: Stanislav Fomichev Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231118034623.3320920-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 52a4012b8255..d896f3db6a22 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -680,6 +680,10 @@ int bpf_vlog_init(struct bpf_verifier_log *log, u32 log_level, void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos); int bpf_vlog_finalize(struct bpf_verifier_log *log, u32 *log_size_actual); +__printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env, + u32 insn_off, + const char *prefix_fmt, ...); + static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env) { struct bpf_verifier_state *cur = env->cur_state; -- cgit v1.2.3 From 42feb6620accded89cad5f455665e21281813d79 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 17 Nov 2023 19:46:17 -0800 Subject: bpf: move verifier state printing code to kernel/bpf/log.c Move a good chunk of code from verifier.c to log.c: verifier state verbose printing logic. This is an important and very much logging/debugging oriented code. It fits the overlall log.c's focus on verifier logging, and moving it allows to keep growing it without unnecessarily adding to verifier.c code that otherwise contains a core verification logic. There are not many shared dependencies between this code and the rest of verifier.c code, except a few single-line helpers for various register type checks and a bit of state "scratching" helpers. We move all such trivial helpers into include/bpf/bpf_verifier.h as static inlines. No functional changes in this patch. Acked-by: Eduard Zingerman Acked-by: Stanislav Fomichev Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231118034623.3320920-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 72 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index d896f3db6a22..39edc76f436e 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -783,4 +783,76 @@ static inline bool bpf_type_has_unsafe_modifiers(u32 type) return type_flag(type) & ~BPF_REG_TRUSTED_MODIFIERS; } +static inline bool type_is_ptr_alloc_obj(u32 type) +{ + return base_type(type) == PTR_TO_BTF_ID && type_flag(type) & MEM_ALLOC; +} + +static inline bool type_is_non_owning_ref(u32 type) +{ + return type_is_ptr_alloc_obj(type) && type_flag(type) & NON_OWN_REF; +} + +static inline bool type_is_pkt_pointer(enum bpf_reg_type type) +{ + type = base_type(type); + return type == PTR_TO_PACKET || + type == PTR_TO_PACKET_META; +} + +static inline bool type_is_sk_pointer(enum bpf_reg_type type) +{ + return type == PTR_TO_SOCKET || + type == PTR_TO_SOCK_COMMON || + type == PTR_TO_TCP_SOCK || + type == PTR_TO_XDP_SOCK; +} + +static inline void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno) +{ + env->scratched_regs |= 1U << regno; +} + +static inline void mark_stack_slot_scratched(struct bpf_verifier_env *env, u32 spi) +{ + env->scratched_stack_slots |= 1ULL << spi; +} + +static inline bool reg_scratched(const struct bpf_verifier_env *env, u32 regno) +{ + return (env->scratched_regs >> regno) & 1; +} + +static inline bool stack_slot_scratched(const struct bpf_verifier_env *env, u64 regno) +{ + return (env->scratched_stack_slots >> regno) & 1; +} + +static inline bool verifier_state_scratched(const struct bpf_verifier_env *env) +{ + return env->scratched_regs || env->scratched_stack_slots; +} + +static inline void mark_verifier_state_clean(struct bpf_verifier_env *env) +{ + env->scratched_regs = 0U; + env->scratched_stack_slots = 0ULL; +} + +/* Used for printing the entire verifier state. */ +static inline void mark_verifier_state_scratched(struct bpf_verifier_env *env) +{ + env->scratched_regs = ~0U; + env->scratched_stack_slots = ~0ULL; +} + +const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type); +const char *dynptr_type_str(enum bpf_dynptr_type type); +const char *iter_type_str(const struct btf *btf, u32 btf_id); +const char *iter_state_str(enum bpf_iter_state state); + +void print_verifier_state(struct bpf_verifier_env *env, + const struct bpf_func_state *state, bool print_all); +void print_insn_state(struct bpf_verifier_env *env, const struct bpf_func_state *state); + #endif /* _LINUX_BPF_VERIFIER_H */ -- cgit v1.2.3 From 289354f21b2c3fac93e956efd45f256a88a4d997 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 18 Nov 2023 18:38:05 -0800 Subject: net: partial revert of the "Make timestamping selectable: series Revert following commits: commit acec05fb78ab ("net_tstamp: Add TIMESTAMPING SOFTWARE and HARDWARE mask") commit 11d55be06df0 ("net: ethtool: Add a command to expose current time stamping layer") commit bb8645b00ced ("netlink: specs: Introduce new netlink command to get current timestamp") commit d905f9c75329 ("net: ethtool: Add a command to list available time stamping layers") commit aed5004ee7a0 ("netlink: specs: Introduce new netlink command to list available time stamping layers") commit 51bdf3165f01 ("net: Replace hwtstamp_source by timestamping layer") commit 0f7f463d4821 ("net: Change the API of PHY default timestamp to MAC") commit 091fab122869 ("net: ethtool: ts: Update GET_TS to reply the current selected timestamp") commit 152c75e1d002 ("net: ethtool: ts: Let the active time stamping layer be selectable") commit ee60ea6be0d3 ("netlink: specs: Introduce time stamping set command") They need more time for reviews. Link: https://lore.kernel.org/all/20231118183529.6e67100c@kernel.org/ Signed-off-by: Jakub Kicinski --- include/linux/net_tstamp.h | 11 ++++++++--- include/linux/netdevice.h | 5 ----- include/linux/phy.h | 4 ---- include/uapi/linux/ethtool_netlink.h | 29 ----------------------------- include/uapi/linux/net_tstamp.h | 18 ------------------ 5 files changed, 8 insertions(+), 59 deletions(-) (limited to 'include') diff --git a/include/linux/net_tstamp.h b/include/linux/net_tstamp.h index bb289c2ad376..eb01c37e71e0 100644 --- a/include/linux/net_tstamp.h +++ b/include/linux/net_tstamp.h @@ -5,6 +5,11 @@ #include +enum hwtstamp_source { + HWTSTAMP_SOURCE_NETDEV, + HWTSTAMP_SOURCE_PHYLIB, +}; + /** * struct kernel_hwtstamp_config - Kernel copy of struct hwtstamp_config * @@ -15,8 +20,8 @@ * a legacy implementation of a lower driver * @copied_to_user: request was passed to a legacy implementation which already * copied the ioctl request back to user space - * @source: indication whether timestamps should come from software, the netdev - * or from an attached phylib PHY + * @source: indication whether timestamps should come from the netdev or from + * an attached phylib PHY * * Prefer using this structure for in-kernel processing of hardware * timestamping configuration, over the inextensible struct hwtstamp_config @@ -28,7 +33,7 @@ struct kernel_hwtstamp_config { int rx_filter; struct ifreq *ifr; bool copied_to_user; - enum timestamping_layer source; + enum hwtstamp_source source; }; static inline void hwtstamp_config_to_kernel(struct kernel_hwtstamp_config *kernel_cfg, diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f020d2790c12..2d840d7056f2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -47,7 +47,6 @@ #include #include #include -#include #include #include #include @@ -2075,8 +2074,6 @@ enum netdev_ml_priv_type { * * @dpll_pin: Pointer to the SyncE source pin of a DPLL subsystem, * where the clock is recovered. - * @ts_layer: Tracks which network device - * performs packet time stamping. * * FIXME: cleanup struct net_device such that network protocol info * moves out. @@ -2438,8 +2435,6 @@ struct net_device { #if IS_ENABLED(CONFIG_DPLL) struct dpll_pin *dpll_pin; #endif - - enum timestamping_layer ts_layer; }; #define to_net_dev(d) container_of(d, struct net_device, dev) diff --git a/include/linux/phy.h b/include/linux/phy.h index 317def2a7843..e5f1f41e399c 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -604,8 +604,6 @@ struct macsec_ops; * handling shall be postponed until PHY has resumed * @irq_rerun: Flag indicating interrupts occurred while PHY was suspended, * requiring a rerun of the interrupt handler after resume - * @default_timestamp: Flag indicating whether we are using the phy - * timestamp as the default one * @interface: enum phy_interface_t value * @skb: Netlink message for cable diagnostics * @nest: Netlink nest used for cable diagnostics @@ -669,8 +667,6 @@ struct phy_device { unsigned irq_suspended:1; unsigned irq_rerun:1; - unsigned default_timestamp:1; - int rate_matching; enum phy_state state; diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index df6c4fcc62c1..73e2c10dc2cc 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -57,9 +57,6 @@ enum { ETHTOOL_MSG_PLCA_GET_STATUS, ETHTOOL_MSG_MM_GET, ETHTOOL_MSG_MM_SET, - ETHTOOL_MSG_TS_GET, - ETHTOOL_MSG_TS_LIST_GET, - ETHTOOL_MSG_TS_SET, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, @@ -112,8 +109,6 @@ enum { ETHTOOL_MSG_PLCA_NTF, ETHTOOL_MSG_MM_GET_REPLY, ETHTOOL_MSG_MM_NTF, - ETHTOOL_MSG_TS_GET_REPLY, - ETHTOOL_MSG_TS_LIST_GET_REPLY, /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, @@ -980,30 +975,6 @@ enum { ETHTOOL_A_MM_MAX = (__ETHTOOL_A_MM_CNT - 1) }; -/* TS LAYER */ - -enum { - ETHTOOL_A_TS_UNSPEC, - ETHTOOL_A_TS_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_TS_LAYER, /* u32 */ - - /* add new constants above here */ - __ETHTOOL_A_TS_CNT, - ETHTOOL_A_TS_MAX = (__ETHTOOL_A_TS_CNT - 1) -}; - -/* TS LIST LAYER */ - -enum { - ETHTOOL_A_TS_LIST_UNSPEC, - ETHTOOL_A_TS_LIST_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_TS_LIST_LAYER, /* array, u32 */ - - /* add new constants above here */ - __ETHTOOL_A_TS_LIST_CNT, - ETHTOOL_A_TS_LIST_MAX = (__ETHTOOL_A_TS_LIST_CNT - 1) -}; - /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index 4551fb3d7720..a2c66b3d7f0f 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -13,16 +13,6 @@ #include #include /* for SO_TIMESTAMPING */ -/* Layer of the TIMESTAMPING provider */ -enum timestamping_layer { - NO_TIMESTAMPING, - SOFTWARE_TIMESTAMPING, - MAC_TIMESTAMPING, - PHY_TIMESTAMPING, - - __TIMESTAMPING_COUNT, -}; - /* SO_TIMESTAMPING flags */ enum { SOF_TIMESTAMPING_TX_HARDWARE = (1<<0), @@ -58,14 +48,6 @@ enum { SOF_TIMESTAMPING_TX_SCHED | \ SOF_TIMESTAMPING_TX_ACK) -#define SOF_TIMESTAMPING_SOFTWARE_MASK (SOF_TIMESTAMPING_RX_SOFTWARE | \ - SOF_TIMESTAMPING_TX_SOFTWARE | \ - SOF_TIMESTAMPING_SOFTWARE) - -#define SOF_TIMESTAMPING_HARDWARE_MASK (SOF_TIMESTAMPING_RX_HARDWARE | \ - SOF_TIMESTAMPING_TX_HARDWARE | \ - SOF_TIMESTAMPING_RAW_HARDWARE) - /** * struct so_timestamping - SO_TIMESTAMPING parameter * -- cgit v1.2.3 From ac40916a3f7243efbe6e129ebf495b5c33a3adfe Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Wed, 15 Nov 2023 20:01:08 +0800 Subject: rtnetlink: introduce nlmsg_new_large and use it in rtnl_getlink if a PF has 256 or more VFs, ip link command will allocate an order 3 memory or more, and maybe trigger OOM due to memory fragment, the VFs needed memory size is computed in rtnl_vfinfo_size. so introduce nlmsg_new_large which calls netlink_alloc_large_skb in which vmalloc is used for large memory, to avoid the failure of allocating memory ip invoked oom-killer: gfp_mask=0xc2cc0(GFP_KERNEL|__GFP_NOWARN|\ __GFP_COMP|__GFP_NOMEMALLOC), order=3, oom_score_adj=0 CPU: 74 PID: 204414 Comm: ip Kdump: loaded Tainted: P OE Call Trace: dump_stack+0x57/0x6a dump_header+0x4a/0x210 oom_kill_process+0xe4/0x140 out_of_memory+0x3e8/0x790 __alloc_pages_slowpath.constprop.116+0x953/0xc50 __alloc_pages_nodemask+0x2af/0x310 kmalloc_large_node+0x38/0xf0 __kmalloc_node_track_caller+0x417/0x4d0 __kmalloc_reserve.isra.61+0x2e/0x80 __alloc_skb+0x82/0x1c0 rtnl_getlink+0x24f/0x370 rtnetlink_rcv_msg+0x12c/0x350 netlink_rcv_skb+0x50/0x100 netlink_unicast+0x1b2/0x280 netlink_sendmsg+0x355/0x4a0 sock_sendmsg+0x5b/0x60 ____sys_sendmsg+0x1ea/0x250 ___sys_sendmsg+0x88/0xd0 __sys_sendmsg+0x5e/0xa0 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f95a65a5b70 Cc: Yunsheng Lin Signed-off-by: Li RongQing Link: https://lore.kernel.org/r/20231115120108.3711-1-lirongqing@baidu.com Signed-off-by: Jakub Kicinski --- include/linux/netlink.h | 1 + include/net/netlink.h | 14 ++++++++++++++ 2 files changed, 15 insertions(+) (limited to 'include') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 75d7de34c908..abe91ed6b9aa 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -351,5 +351,6 @@ bool netlink_ns_capable(const struct sk_buff *skb, struct user_namespace *ns, int cap); bool netlink_capable(const struct sk_buff *skb, int cap); bool netlink_net_capable(const struct sk_buff *skb, int cap); +struct sk_buff *netlink_alloc_large_skb(unsigned int size, int broadcast); #endif /* __LINUX_NETLINK_H */ diff --git a/include/net/netlink.h b/include/net/netlink.h index 83bdf787aeee..167b91348e57 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -1010,6 +1010,20 @@ static inline struct sk_buff *nlmsg_new(size_t payload, gfp_t flags) return alloc_skb(nlmsg_total_size(payload), flags); } +/** + * nlmsg_new_large - Allocate a new netlink message with non-contiguous + * physical memory + * @payload: size of the message payload + * + * The allocated skb is unable to have frag page for shinfo->frags*, + * as the NULL setting for skb->head in netlink_skb_destructor() will + * bypass most of the handling in skb_release_data() + */ +static inline struct sk_buff *nlmsg_new_large(size_t payload) +{ + return netlink_alloc_large_skb(nlmsg_total_size(payload), 0); +} + /** * nlmsg_end - Finalize a netlink message * @skb: socket buffer the message is stored in -- cgit v1.2.3 From 2e7ed75e92fc493ff5484f61aed6489262c78f3e Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Wed, 27 Sep 2023 20:12:05 +0200 Subject: ieee802154: Internal PAN management Introduce structures to describe peer devices in a PAN as well as a few related helpers. We basically care about: - Our unique parent after associating with a coordinator. - Peer devices, children, which successfully associated with us. Signed-off-by: Miquel Raynal Acked-by: Stefan Schmidt Acked-by: Alexander Aring Link: https://lore.kernel.org/linux-wpan/20230927181214.129346-3-miquel.raynal@bootlin.com --- include/net/cfg802154.h | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) (limited to 'include') diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h index f79ce133e51a..a89f1c9cea3f 100644 --- a/include/net/cfg802154.h +++ b/include/net/cfg802154.h @@ -303,6 +303,22 @@ struct ieee802154_coord_desc { bool gts_permit; }; +/** + * struct ieee802154_pan_device - PAN device information + * @pan_id: the PAN ID of this device + * @mode: the preferred mode to reach the device + * @short_addr: the short address of this device + * @extended_addr: the extended address of this device + * @node: the list node + */ +struct ieee802154_pan_device { + __le16 pan_id; + u8 mode; + __le16 short_addr; + __le64 extended_addr; + struct list_head node; +}; + /** * struct cfg802154_scan_request - Scan request * @@ -478,6 +494,11 @@ struct wpan_dev { /* fallback for acknowledgment bit setting */ bool ackreq; + + /* Associations */ + struct mutex association_lock; + struct ieee802154_pan_device *parent; + struct list_head children; }; #define to_phy(_dev) container_of(_dev, struct wpan_phy, dev) @@ -529,4 +550,30 @@ static inline const char *wpan_phy_name(struct wpan_phy *phy) void ieee802154_configure_durations(struct wpan_phy *phy, unsigned int page, unsigned int channel); +/** + * cfg802154_device_is_associated - Checks whether we are associated to any device + * @wpan_dev: the wpan device + * @return: true if we are associated + */ +bool cfg802154_device_is_associated(struct wpan_dev *wpan_dev); + +/** + * cfg802154_device_is_parent - Checks if a device is our coordinator + * @wpan_dev: the wpan device + * @target: the expected parent + * @return: true if @target is our coordinator + */ +bool cfg802154_device_is_parent(struct wpan_dev *wpan_dev, + struct ieee802154_addr *target); + +/** + * cfg802154_device_is_child - Checks whether a device is associated to us + * @wpan_dev: the wpan device + * @target: the expected child + * @return: the PAN device + */ +struct ieee802154_pan_device * +cfg802154_device_is_child(struct wpan_dev *wpan_dev, + struct ieee802154_addr *target); + #endif /* __NET_CFG802154_H */ -- cgit v1.2.3 From 05db59a0619969a47ab87050985344177c662cab Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Wed, 27 Sep 2023 20:12:06 +0200 Subject: ieee802154: Add support for user association requests Users may decide to associate with a peer, which becomes our parent coordinator. Let's add the necessary netlink support for this. Signed-off-by: Miquel Raynal Acked-by: Stefan Schmidt Acked-by: Alexander Aring Link: https://lore.kernel.org/linux-wpan/20230927181214.129346-4-miquel.raynal@bootlin.com --- include/net/cfg802154.h | 4 ++++ include/net/ieee802154_netdev.h | 38 ++++++++++++++++++++++++++++++++++++++ include/net/nl802154.h | 1 + 3 files changed, 43 insertions(+) (limited to 'include') diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h index a89f1c9cea3f..d0c033176220 100644 --- a/include/net/cfg802154.h +++ b/include/net/cfg802154.h @@ -20,6 +20,7 @@ struct wpan_phy; struct wpan_phy_cca; struct cfg802154_scan_request; struct cfg802154_beacon_request; +struct ieee802154_addr; #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL struct ieee802154_llsec_device_key; @@ -77,6 +78,9 @@ struct cfg802154_ops { struct cfg802154_beacon_request *request); int (*stop_beacons)(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev); + int (*associate)(struct wpan_phy *wpan_phy, + struct wpan_dev *wpan_dev, + struct ieee802154_addr *coord); #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL void (*get_llsec_table)(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, diff --git a/include/net/ieee802154_netdev.h b/include/net/ieee802154_netdev.h index 063313df447d..ca8c827d0d7f 100644 --- a/include/net/ieee802154_netdev.h +++ b/include/net/ieee802154_netdev.h @@ -125,6 +125,30 @@ struct ieee802154_hdr_fc { #endif }; +struct ieee802154_assoc_req_pl { +#if defined(__LITTLE_ENDIAN_BITFIELD) + u8 reserved1:1, + device_type:1, + power_source:1, + rx_on_when_idle:1, + assoc_type:1, + reserved2:1, + security_cap:1, + alloc_addr:1; +#elif defined(__BIG_ENDIAN_BITFIELD) + u8 alloc_addr:1, + security_cap:1, + reserved2:1, + assoc_type:1, + rx_on_when_idle:1, + power_source:1, + device_type:1, + reserved1:1; +#else +#error "Please fix " +#endif +} __packed; + enum ieee802154_frame_version { IEEE802154_2003_STD, IEEE802154_2006_STD, @@ -140,6 +164,14 @@ enum ieee802154_addressing_mode { IEEE802154_EXTENDED_ADDRESSING, }; +enum ieee802154_association_status { + IEEE802154_ASSOCIATION_SUCCESSFUL = 0x00, + IEEE802154_PAN_AT_CAPACITY = 0x01, + IEEE802154_PAN_ACCESS_DENIED = 0x02, + IEEE802154_HOPPING_SEQUENCE_OFFSET_DUP = 0x03, + IEEE802154_FAST_ASSOCIATION_SUCCESSFUL = 0x80, +}; + struct ieee802154_hdr { struct ieee802154_hdr_fc fc; u8 seq; @@ -163,6 +195,12 @@ struct ieee802154_beacon_req_frame { struct ieee802154_mac_cmd_pl mac_pl; }; +struct ieee802154_association_req_frame { + struct ieee802154_hdr mhr; + struct ieee802154_mac_cmd_pl mac_pl; + struct ieee802154_assoc_req_pl assoc_req_pl; +}; + /* pushes hdr onto the skb. fields of hdr->fc that can be calculated from * the contents of hdr will be, and the actual value of those bits in * hdr->fc will be ignored. this includes the INTRA_PAN bit and the frame diff --git a/include/net/nl802154.h b/include/net/nl802154.h index 8cd9d141f5af..830e1c51d3df 100644 --- a/include/net/nl802154.h +++ b/include/net/nl802154.h @@ -78,6 +78,7 @@ enum nl802154_commands { NL802154_CMD_SCAN_DONE, NL802154_CMD_SEND_BEACONS, NL802154_CMD_STOP_BEACONS, + NL802154_CMD_ASSOCIATE, /* add new commands above here */ -- cgit v1.2.3 From fefd19807fe9c65002366c749e809996a1ca4e68 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Wed, 27 Sep 2023 20:12:07 +0200 Subject: mac802154: Handle associating Joining a PAN officially goes by associating with a coordinator. This coordinator may have been discovered thanks to the beacons it sent in the past. Add support to the MAC layer for these associations, which require: - Sending an association request - Receiving an association response The association response contains the association status, eventually a reason if the association was unsuccessful, and finally a short address that we should use for intra-PAN communication from now on, if we required one (which is the default, and not yet configurable). Signed-off-by: Miquel Raynal Acked-by: Stefan Schmidt Acked-by: Alexander Aring Link: https://lore.kernel.org/linux-wpan/20230927181214.129346-5-miquel.raynal@bootlin.com --- include/net/ieee802154_netdev.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/ieee802154_netdev.h b/include/net/ieee802154_netdev.h index ca8c827d0d7f..e26ffd079556 100644 --- a/include/net/ieee802154_netdev.h +++ b/include/net/ieee802154_netdev.h @@ -149,6 +149,11 @@ struct ieee802154_assoc_req_pl { #endif } __packed; +struct ieee802154_assoc_resp_pl { + __le16 short_addr; + u8 status; +} __packed; + enum ieee802154_frame_version { IEEE802154_2003_STD, IEEE802154_2006_STD, -- cgit v1.2.3 From 7b18313e84eb62c3e4071f9679480159d8da5107 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Wed, 27 Sep 2023 20:12:08 +0200 Subject: ieee802154: Add support for user disassociation requests A device may decide at some point to disassociate from a PAN, let's introduce a netlink command for this purpose. Signed-off-by: Miquel Raynal Acked-by: Stefan Schmidt Acked-by: Alexander Aring Link: https://lore.kernel.org/linux-wpan/20230927181214.129346-6-miquel.raynal@bootlin.com --- include/net/cfg802154.h | 3 +++ include/net/ieee802154_netdev.h | 11 +++++++++++ include/net/nl802154.h | 1 + 3 files changed, 15 insertions(+) (limited to 'include') diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h index d0c033176220..9b036ab20079 100644 --- a/include/net/cfg802154.h +++ b/include/net/cfg802154.h @@ -81,6 +81,9 @@ struct cfg802154_ops { int (*associate)(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, struct ieee802154_addr *coord); + int (*disassociate)(struct wpan_phy *wpan_phy, + struct wpan_dev *wpan_dev, + struct ieee802154_addr *target); #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL void (*get_llsec_table)(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, diff --git a/include/net/ieee802154_netdev.h b/include/net/ieee802154_netdev.h index e26ffd079556..16194356cfe7 100644 --- a/include/net/ieee802154_netdev.h +++ b/include/net/ieee802154_netdev.h @@ -177,6 +177,11 @@ enum ieee802154_association_status { IEEE802154_FAST_ASSOCIATION_SUCCESSFUL = 0x80, }; +enum ieee802154_disassociation_reason { + IEEE802154_COORD_WISHES_DEVICE_TO_LEAVE = 0x1, + IEEE802154_DEVICE_WISHES_TO_LEAVE = 0x2, +}; + struct ieee802154_hdr { struct ieee802154_hdr_fc fc; u8 seq; @@ -206,6 +211,12 @@ struct ieee802154_association_req_frame { struct ieee802154_assoc_req_pl assoc_req_pl; }; +struct ieee802154_disassociation_notif_frame { + struct ieee802154_hdr mhr; + struct ieee802154_mac_cmd_pl mac_pl; + u8 disassoc_pl; +}; + /* pushes hdr onto the skb. fields of hdr->fc that can be calculated from * the contents of hdr will be, and the actual value of those bits in * hdr->fc will be ignored. this includes the INTRA_PAN bit and the frame diff --git a/include/net/nl802154.h b/include/net/nl802154.h index 830e1c51d3df..8a47c14c72f0 100644 --- a/include/net/nl802154.h +++ b/include/net/nl802154.h @@ -79,6 +79,7 @@ enum nl802154_commands { NL802154_CMD_SEND_BEACONS, NL802154_CMD_STOP_BEACONS, NL802154_CMD_ASSOCIATE, + NL802154_CMD_DISASSOCIATE, /* add new commands above here */ -- cgit v1.2.3 From 601f160b61b2152ef84a663f856350d5dd9e752a Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Wed, 27 Sep 2023 20:12:10 +0200 Subject: mac802154: Handle association requests from peers Coordinators may have to handle association requests from peers which want to join the PAN. The logic involves: - Acknowledging the request (done by hardware) - If requested, a random short address that is free on this PAN should be chosen for the device. - Sending an association response with the short address allocated for the peer and expecting it to be ack'ed. If anything fails during this procedure, the peer is considered not associated. Signed-off-by: Miquel Raynal Acked-by: Stefan Schmidt Acked-by: Alexander Aring Link: https://lore.kernel.org/linux-wpan/20230927181214.129346-8-miquel.raynal@bootlin.com --- include/net/cfg802154.h | 7 +++++++ include/net/ieee802154_netdev.h | 6 ++++++ 2 files changed, 13 insertions(+) (limited to 'include') diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h index 9b036ab20079..c844ae63bc04 100644 --- a/include/net/cfg802154.h +++ b/include/net/cfg802154.h @@ -583,4 +583,11 @@ struct ieee802154_pan_device * cfg802154_device_is_child(struct wpan_dev *wpan_dev, struct ieee802154_addr *target); +/** + * cfg802154_get_free_short_addr - Get a free address among the known devices + * @wpan_dev: the wpan device + * @return: a random short address expectedly unused on our PAN + */ +__le16 cfg802154_get_free_short_addr(struct wpan_dev *wpan_dev); + #endif /* __NET_CFG802154_H */ diff --git a/include/net/ieee802154_netdev.h b/include/net/ieee802154_netdev.h index 16194356cfe7..4de858f9929e 100644 --- a/include/net/ieee802154_netdev.h +++ b/include/net/ieee802154_netdev.h @@ -211,6 +211,12 @@ struct ieee802154_association_req_frame { struct ieee802154_assoc_req_pl assoc_req_pl; }; +struct ieee802154_association_resp_frame { + struct ieee802154_hdr mhr; + struct ieee802154_mac_cmd_pl mac_pl; + struct ieee802154_assoc_resp_pl assoc_resp_pl; +}; + struct ieee802154_disassociation_notif_frame { struct ieee802154_hdr mhr; struct ieee802154_mac_cmd_pl mac_pl; -- cgit v1.2.3 From ce93b9378c306e6bcc4e0bd817acf4195b4a0288 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Wed, 27 Sep 2023 20:12:11 +0200 Subject: ieee802154: Add support for limiting the number of associated devices Coordinators may refuse associations. We need a user input for that. Let's add a new netlink command which can provide a maximum number of devices we accept to associate with as a first step. Later, we could also forward the request to userspace and check whether the association should be accepted or not. Signed-off-by: Miquel Raynal Acked-by: Stefan Schmidt Acked-by: Alexander Aring Link: https://lore.kernel.org/linux-wpan/20230927181214.129346-9-miquel.raynal@bootlin.com --- include/net/cfg802154.h | 8 ++++++++ include/net/nl802154.h | 2 ++ 2 files changed, 10 insertions(+) (limited to 'include') diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h index c844ae63bc04..0d3e9af00198 100644 --- a/include/net/cfg802154.h +++ b/include/net/cfg802154.h @@ -506,6 +506,7 @@ struct wpan_dev { struct mutex association_lock; struct ieee802154_pan_device *parent; struct list_head children; + unsigned int max_associations; }; #define to_phy(_dev) container_of(_dev, struct wpan_phy, dev) @@ -583,6 +584,13 @@ struct ieee802154_pan_device * cfg802154_device_is_child(struct wpan_dev *wpan_dev, struct ieee802154_addr *target); +/** + * cfg802154_set_max_associations - Limit the number of future associations + * @wpan_dev: the wpan device + * @max: the maximum number of devices we accept to associate + */ +void cfg802154_set_max_associations(struct wpan_dev *wpan_dev, unsigned int max); + /** * cfg802154_get_free_short_addr - Get a free address among the known devices * @wpan_dev: the wpan device diff --git a/include/net/nl802154.h b/include/net/nl802154.h index 8a47c14c72f0..8b26faae49e8 100644 --- a/include/net/nl802154.h +++ b/include/net/nl802154.h @@ -80,6 +80,7 @@ enum nl802154_commands { NL802154_CMD_STOP_BEACONS, NL802154_CMD_ASSOCIATE, NL802154_CMD_DISASSOCIATE, + NL802154_CMD_SET_MAX_ASSOCIATIONS, /* add new commands above here */ @@ -149,6 +150,7 @@ enum nl802154_attrs { NL802154_ATTR_SCAN_DURATION, NL802154_ATTR_SCAN_DONE_REASON, NL802154_ATTR_BEACON_INTERVAL, + NL802154_ATTR_MAX_ASSOCIATIONS, /* add attributes here, update the policy in nl802154.c */ -- cgit v1.2.3 From 80f8bf9a2a7f603662e08f7663643a58087a2cd4 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Wed, 27 Sep 2023 20:12:12 +0200 Subject: mac802154: Follow the number of associated devices Track the count of associated devices. Limit the number of associations using the value provided by the user if any. If we reach the maximum number of associations, we tell the device we are at capacity. If the user do not want to accept any more associations, it may specify the value 0 to the maximum number of associations, which will lead to an access denied error status returned to the peers trying to associate. Signed-off-by: Miquel Raynal Acked-by: Stefan Schmidt Acked-by: Alexander Aring Link: https://lore.kernel.org/linux-wpan/20230927181214.129346-10-miquel.raynal@bootlin.com --- include/net/cfg802154.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h index 0d3e9af00198..a64bbcd71f10 100644 --- a/include/net/cfg802154.h +++ b/include/net/cfg802154.h @@ -507,6 +507,7 @@ struct wpan_dev { struct ieee802154_pan_device *parent; struct list_head children; unsigned int max_associations; + unsigned int nchildren; }; #define to_phy(_dev) container_of(_dev, struct wpan_phy, dev) -- cgit v1.2.3 From 83fcf26b00d77e4a0ec920524fe85350a27e9c05 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Wed, 27 Sep 2023 20:12:14 +0200 Subject: ieee802154: Give the user the association list Upon request, we must be able to provide to the user the list of associations currently in place. Let's add a new netlink command and attribute for this purpose. Signed-off-by: Miquel Raynal Acked-by: Stefan Schmidt Acked-by: Alexander Aring Link: https://lore.kernel.org/linux-wpan/20230927181214.129346-12-miquel.raynal@bootlin.com --- include/net/nl802154.h | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/nl802154.h b/include/net/nl802154.h index 8b26faae49e8..4c752f799957 100644 --- a/include/net/nl802154.h +++ b/include/net/nl802154.h @@ -81,6 +81,7 @@ enum nl802154_commands { NL802154_CMD_ASSOCIATE, NL802154_CMD_DISASSOCIATE, NL802154_CMD_SET_MAX_ASSOCIATIONS, + NL802154_CMD_LIST_ASSOCIATIONS, /* add new commands above here */ @@ -151,6 +152,7 @@ enum nl802154_attrs { NL802154_ATTR_SCAN_DONE_REASON, NL802154_ATTR_BEACON_INTERVAL, NL802154_ATTR_MAX_ASSOCIATIONS, + NL802154_ATTR_PEER, /* add attributes here, update the policy in nl802154.c */ @@ -389,8 +391,6 @@ enum nl802154_supported_bool_states { NL802154_SUPPORTED_BOOL_MAX = __NL802154_SUPPORTED_BOOL_AFTER_LAST - 1 }; -#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL - enum nl802154_dev_addr_modes { NL802154_DEV_ADDR_NONE, __NL802154_DEV_ADDR_INVALID, @@ -410,12 +410,26 @@ enum nl802154_dev_addr_attrs { NL802154_DEV_ADDR_ATTR_SHORT, NL802154_DEV_ADDR_ATTR_EXTENDED, NL802154_DEV_ADDR_ATTR_PAD, + NL802154_DEV_ADDR_ATTR_PEER_TYPE, /* keep last */ __NL802154_DEV_ADDR_ATTR_AFTER_LAST, NL802154_DEV_ADDR_ATTR_MAX = __NL802154_DEV_ADDR_ATTR_AFTER_LAST - 1 }; +enum nl802154_peer_type { + NL802154_PEER_TYPE_UNSPEC, + + NL802154_PEER_TYPE_PARENT, + NL802154_PEER_TYPE_CHILD, + + /* keep last */ + __NL802154_PEER_TYPE_AFTER_LAST, + NL802154_PEER_TYPE_MAX = __NL802154_PEER_TYPE_AFTER_LAST - 1 +}; + +#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL + enum nl802154_key_id_modes { NL802154_KEY_ID_MODE_IMPLICIT, NL802154_KEY_ID_MODE_INDEX, -- cgit v1.2.3 From 5027ec19f1049a07df5b0a37b1f462514cf2724b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 20 Nov 2023 16:00:34 -0800 Subject: net: page_pool: split the page_pool_params into fast and slow struct page_pool is rather performance critical and we use 16B of the first cache line to store 2 pointers used only by test code. Future patches will add more informational (non-fast path) attributes. It's convenient for the user of the API to not have to worry which fields are fast and which are slow path. Use struct groups to split the params into the two categories internally. Acked-by: Jesper Dangaard Brouer Reviewed-by: Mina Almasry Reviewed-by: Ilias Apalodimas Link: https://lore.kernel.org/r/20231121000048.789613-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/net/page_pool/types.h | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index 6fc5134095ed..23950fcc4eca 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -54,18 +54,22 @@ struct pp_alloc_cache { * @offset: DMA sync address offset for PP_FLAG_DMA_SYNC_DEV */ struct page_pool_params { - unsigned int flags; - unsigned int order; - unsigned int pool_size; - int nid; - struct device *dev; - struct napi_struct *napi; - enum dma_data_direction dma_dir; - unsigned int max_len; - unsigned int offset; + struct_group_tagged(page_pool_params_fast, fast, + unsigned int flags; + unsigned int order; + unsigned int pool_size; + int nid; + struct device *dev; + struct napi_struct *napi; + enum dma_data_direction dma_dir; + unsigned int max_len; + unsigned int offset; + ); + struct_group_tagged(page_pool_params_slow, slow, /* private: used by test code only */ - void (*init_callback)(struct page *page, void *arg); - void *init_arg; + void (*init_callback)(struct page *page, void *arg); + void *init_arg; + ); }; #ifdef CONFIG_PAGE_POOL_STATS @@ -119,7 +123,7 @@ struct page_pool_stats { #endif struct page_pool { - struct page_pool_params p; + struct page_pool_params_fast p; long frag_users; struct page *frag_page; @@ -178,6 +182,9 @@ struct page_pool { refcount_t user_cnt; u64 destroy_cnt; + + /* Slow/Control-path information follows */ + struct page_pool_params_slow slow; }; struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp); -- cgit v1.2.3 From 2da0cac1e9494f34c5a3438e5c4c7e662e1b7445 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 20 Nov 2023 16:00:35 -0800 Subject: net: page_pool: avoid touching slow on the fastpath To fully benefit from previous commit add one byte of state in the first cache line recording if we need to look at the slow part. The packing isn't all that impressive right now, we create a 7B hole. I'm expecting Olek's rework will reshuffle this, anyway. Acked-by: Jesper Dangaard Brouer Reviewed-by: Ilias Apalodimas Reviewed-by: Mina Almasry Link: https://lore.kernel.org/r/20231121000048.789613-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/net/page_pool/types.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index 23950fcc4eca..e1bb92c192de 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -125,6 +125,8 @@ struct page_pool_stats { struct page_pool { struct page_pool_params_fast p; + bool has_init_callback; + long frag_users; struct page *frag_page; unsigned int frag_offset; -- cgit v1.2.3 From 2afae08c9dcb8ac648414277cec70c2fe6a34d9e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 23 Nov 2023 19:59:36 -0800 Subject: bpf: Validate global subprogs lazily Slightly change BPF verifier logic around eagerness and order of global subprog validation. Instead of going over every global subprog eagerly and validating it before main (entry) BPF program is verified, turn it around. Validate main program first, mark subprogs that were called from main program for later verification, but otherwise assume it is valid. Afterwards, go over marked global subprogs and validate those, potentially marking some more global functions as being called. Continue this process until all (transitively) callable global subprogs are validated. It's a BFS traversal at its heart and will always converge. This is an important change because it allows to feature-gate some subprograms that might not be verifiable on some older kernel, depending on supported set of features. E.g., at some point, global functions were allowed to accept a pointer to memory, which size is identified by user-provided type. Unfortunately, older kernels don't support this feature. With BPF CO-RE approach, the natural way would be to still compile BPF object file once and guard calls to this global subprog with some CO-RE check or using .rodata variables. That's what people do to guard usage of new helpers or kfuncs, and any other new BPF-side feature that might be missing on old kernels. That's currently impossible to do with global subprogs, unfortunately, because they are eagerly and unconditionally validated. This patch set aims to change this, so that in the future when global funcs gain new features, those can be guarded using BPF CO-RE techniques in the same fashion as any other new kernel feature. Two selftests had to be adjusted in sync with these changes. test_global_func12 relied on eager global subprog validation failing before main program failure is detected (unknown return value). Fix by making sure that main program is always valid. verifier_subprog_precision's parent_stack_slot_precise subtest relied on verifier checkpointing heuristic to do a checkpoint at instruction #5, but that's no longer true because we don't have enough jumps validated before reaching insn #5 due to global subprogs being validated later. Other than that, no changes, as one would expect. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Eduard Zingerman Acked-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20231124035937.403208-3-andrii@kernel.org --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 258ba232e302..eb447b0a9423 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1347,6 +1347,8 @@ static inline bool bpf_prog_has_trampoline(const struct bpf_prog *prog) struct bpf_func_info_aux { u16 linkage; bool unreliable; + bool called : 1; + bool verified : 1; }; enum bpf_jit_poke_reason { -- cgit v1.2.3 From f8e80fc4acebab0447567e77d6abc30fb44250cc Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Wed, 22 Nov 2023 21:52:57 +0800 Subject: net/smc: add sysctl for max links per lgr for SMC-R v2.1 Add a new sysctl: net.smc.smcr_max_links_per_lgr, which is used to control the preferred max links per lgr for SMC-R v2.1. The default value of this sysctl is 2, and the acceptable value ranges from 1 to 2. Signed-off-by: Guangguan Wang Reviewed-by: Dust Li Signed-off-by: David S. Miller --- include/net/netns/smc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index 582212ada3ba..da7023587824 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -22,5 +22,6 @@ struct netns_smc { int sysctl_smcr_testlink_time; int sysctl_wmem; int sysctl_rmem; + int sysctl_max_links_per_lgr; }; #endif -- cgit v1.2.3 From 1f2c9dd73f0a279214f9b3c382b3f1df272b3253 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Wed, 22 Nov 2023 21:52:58 +0800 Subject: net/smc: add sysctl for max conns per lgr for SMC-R v2.1 Add a new sysctl: net.smc.smcr_max_conns_per_lgr, which is used to control the preferred max connections per lgr for SMC-R v2.1. The default value of this sysctl is 255, and the acceptable value ranges from 16 to 255. Signed-off-by: Guangguan Wang Reviewed-by: Dust Li Signed-off-by: David S. Miller --- include/net/netns/smc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index da7023587824..fc752a50f91b 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -23,5 +23,6 @@ struct netns_smc { int sysctl_wmem; int sysctl_rmem; int sysctl_max_links_per_lgr; + int sysctl_max_conns_per_lgr; }; #endif -- cgit v1.2.3 From d3ca4ab4f16eb81dc3e7721251adcba49b229d54 Mon Sep 17 00:00:00 2001 From: Liam Kearney Date: Wed, 25 Oct 2023 11:27:55 +1100 Subject: wifi: ieee80211: fix PV1 frame control field name Update PV1 frame control field TODS to FROMDS to match 802.11 standard Signed-off-by: Liam Kearney Reviewed-by: Jeff Johnson Link: https://lore.kernel.org/r/20231025002755.1752983-1-liam.kearney@morsemicro.com Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 958771bac9c0..5e5ea216f341 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -172,11 +172,11 @@ #define IEEE80211_SN_MODULO (IEEE80211_MAX_SN + 1) -/* PV1 Layout 11ah 9.8.3.1 */ +/* PV1 Layout IEEE 802.11-2020 9.8.3.1 */ #define IEEE80211_PV1_FCTL_VERS 0x0003 #define IEEE80211_PV1_FCTL_FTYPE 0x001c #define IEEE80211_PV1_FCTL_STYPE 0x00e0 -#define IEEE80211_PV1_FCTL_TODS 0x0100 +#define IEEE80211_PV1_FCTL_FROMDS 0x0100 #define IEEE80211_PV1_FCTL_MOREFRAGS 0x0200 #define IEEE80211_PV1_FCTL_PM 0x0400 #define IEEE80211_PV1_FCTL_MOREDATA 0x0800 -- cgit v1.2.3 From 6285ee30caa1a0fbd9537496578085c143127eee Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Mon, 13 Nov 2023 11:35:00 +0200 Subject: wifi: cfg80211: Extend support for scanning while MLO connected To extend the support of TSF accounting in scan results for MLO connections, allow to indicate in the scan request the link ID corresponding to the BSS whose TSF should be used for the TSF accounting. Signed-off-by: Ilan Peer Signed-off-by: Gregory Greenman Link: https://lore.kernel.org/r/20231113112844.d4490bcdefb1.I8fcd158b810adddef4963727e9153096416b30ce@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 3 +++ include/uapi/linux/nl80211.h | 8 +++++--- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index b137a33a1b68..d36ad4cedf3b 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2608,6 +2608,8 @@ struct cfg80211_scan_6ghz_params { * @n_6ghz_params: number of 6 GHz params * @scan_6ghz_params: 6 GHz params * @bssid: BSSID to scan for (most commonly, the wildcard BSSID) + * @tsf_report_link_id: for MLO, indicates the link ID of the BSS that should be + * used for TSF reporting. Can be set to -1 to indicate no preference. */ struct cfg80211_scan_request { struct cfg80211_ssid *ssids; @@ -2636,6 +2638,7 @@ struct cfg80211_scan_request { bool scan_6ghz; u32 n_6ghz_params; struct cfg80211_scan_6ghz_params *scan_6ghz_params; + s8 tsf_report_link_id; /* keep last */ struct ieee80211_channel *channels[] __counted_by(n_channels); diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index dced2c49daec..03e44823355e 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -6241,9 +6241,11 @@ enum nl80211_feature_flags { * the BSS that the interface that requested the scan is connected to * (if available). * @NL80211_EXT_FEATURE_BSS_PARENT_TSF: Per BSS, this driver reports the - * time the last beacon/probe was received. The time is the TSF of the - * BSS that the interface that requested the scan is connected to - * (if available). + * time the last beacon/probe was received. For a non MLO connection, the + * time is the TSF of the BSS that the interface that requested the scan is + * connected to (if available). For an MLO connection, the time is the TSF + * of the BSS corresponding with link ID specified in the scan request (if + * specified). * @NL80211_EXT_FEATURE_SET_SCAN_DWELL: This driver supports configuration of * channel dwell time. * @NL80211_EXT_FEATURE_BEACON_RATE_LEGACY: Driver supports beacon rate -- cgit v1.2.3 From 0cc3f50f42d262d6175ee2834aeb56e98934cfcc Mon Sep 17 00:00:00 2001 From: Vinayak Yadawad Date: Thu, 9 Nov 2023 12:03:44 +0530 Subject: wifi: nl80211: Documentation update for NL80211_CMD_PORT_AUTHORIZED event Drivers supporting 4-way handshake offload for AP/P2p-GO and STA/P2P-client should use this event to indicate that port has been authorized and open for regular data traffic, sending this event on completion of successful 4-way handshake. Signed-off-by: Vinayak Yadawad Link: https://lore.kernel.org/r/f746b59f41436e9df29c24688035fbc6eb91ab06.1699510229.git.vinayak.yadawad@broadcom.com [rewrite it all to not use the term 'GC' that we don't use in place of P2P-client] Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 03e44823355e..0cd1da2c2902 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1135,11 +1135,15 @@ * @NL80211_CMD_DEL_PMK: For offloaded 4-Way handshake, delete the previously * configured PMK for the authenticator address identified by * %NL80211_ATTR_MAC. - * @NL80211_CMD_PORT_AUTHORIZED: An event that indicates an 802.1X FT roam was - * completed successfully. Drivers that support 4 way handshake offload - * should send this event after indicating 802.1X FT assocation with - * %NL80211_CMD_ROAM. If the 4 way handshake failed %NL80211_CMD_DISCONNECT - * should be indicated instead. + * @NL80211_CMD_PORT_AUTHORIZED: An event that indicates port is authorized and + * open for regular data traffic. For STA/P2P-client, this event is sent + * with AP MAC address and for AP/P2P-GO, the event carries the STA/P2P- + * client MAC address. + * Drivers that support 4 way handshake offload should send this event for + * STA/P2P-client after successful 4-way HS or after 802.1X FT following + * NL80211_CMD_CONNECT or NL80211_CMD_ROAM. Drivers using AP/P2P-GO 4-way + * handshake offload should send this event on successful completion of + * 4-way handshake with the peer (STA/P2P-client). * @NL80211_CMD_CONTROL_PORT_FRAME: Control Port (e.g. PAE) frame TX request * and RX notification. This command is used both as a request to transmit * a control port frame and as a notification that a control port frame -- cgit v1.2.3 From a066f906ba396ab00d4af19fc5fad42b2605582a Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Wed, 22 Nov 2023 14:52:43 +0100 Subject: firmware_loader: Expand Firmware upload error codes with firmware invalid error No error code are available to signal an invalid firmware content. Drivers that can check the firmware content validity can not return this specific failure to the user-space Expand the firmware error code with an additional code: - "firmware invalid" code which can be used when the provided firmware is invalid Sync lib/test_firmware.c file accordingly. Acked-by: Luis Chamberlain Acked-by: Greg Kroah-Hartman Signed-off-by: Kory Maincent Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20231122-feature_firmware_error_code-v3-1-04ec753afb71@bootlin.com Signed-off-by: Jakub Kicinski --- include/linux/firmware.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/firmware.h b/include/linux/firmware.h index de7fea3bca51..0311858b46ce 100644 --- a/include/linux/firmware.h +++ b/include/linux/firmware.h @@ -27,6 +27,7 @@ struct firmware { * @FW_UPLOAD_ERR_INVALID_SIZE: invalid firmware image size * @FW_UPLOAD_ERR_RW_ERROR: read or write to HW failed, see kernel log * @FW_UPLOAD_ERR_WEAROUT: FLASH device is approaching wear-out, wait & retry + * @FW_UPLOAD_ERR_FW_INVALID: invalid firmware file * @FW_UPLOAD_ERR_MAX: Maximum error code marker */ enum fw_upload_err { @@ -38,6 +39,7 @@ enum fw_upload_err { FW_UPLOAD_ERR_INVALID_SIZE, FW_UPLOAD_ERR_RW_ERROR, FW_UPLOAD_ERR_WEAROUT, + FW_UPLOAD_ERR_FW_INVALID, FW_UPLOAD_ERR_MAX }; -- cgit v1.2.3 From e1df5202e879bce09845ac62bae30206e1edfb80 Mon Sep 17 00:00:00 2001 From: Shradha Gupta Date: Fri, 24 Nov 2023 05:02:30 -0800 Subject: net :mana :Add remaining GDMA stats for MANA to ethtool Extend performance counter stats in 'ethtool -S ' for MANA VF to include all GDMA stat counter. Tested-on: Ubuntu22 Testcases: 1. LISA testcase: PERF-NETWORK-TCP-THROUGHPUT-MULTICONNECTION-NTTTCP-Synthetic 2. LISA testcase: PERF-NETWORK-TCP-THROUGHPUT-MULTICONNECTION-NTTTCP-SRIOV Signed-off-by: Shradha Gupta Link: https://lore.kernel.org/r/1700830950-803-1-git-send-email-shradhagupta@linux.microsoft.com Signed-off-by: Paolo Abeni --- include/net/mana/mana.h | 46 +++++++++++++++++++++++++++++++++------------- 1 file changed, 33 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index 6e3e9c1363db..5567f5bc8eb6 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -353,6 +353,25 @@ struct mana_tx_qp { struct mana_ethtool_stats { u64 stop_queue; u64 wake_queue; + u64 hc_rx_discards_no_wqe; + u64 hc_rx_err_vport_disabled; + u64 hc_rx_bytes; + u64 hc_rx_ucast_pkts; + u64 hc_rx_ucast_bytes; + u64 hc_rx_bcast_pkts; + u64 hc_rx_bcast_bytes; + u64 hc_rx_mcast_pkts; + u64 hc_rx_mcast_bytes; + u64 hc_tx_err_gf_disabled; + u64 hc_tx_err_vport_disabled; + u64 hc_tx_err_inval_vportoffset_pkt; + u64 hc_tx_err_vlan_enforcement; + u64 hc_tx_err_eth_type_enforcement; + u64 hc_tx_err_sa_enforcement; + u64 hc_tx_err_sqpdid_enforecement; + u64 hc_tx_err_cqpdid_enforcement; + u64 hc_tx_err_mtu_violation; + u64 hc_tx_err_inval_oob; u64 hc_tx_bytes; u64 hc_tx_ucast_pkts; u64 hc_tx_ucast_bytes; @@ -360,6 +379,7 @@ struct mana_ethtool_stats { u64 hc_tx_bcast_bytes; u64 hc_tx_mcast_pkts; u64 hc_tx_mcast_bytes; + u64 hc_tx_err_gdma; u64 tx_cqe_err; u64 tx_cqe_unknown_type; u64 rx_coalesced_err; @@ -602,8 +622,8 @@ struct mana_query_gf_stat_resp { struct gdma_resp_hdr hdr; u64 reported_stats; /* rx errors/discards */ - u64 discard_rx_nowqe; - u64 err_rx_vport_disabled; + u64 rx_discards_nowqe; + u64 rx_err_vport_disabled; /* rx bytes/packets */ u64 hc_rx_bytes; u64 hc_rx_ucast_pkts; @@ -613,16 +633,16 @@ struct mana_query_gf_stat_resp { u64 hc_rx_mcast_pkts; u64 hc_rx_mcast_bytes; /* tx errors */ - u64 err_tx_gf_disabled; - u64 err_tx_vport_disabled; - u64 err_tx_inval_vport_offset_pkt; - u64 err_tx_vlan_enforcement; - u64 err_tx_ethtype_enforcement; - u64 err_tx_SA_enforecement; - u64 err_tx_SQPDID_enforcement; - u64 err_tx_CQPDID_enforcement; - u64 err_tx_mtu_violation; - u64 err_tx_inval_oob; + u64 tx_err_gf_disabled; + u64 tx_err_vport_disabled; + u64 tx_err_inval_vport_offset_pkt; + u64 tx_err_vlan_enforcement; + u64 tx_err_ethtype_enforcement; + u64 tx_err_SA_enforcement; + u64 tx_err_SQPDID_enforcement; + u64 tx_err_CQPDID_enforcement; + u64 tx_err_mtu_violation; + u64 tx_err_inval_oob; /* tx bytes/packets */ u64 hc_tx_bytes; u64 hc_tx_ucast_pkts; @@ -632,7 +652,7 @@ struct mana_query_gf_stat_resp { u64 hc_tx_mcast_pkts; u64 hc_tx_mcast_bytes; /* tx error */ - u64 err_tx_gdma; + u64 tx_err_gdma; }; /* HW DATA */ /* Configure vPort Rx Steering */ -- cgit v1.2.3 From 243ad8df7a1bd24c2e01bd99d9f0bb88844dae91 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 24 Nov 2023 12:27:52 +0000 Subject: net: phy: add possible interfaces Add a possible_interfaces member to struct phy_device to indicate which interfaces a clause 45 PHY may switch between depending on the media. This must be populated by the PHY driver by the time the .config_init() method completes according to the PHYs host-side configuration. For example, the Marvell 88x3310 PHY can switch between 10GBASE-R, 5GBASE-R, 2500BASE-X, and SGMII on the host side depending on the media side speed, so all these interface modes are set in the possible_interfaces member. This allows phylib users (such as phylink) to know in advance which interface modes to expect, which allows them to appropriately restrict the advertised link modes according to the capabilities of other parts of the link. Tested-by: Luo Jie Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/E1r6VHk-00DDLN-I7@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index e5f1f41e399c..6e7ebcc50b85 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -605,6 +605,8 @@ struct macsec_ops; * @irq_rerun: Flag indicating interrupts occurred while PHY was suspended, * requiring a rerun of the interrupt handler after resume * @interface: enum phy_interface_t value + * @possible_interfaces: bitmap if interface modes that the attached PHY + * will switch between depending on media speed. * @skb: Netlink message for cable diagnostics * @nest: Netlink nest used for cable diagnostics * @ehdr: nNtlink header for cable diagnostics @@ -674,6 +676,7 @@ struct phy_device { u32 dev_flags; phy_interface_t interface; + DECLARE_PHY_INTERFACE_MASK(possible_interfaces); /* * forced speed & duplex (no autoneg) -- cgit v1.2.3 From f17c69649c698e4df3cfe0010b7bbf142dec3e40 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 26 Nov 2023 15:07:29 -0800 Subject: net: page_pool: id the page pools To give ourselves the flexibility of creating netlink commands and ability to refer to page pool instances in uAPIs create IDs for page pools. Reviewed-by: Ilias Apalodimas Reviewed-by: Eric Dumazet Acked-by: Jesper Dangaard Brouer Signed-off-by: Jakub Kicinski Reviewed-by: Shakeel Butt Signed-off-by: Paolo Abeni --- include/net/page_pool/types.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index e1bb92c192de..c19f0df3bf0b 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -187,6 +187,10 @@ struct page_pool { /* Slow/Control-path information follows */ struct page_pool_params_slow slow; + /* User-facing fields, protected by page_pools_lock */ + struct { + u32 id; + } user; }; struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp); -- cgit v1.2.3 From 083772c9f972dcc248913b52a0dec1025baa1e16 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 26 Nov 2023 15:07:30 -0800 Subject: net: page_pool: record pools per netdev Link the page pools with netdevs. This needs to be netns compatible so we have two options. Either we record the pools per netns and have to worry about moving them as the netdev gets moved. Or we record them directly on the netdev so they move with the netdev without any extra work. Implement the latter option. Since pools may outlast netdev we need a place to store orphans. In time honored tradition use loopback for this purpose. Reviewed-by: Mina Almasry Reviewed-by: Eric Dumazet Acked-by: Jesper Dangaard Brouer Signed-off-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- include/linux/list.h | 20 ++++++++++++++++++++ include/linux/netdevice.h | 4 ++++ include/linux/poison.h | 2 ++ include/net/page_pool/types.h | 4 ++++ 4 files changed, 30 insertions(+) (limited to 'include') diff --git a/include/linux/list.h b/include/linux/list.h index 1837caedf723..059aa1fff41e 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -1119,6 +1119,26 @@ static inline void hlist_move_list(struct hlist_head *old, old->first = NULL; } +/** + * hlist_splice_init() - move all entries from one list to another + * @from: hlist_head from which entries will be moved + * @last: last entry on the @from list + * @to: hlist_head to which entries will be moved + * + * @to can be empty, @from must contain at least @last. + */ +static inline void hlist_splice_init(struct hlist_head *from, + struct hlist_node *last, + struct hlist_head *to) +{ + if (to->first) + to->first->pprev = &last->next; + last->next = to->first; + to->first = from->first; + from->first->pprev = &to->first; + from->first = NULL; +} + #define hlist_entry(ptr, type, member) container_of(ptr,type,member) #define hlist_for_each(pos, head) \ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e87caa81f70c..998c7aaa98b8 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2447,6 +2447,10 @@ struct net_device { #if IS_ENABLED(CONFIG_DPLL) struct dpll_pin *dpll_pin; #endif +#if IS_ENABLED(CONFIG_PAGE_POOL) + /** @page_pools: page pools created for this netdevice */ + struct hlist_head page_pools; +#endif }; #define to_net_dev(d) container_of(d, struct net_device, dev) diff --git a/include/linux/poison.h b/include/linux/poison.h index 851a855d3868..27a7dad17eef 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -83,6 +83,8 @@ /********** net/core/skbuff.c **********/ #define SKB_LIST_POISON_NEXT ((void *)(0x800 + POISON_POINTER_DELTA)) +/********** net/ **********/ +#define NET_PTR_POISON ((void *)(0x801 + POISON_POINTER_DELTA)) /********** kernel/bpf/ **********/ #define BPF_PTR_POISON ((void *)(0xeB9FUL + POISON_POINTER_DELTA)) diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index c19f0df3bf0b..b258a571201e 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -5,6 +5,7 @@ #include #include +#include #define PP_FLAG_DMA_MAP BIT(0) /* Should page_pool do the DMA * map/unmap @@ -48,6 +49,7 @@ struct pp_alloc_cache { * @pool_size: size of the ptr_ring * @nid: NUMA node id to allocate from pages from * @dev: device, for DMA pre-mapping purposes + * @netdev: netdev this pool will serve (leave as NULL if none or multiple) * @napi: NAPI which is the sole consumer of pages, otherwise NULL * @dma_dir: DMA mapping direction * @max_len: max DMA sync memory size for PP_FLAG_DMA_SYNC_DEV @@ -66,6 +68,7 @@ struct page_pool_params { unsigned int offset; ); struct_group_tagged(page_pool_params_slow, slow, + struct net_device *netdev; /* private: used by test code only */ void (*init_callback)(struct page *page, void *arg); void *init_arg; @@ -189,6 +192,7 @@ struct page_pool { struct page_pool_params_slow slow; /* User-facing fields, protected by page_pools_lock */ struct { + struct hlist_node list; u32 id; } user; }; -- cgit v1.2.3 From 02b3de80c5f879f92e5f4bb3f535d172e0fc0ea0 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 26 Nov 2023 15:07:31 -0800 Subject: net: page_pool: stash the NAPI ID for easier access To avoid any issues with race conditions on accessing napi and having to think about the lifetime of NAPI objects in netlink GET - stash the napi_id to which page pool was linked at creation time. Reviewed-by: Eric Dumazet Acked-by: Jesper Dangaard Brouer Signed-off-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- include/net/page_pool/types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index b258a571201e..7e47d7bb2c1e 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -193,6 +193,7 @@ struct page_pool { /* User-facing fields, protected by page_pools_lock */ struct { struct hlist_node list; + u32 napi_id; u32 id; } user; }; -- cgit v1.2.3 From 950ab53b77ab829defeb22bc98d40a5e926ae018 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 26 Nov 2023 15:07:34 -0800 Subject: net: page_pool: implement GET in the netlink API Expose the very basic page pool information via netlink. Example using ynl-py for a system with 9 queues: $ ./cli.py --no-schema --spec netlink/specs/netdev.yaml \ --dump page-pool-get [{'id': 19, 'ifindex': 2, 'napi-id': 147}, {'id': 18, 'ifindex': 2, 'napi-id': 146}, {'id': 17, 'ifindex': 2, 'napi-id': 145}, {'id': 16, 'ifindex': 2, 'napi-id': 144}, {'id': 15, 'ifindex': 2, 'napi-id': 143}, {'id': 14, 'ifindex': 2, 'napi-id': 142}, {'id': 13, 'ifindex': 2, 'napi-id': 141}, {'id': 12, 'ifindex': 2, 'napi-id': 140}, {'id': 11, 'ifindex': 2, 'napi-id': 139}, {'id': 10, 'ifindex': 2, 'napi-id': 138}] Reviewed-by: Eric Dumazet Acked-by: Jesper Dangaard Brouer Signed-off-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- include/uapi/linux/netdev.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 2943a151d4f1..176665bcf0da 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -64,11 +64,21 @@ enum { NETDEV_A_DEV_MAX = (__NETDEV_A_DEV_MAX - 1) }; +enum { + NETDEV_A_PAGE_POOL_ID = 1, + NETDEV_A_PAGE_POOL_IFINDEX, + NETDEV_A_PAGE_POOL_NAPI_ID, + + __NETDEV_A_PAGE_POOL_MAX, + NETDEV_A_PAGE_POOL_MAX = (__NETDEV_A_PAGE_POOL_MAX - 1) +}; + enum { NETDEV_CMD_DEV_GET = 1, NETDEV_CMD_DEV_ADD_NTF, NETDEV_CMD_DEV_DEL_NTF, NETDEV_CMD_DEV_CHANGE_NTF, + NETDEV_CMD_PAGE_POOL_GET, __NETDEV_CMD_MAX, NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1) -- cgit v1.2.3 From d2ef6aa077bdd0b3495dba5dcae6d3f19579b20b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 26 Nov 2023 15:07:35 -0800 Subject: net: page_pool: add netlink notifications for state changes Generate netlink notifications about page pool state changes. Reviewed-by: Eric Dumazet Acked-by: Jesper Dangaard Brouer Signed-off-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- include/uapi/linux/netdev.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 176665bcf0da..beb158872226 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -79,11 +79,15 @@ enum { NETDEV_CMD_DEV_DEL_NTF, NETDEV_CMD_DEV_CHANGE_NTF, NETDEV_CMD_PAGE_POOL_GET, + NETDEV_CMD_PAGE_POOL_ADD_NTF, + NETDEV_CMD_PAGE_POOL_DEL_NTF, + NETDEV_CMD_PAGE_POOL_CHANGE_NTF, __NETDEV_CMD_MAX, NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1) }; #define NETDEV_MCGRP_MGMT "mgmt" +#define NETDEV_MCGRP_PAGE_POOL "page-pool" #endif /* _UAPI_LINUX_NETDEV_H */ -- cgit v1.2.3 From 7aee8429eedd0970d8add2fb5b856bfc5f5f1fc1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 26 Nov 2023 15:07:36 -0800 Subject: net: page_pool: report amount of memory held by page pools Advanced deployments need the ability to check memory use of various system components. It makes it possible to make informed decisions about memory allocation and to find regressions and leaks. Report memory use of page pools. Report both number of references and bytes held. Acked-by: Jesper Dangaard Brouer Signed-off-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- include/uapi/linux/netdev.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index beb158872226..26ae5bdd3187 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -68,6 +68,8 @@ enum { NETDEV_A_PAGE_POOL_ID = 1, NETDEV_A_PAGE_POOL_IFINDEX, NETDEV_A_PAGE_POOL_NAPI_ID, + NETDEV_A_PAGE_POOL_INFLIGHT, + NETDEV_A_PAGE_POOL_INFLIGHT_MEM, __NETDEV_A_PAGE_POOL_MAX, NETDEV_A_PAGE_POOL_MAX = (__NETDEV_A_PAGE_POOL_MAX - 1) -- cgit v1.2.3 From 69cb4952b6f6a226c1c0a7ca400398aaa8f75cf2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 26 Nov 2023 15:07:37 -0800 Subject: net: page_pool: report when page pool was destroyed Report when page pool was destroyed. Together with the inflight / memory use reporting this can serve as a replacement for the warning about leaked page pools we currently print to dmesg. Example output for a fake leaked page pool using some hacks in netdevsim (one "live" pool, and one "leaked" on the same dev): $ ./cli.py --no-schema --spec netlink/specs/netdev.yaml \ --dump page-pool-get [{'id': 2, 'ifindex': 3}, {'id': 1, 'ifindex': 3, 'destroyed': 133, 'inflight': 1}] Tested-by: Dragos Tatulea Reviewed-by: Eric Dumazet Acked-by: Jesper Dangaard Brouer Signed-off-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- include/net/page_pool/types.h | 1 + include/uapi/linux/netdev.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index 7e47d7bb2c1e..ac286ea8ce2d 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -193,6 +193,7 @@ struct page_pool { /* User-facing fields, protected by page_pools_lock */ struct { struct hlist_node list; + u64 detach_time; u32 napi_id; u32 id; } user; diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 26ae5bdd3187..756410274120 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -70,6 +70,7 @@ enum { NETDEV_A_PAGE_POOL_NAPI_ID, NETDEV_A_PAGE_POOL_INFLIGHT, NETDEV_A_PAGE_POOL_INFLIGHT_MEM, + NETDEV_A_PAGE_POOL_DETACH_TIME, __NETDEV_A_PAGE_POOL_MAX, NETDEV_A_PAGE_POOL_MAX = (__NETDEV_A_PAGE_POOL_MAX - 1) -- cgit v1.2.3 From d49010adae737638447369a4eff8f1aab736b076 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 26 Nov 2023 15:07:38 -0800 Subject: net: page_pool: expose page pool stats via netlink Dump the stats into netlink. More clever approaches like dumping the stats per-CPU for each CPU individually to see where the packets get consumed can be implemented in the future. A trimmed example from a real (but recently booted system): $ ./cli.py --no-schema --spec netlink/specs/netdev.yaml \ --dump page-pool-stats-get [{'info': {'id': 19, 'ifindex': 2}, 'alloc-empty': 48, 'alloc-fast': 3024, 'alloc-refill': 0, 'alloc-slow': 48, 'alloc-slow-high-order': 0, 'alloc-waive': 0, 'recycle-cache-full': 0, 'recycle-cached': 0, 'recycle-released-refcnt': 0, 'recycle-ring': 0, 'recycle-ring-full': 0}, {'info': {'id': 18, 'ifindex': 2}, 'alloc-empty': 66, 'alloc-fast': 11811, 'alloc-refill': 35, 'alloc-slow': 66, 'alloc-slow-high-order': 0, 'alloc-waive': 0, 'recycle-cache-full': 1145, 'recycle-cached': 6541, 'recycle-released-refcnt': 0, 'recycle-ring': 1275, 'recycle-ring-full': 0}, {'info': {'id': 17, 'ifindex': 2}, 'alloc-empty': 73, 'alloc-fast': 62099, 'alloc-refill': 413, ... Acked-by: Jesper Dangaard Brouer Signed-off-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- include/net/page_pool/helpers.h | 8 ++------ include/uapi/linux/netdev.h | 19 +++++++++++++++++++ 2 files changed, 21 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h index 4ebd544ae977..7dc65774cde5 100644 --- a/include/net/page_pool/helpers.h +++ b/include/net/page_pool/helpers.h @@ -55,16 +55,12 @@ #include #ifdef CONFIG_PAGE_POOL_STATS +/* Deprecated driver-facing API, use netlink instead */ int page_pool_ethtool_stats_get_count(void); u8 *page_pool_ethtool_stats_get_strings(u8 *data); u64 *page_pool_ethtool_stats_get(u64 *data, void *stats); -/* - * Drivers that wish to harvest page pool stats and report them to users - * (perhaps via ethtool, debugfs, or another mechanism) can allocate a - * struct page_pool_stats call page_pool_get_stats to get stats for the specified pool. - */ -bool page_pool_get_stats(struct page_pool *pool, +bool page_pool_get_stats(const struct page_pool *pool, struct page_pool_stats *stats); #else static inline int page_pool_ethtool_stats_get_count(void) diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 756410274120..2b37233e00c0 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -76,6 +76,24 @@ enum { NETDEV_A_PAGE_POOL_MAX = (__NETDEV_A_PAGE_POOL_MAX - 1) }; +enum { + NETDEV_A_PAGE_POOL_STATS_INFO = 1, + NETDEV_A_PAGE_POOL_STATS_ALLOC_FAST = 8, + NETDEV_A_PAGE_POOL_STATS_ALLOC_SLOW, + NETDEV_A_PAGE_POOL_STATS_ALLOC_SLOW_HIGH_ORDER, + NETDEV_A_PAGE_POOL_STATS_ALLOC_EMPTY, + NETDEV_A_PAGE_POOL_STATS_ALLOC_REFILL, + NETDEV_A_PAGE_POOL_STATS_ALLOC_WAIVE, + NETDEV_A_PAGE_POOL_STATS_RECYCLE_CACHED, + NETDEV_A_PAGE_POOL_STATS_RECYCLE_CACHE_FULL, + NETDEV_A_PAGE_POOL_STATS_RECYCLE_RING, + NETDEV_A_PAGE_POOL_STATS_RECYCLE_RING_FULL, + NETDEV_A_PAGE_POOL_STATS_RECYCLE_RELEASED_REFCNT, + + __NETDEV_A_PAGE_POOL_STATS_MAX, + NETDEV_A_PAGE_POOL_STATS_MAX = (__NETDEV_A_PAGE_POOL_STATS_MAX - 1) +}; + enum { NETDEV_CMD_DEV_GET = 1, NETDEV_CMD_DEV_ADD_NTF, @@ -85,6 +103,7 @@ enum { NETDEV_CMD_PAGE_POOL_ADD_NTF, NETDEV_CMD_PAGE_POOL_DEL_NTF, NETDEV_CMD_PAGE_POOL_CHANGE_NTF, + NETDEV_CMD_PAGE_POOL_STATS_GET, __NETDEV_CMD_MAX, NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1) -- cgit v1.2.3 From e56fdbfb06e26a7066b070967badef4148528df2 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 25 Nov 2023 20:31:27 +0100 Subject: bpf: Add link_info support for uprobe multi link Adding support to get uprobe_link details through bpf_link_info interface. Adding new struct uprobe_multi to struct bpf_link_info to carry the uprobe_multi link details. The uprobe_multi.count is passed from user space to denote size of array fields (offsets/ref_ctr_offsets/cookies). The actual array size is stored back to uprobe_multi.count (allowing user to find out the actual array size) and array fields are populated up to the user passed size. All the non-array fields (path/count/flags/pid) are always set. Signed-off-by: Jiri Olsa Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20231125193130.834322-4-jolsa@kernel.org --- include/uapi/linux/bpf.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7a5498242eaa..e88746ba7d21 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6562,6 +6562,16 @@ struct bpf_link_info { __u32 flags; __u64 missed; } kprobe_multi; + struct { + __aligned_u64 path; + __aligned_u64 offsets; + __aligned_u64 ref_ctr_offsets; + __aligned_u64 cookies; + __u32 path_size; /* in/out: real path size on success, including zero byte */ + __u32 count; /* in/out: uprobe_multi offsets/ref_ctr_offsets/cookies count */ + __u32 flags; + __u32 pid; + } uprobe_multi; struct { __u32 type; /* enum bpf_perf_event_type */ __u32 :32; -- cgit v1.2.3 From 341ac980eab90ac1f6c22ee9f9da83ed9604d899 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 27 Nov 2023 11:03:07 -0800 Subject: xsk: Support tx_metadata_len For zerocopy mode, tx_desc->addr can point to an arbitrary offset and carry some TX metadata in the headroom. For copy mode, there is no way currently to populate skb metadata. Introduce new tx_metadata_len umem config option that indicates how many bytes to treat as metadata. Metadata bytes come prior to tx_desc address (same as in RX case). The size of the metadata has mostly the same constraints as XDP: - less than 256 bytes - 8-byte aligned (compared to 4-byte alignment on xdp, due to 8-byte timestamp in the completion) - non-zero This data is not interpreted in any way right now. Reviewed-by: Song Yoong Siang Signed-off-by: Stanislav Fomichev Reviewed-by: Jakub Kicinski Link: https://lore.kernel.org/r/20231127190319.1190813-2-sdf@google.com Signed-off-by: Alexei Starovoitov --- include/net/xdp_sock.h | 1 + include/net/xsk_buff_pool.h | 1 + include/uapi/linux/if_xdp.h | 1 + 3 files changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index f83128007fb0..bcf765124f72 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -30,6 +30,7 @@ struct xdp_umem { struct user_struct *user; refcount_t users; u8 flags; + u8 tx_metadata_len; bool zc; struct page **pgs; int id; diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index b0bdff26fc88..1985ffaf9b0c 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -77,6 +77,7 @@ struct xsk_buff_pool { u32 chunk_size; u32 chunk_shift; u32 frame_len; + u8 tx_metadata_len; /* inherited from umem */ u8 cached_need_wakeup; bool uses_need_wakeup; bool dma_need_sync; diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 8d48863472b9..2ecf79282c26 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -76,6 +76,7 @@ struct xdp_umem_reg { __u32 chunk_size; __u32 headroom; __u32 flags; + __u32 tx_metadata_len; }; struct xdp_statistics { -- cgit v1.2.3 From 48eb03dd26304c24f03bdbb9382e89c8564e71df Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 27 Nov 2023 11:03:08 -0800 Subject: xsk: Add TX timestamp and TX checksum offload support This change actually defines the (initial) metadata layout that should be used by AF_XDP userspace (xsk_tx_metadata). The first field is flags which requests appropriate offloads, followed by the offload-specific fields. The supported per-device offloads are exported via netlink (new xsk-flags). The offloads themselves are still implemented in a bit of a framework-y fashion that's left from my initial kfunc attempt. I'm introducing new xsk_tx_metadata_ops which drivers are supposed to implement. The drivers are also supposed to call xsk_tx_metadata_request/xsk_tx_metadata_complete in the right places. Since xsk_tx_metadata_{request,_complete} are static inline, we don't incur any extra overhead doing indirect calls. The benefit of this scheme is as follows: - keeps all metadata layout parsing away from driver code - makes it easy to grep and see which drivers implement what - don't need any extra flags to maintain to keep track of what offloads are implemented; if the callback is implemented - the offload is supported (used by netlink reporting code) Two offloads are defined right now: 1. XDP_TXMD_FLAGS_CHECKSUM: skb-style csum_start+csum_offset 2. XDP_TXMD_FLAGS_TIMESTAMP: writes TX timestamp back into metadata area upon completion (tx_timestamp field) XDP_TXMD_FLAGS_TIMESTAMP is also implemented for XDP_COPY mode: it writes SW timestamp from the skb destructor (note I'm reusing hwtstamps to pass metadata pointer). The struct is forward-compatible and can be extended in the future by appending more fields. Reviewed-by: Song Yoong Siang Signed-off-by: Stanislav Fomichev Acked-by: Jakub Kicinski Link: https://lore.kernel.org/r/20231127190319.1190813-3-sdf@google.com Signed-off-by: Alexei Starovoitov --- include/linux/netdevice.h | 2 + include/linux/skbuff.h | 14 +++++- include/net/xdp_sock.h | 110 ++++++++++++++++++++++++++++++++++++++++++++ include/net/xdp_sock_drv.h | 13 ++++++ include/net/xsk_buff_pool.h | 6 +++ include/uapi/linux/if_xdp.h | 38 +++++++++++++++ include/uapi/linux/netdev.h | 16 +++++++ 7 files changed, 198 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e87caa81f70c..08da8b28c816 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1865,6 +1865,7 @@ enum netdev_stat_type { * @netdev_ops: Includes several pointers to callbacks, * if one wants to override the ndo_*() functions * @xdp_metadata_ops: Includes pointers to XDP metadata callbacks. + * @xsk_tx_metadata_ops: Includes pointers to AF_XDP TX metadata callbacks. * @ethtool_ops: Management operations * @l3mdev_ops: Layer 3 master device operations * @ndisc_ops: Includes callbacks for different IPv6 neighbour @@ -2128,6 +2129,7 @@ struct net_device { unsigned long long priv_flags; const struct net_device_ops *netdev_ops; const struct xdp_metadata_ops *xdp_metadata_ops; + const struct xsk_tx_metadata_ops *xsk_tx_metadata_ops; int ifindex; unsigned short gflags; unsigned short hard_header_len; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 27998f73183e..b370eb8d70f7 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -566,6 +566,15 @@ struct ubuf_info_msgzc { int mm_account_pinned_pages(struct mmpin *mmp, size_t size); void mm_unaccount_pinned_pages(struct mmpin *mmp); +/* Preserve some data across TX submission and completion. + * + * Note, this state is stored in the driver. Extending the layout + * might need some special care. + */ +struct xsk_tx_metadata_compl { + __u64 *tx_timestamp; +}; + /* This data is invariant across clones and lives at * the end of the header data, ie. at skb->end. */ @@ -578,7 +587,10 @@ struct skb_shared_info { /* Warning: this field is not always filled in (UFO)! */ unsigned short gso_segs; struct sk_buff *frag_list; - struct skb_shared_hwtstamps hwtstamps; + union { + struct skb_shared_hwtstamps hwtstamps; + struct xsk_tx_metadata_compl xsk_meta; + }; unsigned int gso_type; u32 tskey; diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index bcf765124f72..3cb4dc9bd70e 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -93,12 +93,105 @@ struct xdp_sock { struct xsk_queue *cq_tmp; /* Only as tmp storage before bind */ }; +/* + * AF_XDP TX metadata hooks for network devices. + * The following hooks can be defined; unless noted otherwise, they are + * optional and can be filled with a null pointer. + * + * void (*tmo_request_timestamp)(void *priv) + * Called when AF_XDP frame requested egress timestamp. + * + * u64 (*tmo_fill_timestamp)(void *priv) + * Called when AF_XDP frame, that had requested egress timestamp, + * received a completion. The hook needs to return the actual HW timestamp. + * + * void (*tmo_request_checksum)(u16 csum_start, u16 csum_offset, void *priv) + * Called when AF_XDP frame requested HW checksum offload. csum_start + * indicates position where checksumming should start. + * csum_offset indicates position where checksum should be stored. + * + */ +struct xsk_tx_metadata_ops { + void (*tmo_request_timestamp)(void *priv); + u64 (*tmo_fill_timestamp)(void *priv); + void (*tmo_request_checksum)(u16 csum_start, u16 csum_offset, void *priv); +}; + #ifdef CONFIG_XDP_SOCKETS int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp); void __xsk_map_flush(void); +/** + * xsk_tx_metadata_to_compl - Save enough relevant metadata information + * to perform tx completion in the future. + * @meta: pointer to AF_XDP metadata area + * @compl: pointer to output struct xsk_tx_metadata_to_compl + * + * This function should be called by the networking device when + * it prepares AF_XDP egress packet. The value of @compl should be stored + * and passed to xsk_tx_metadata_complete upon TX completion. + */ +static inline void xsk_tx_metadata_to_compl(struct xsk_tx_metadata *meta, + struct xsk_tx_metadata_compl *compl) +{ + if (!meta) + return; + + if (meta->flags & XDP_TXMD_FLAGS_TIMESTAMP) + compl->tx_timestamp = &meta->completion.tx_timestamp; + else + compl->tx_timestamp = NULL; +} + +/** + * xsk_tx_metadata_request - Evaluate AF_XDP TX metadata at submission + * and call appropriate xsk_tx_metadata_ops operation. + * @meta: pointer to AF_XDP metadata area + * @ops: pointer to struct xsk_tx_metadata_ops + * @priv: pointer to driver-private aread + * + * This function should be called by the networking device when + * it prepares AF_XDP egress packet. + */ +static inline void xsk_tx_metadata_request(const struct xsk_tx_metadata *meta, + const struct xsk_tx_metadata_ops *ops, + void *priv) +{ + if (!meta) + return; + + if (ops->tmo_request_timestamp) + if (meta->flags & XDP_TXMD_FLAGS_TIMESTAMP) + ops->tmo_request_timestamp(priv); + + if (ops->tmo_request_checksum) + if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM) + ops->tmo_request_checksum(meta->request.csum_start, + meta->request.csum_offset, priv); +} + +/** + * xsk_tx_metadata_complete - Evaluate AF_XDP TX metadata at completion + * and call appropriate xsk_tx_metadata_ops operation. + * @compl: pointer to completion metadata produced from xsk_tx_metadata_to_compl + * @ops: pointer to struct xsk_tx_metadata_ops + * @priv: pointer to driver-private aread + * + * This function should be called by the networking device upon + * AF_XDP egress completion. + */ +static inline void xsk_tx_metadata_complete(struct xsk_tx_metadata_compl *compl, + const struct xsk_tx_metadata_ops *ops, + void *priv) +{ + if (!compl) + return; + + *compl->tx_timestamp = ops->tmo_fill_timestamp(priv); +} + #else static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) @@ -115,6 +208,23 @@ static inline void __xsk_map_flush(void) { } +static inline void xsk_tx_metadata_to_compl(struct xsk_tx_metadata *meta, + struct xsk_tx_metadata_compl *compl) +{ +} + +static inline void xsk_tx_metadata_request(struct xsk_tx_metadata *meta, + const struct xsk_tx_metadata_ops *ops, + void *priv) +{ +} + +static inline void xsk_tx_metadata_complete(struct xsk_tx_metadata_compl *compl, + const struct xsk_tx_metadata_ops *ops, + void *priv) +{ +} + #endif /* CONFIG_XDP_SOCKETS */ #if defined(CONFIG_XDP_SOCKETS) && defined(CONFIG_DEBUG_NET) diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 1f6fc8c7a84c..e2558ac3e195 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -165,6 +165,14 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr) return xp_raw_get_data(pool, addr); } +static inline struct xsk_tx_metadata *xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr) +{ + if (!pool->tx_metadata_len) + return NULL; + + return xp_raw_get_data(pool, addr) - pool->tx_metadata_len; +} + static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp, struct xsk_buff_pool *pool) { struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); @@ -324,6 +332,11 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr) return NULL; } +static inline struct xsk_tx_metadata *xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr) +{ + return NULL; +} + static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp, struct xsk_buff_pool *pool) { } diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index 1985ffaf9b0c..97f5cc10d79e 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -33,6 +33,7 @@ struct xdp_buff_xsk { }; #define XSK_CHECK_PRIV_TYPE(t) BUILD_BUG_ON(sizeof(t) > offsetofend(struct xdp_buff_xsk, cb)) +#define XSK_TX_COMPL_FITS(t) BUILD_BUG_ON(sizeof(struct xsk_tx_metadata_compl) > sizeof(t)) struct xsk_dma_map { dma_addr_t *dma_pages; @@ -234,4 +235,9 @@ static inline u64 xp_get_handle(struct xdp_buff_xsk *xskb) return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); } +static inline bool xp_tx_metadata_enabled(const struct xsk_buff_pool *pool) +{ + return pool->tx_metadata_len > 0; +} + #endif /* XSK_BUFF_POOL_H_ */ diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 2ecf79282c26..95de66d5a26c 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -106,6 +106,41 @@ struct xdp_options { #define XSK_UNALIGNED_BUF_ADDR_MASK \ ((1ULL << XSK_UNALIGNED_BUF_OFFSET_SHIFT) - 1) +/* Request transmit timestamp. Upon completion, put it into tx_timestamp + * field of struct xsk_tx_metadata. + */ +#define XDP_TXMD_FLAGS_TIMESTAMP (1 << 0) + +/* Request transmit checksum offload. Checksum start position and offset + * are communicated via csum_start and csum_offset fields of struct + * xsk_tx_metadata. + */ +#define XDP_TXMD_FLAGS_CHECKSUM (1 << 1) + +/* AF_XDP offloads request. 'request' union member is consumed by the driver + * when the packet is being transmitted. 'completion' union member is + * filled by the driver when the transmit completion arrives. + */ +struct xsk_tx_metadata { + __u64 flags; + + union { + struct { + /* XDP_TXMD_FLAGS_CHECKSUM */ + + /* Offset from desc->addr where checksumming should start. */ + __u16 csum_start; + /* Offset from csum_start where checksum should be stored. */ + __u16 csum_offset; + } request; + + struct { + /* XDP_TXMD_FLAGS_TIMESTAMP */ + __u64 tx_timestamp; + } completion; + }; +}; + /* Rx/Tx descriptor */ struct xdp_desc { __u64 addr; @@ -122,4 +157,7 @@ struct xdp_desc { */ #define XDP_PKT_CONTD (1 << 0) +/* TX packet carries valid metadata. */ +#define XDP_TX_METADATA (1 << 1) + #endif /* _LINUX_IF_XDP_H */ diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 2943a151d4f1..48d5477a668c 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -53,12 +53,28 @@ enum netdev_xdp_rx_metadata { NETDEV_XDP_RX_METADATA_MASK = 3, }; +/** + * enum netdev_xsk_flags + * @NETDEV_XSK_FLAGS_TX_TIMESTAMP: HW timestamping egress packets is supported + * by the driver. + * @NETDEV_XSK_FLAGS_TX_CHECKSUM: L3 checksum HW offload is supported by the + * driver. + */ +enum netdev_xsk_flags { + NETDEV_XSK_FLAGS_TX_TIMESTAMP = 1, + NETDEV_XSK_FLAGS_TX_CHECKSUM = 2, + + /* private: */ + NETDEV_XSK_FLAGS_MASK = 3, +}; + enum { NETDEV_A_DEV_IFINDEX = 1, NETDEV_A_DEV_PAD, NETDEV_A_DEV_XDP_FEATURES, NETDEV_A_DEV_XDP_ZC_MAX_SEGS, NETDEV_A_DEV_XDP_RX_METADATA_FEATURES, + NETDEV_A_DEV_XSK_FEATURES, __NETDEV_A_DEV_MAX, NETDEV_A_DEV_MAX = (__NETDEV_A_DEV_MAX - 1) -- cgit v1.2.3 From ce59f9686e0eca19431571c7403394a6c36371e2 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 27 Nov 2023 11:03:13 -0800 Subject: xsk: Validate xsk_tx_metadata flags Accept only the flags that the kernel knows about to make sure we can extend this field in the future. Note that only in XDP_COPY mode we propagate the error signal back to the user (via sendmsg). For zerocopy mode we silently skip the metadata for the descriptors that have wrong flags (since we process the descriptors deep in the driver). Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20231127190319.1190813-8-sdf@google.com Signed-off-by: Alexei Starovoitov --- include/net/xdp_sock_drv.h | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index e2558ac3e195..81e02de3f453 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -165,12 +165,28 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr) return xp_raw_get_data(pool, addr); } +#define XDP_TXMD_FLAGS_VALID ( \ + XDP_TXMD_FLAGS_TIMESTAMP | \ + XDP_TXMD_FLAGS_CHECKSUM | \ + 0) + +static inline bool xsk_buff_valid_tx_metadata(struct xsk_tx_metadata *meta) +{ + return !(meta->flags & ~XDP_TXMD_FLAGS_VALID); +} + static inline struct xsk_tx_metadata *xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr) { + struct xsk_tx_metadata *meta; + if (!pool->tx_metadata_len) return NULL; - return xp_raw_get_data(pool, addr) - pool->tx_metadata_len; + meta = xp_raw_get_data(pool, addr) - pool->tx_metadata_len; + if (unlikely(!xsk_buff_valid_tx_metadata(meta))) + return NULL; /* no way to signal the error to the user */ + + return meta; } static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp, struct xsk_buff_pool *pool) @@ -332,6 +348,11 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr) return NULL; } +static inline bool xsk_buff_valid_tx_metadata(struct xsk_tx_metadata *meta) +{ + return false; +} + static inline struct xsk_tx_metadata *xsk_buff_get_metadata(struct xsk_buff_pool *pool, u64 addr) { return NULL; -- cgit v1.2.3 From 11614723af26e7c32fcb704d8f30fdf60c1122dc Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 27 Nov 2023 11:03:14 -0800 Subject: xsk: Add option to calculate TX checksum in SW For XDP_COPY mode, add a UMEM option XDP_UMEM_TX_SW_CSUM to call skb_checksum_help in transmit path. Might be useful to debugging issues with real hardware. I also use this mode in the selftests. Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20231127190319.1190813-9-sdf@google.com Signed-off-by: Alexei Starovoitov --- include/net/xsk_buff_pool.h | 1 + include/uapi/linux/if_xdp.h | 8 +++++++- 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index 97f5cc10d79e..8d48d37ab7c0 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -83,6 +83,7 @@ struct xsk_buff_pool { bool uses_need_wakeup; bool dma_need_sync; bool unaligned; + bool tx_sw_csum; void *addrs; /* Mutual exclusion of the completion ring in the SKB mode. Two cases to protect: * NAPI TX thread and sendmsg error paths in the SKB destructor callback and when diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 95de66d5a26c..d31698410410 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -33,7 +33,13 @@ #define XDP_USE_SG (1 << 4) /* Flags for xsk_umem_config flags */ -#define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0) +#define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0) + +/* Force checksum calculation in software. Can be used for testing or + * working around potential HW issues. This option causes performance + * degradation and only works in XDP_COPY mode. + */ +#define XDP_UMEM_TX_SW_CSUM (1 << 1) struct sockaddr_xdp { __u16 sxdp_family; -- cgit v1.2.3 From 6ebf6f90ab4ac09a76172a6d387e8819d3259595 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 28 Nov 2023 15:18:45 -0800 Subject: mptcp: add mptcpi_subflows_total counter If the initial subflow has been removed, we cannot know without checking other counters, e.g. ss -ti | grep -c tcp-ulp-mptcp or getsockopt(SOL_MPTCP, MPTCP_FULL_INFO, ...) (or others except MPTCP_INFO of course) and then check mptcp_subflow_data->num_subflows to get the total amount of subflows. This patch adds a new counter mptcpi_subflows_total in mptcpi_flags to store the total amount of subflows, including the initial one. A new helper __mptcp_has_initial_subflow() is added to check whether the initial subflow has been removed or not. With this helper, we can then compute the total amount of subflows from mptcp_info by doing something like: mptcpi_subflows_total = mptcpi_subflows + __mptcp_has_initial_subflow(msk). Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/428 Reviewed-by: Matthieu Baerts Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Link: https://lore.kernel.org/r/20231128-send-net-next-2023107-v4-1-8d6b94150f6b@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index a6451561f3f8..74cfe496891e 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -57,6 +57,7 @@ struct mptcp_info { __u64 mptcpi_bytes_sent; __u64 mptcpi_bytes_received; __u64 mptcpi_bytes_acked; + __u8 mptcpi_subflows_total; }; /* MPTCP Reset reason codes, rfc8684 */ -- cgit v1.2.3 From f422544118cbdfc3bba7b7c5189e18147acb9047 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 28 Nov 2023 09:53:04 +0000 Subject: net: mana: Fix spelling mistake "enforecement" -> "enforcement" There is a spelling mistake in struct field hc_tx_err_sqpdid_enforecement. Fix it. Signed-off-by: Colin Ian King Signed-off-by: Shradha Gupta Link: https://lore.kernel.org/r/20231128095304.515492-1-colin.i.king@gmail.com Signed-off-by: Jakub Kicinski --- include/net/mana/mana.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index 5567f5bc8eb6..76147feb0d10 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -368,7 +368,7 @@ struct mana_ethtool_stats { u64 hc_tx_err_vlan_enforcement; u64 hc_tx_err_eth_type_enforcement; u64 hc_tx_err_sa_enforcement; - u64 hc_tx_err_sqpdid_enforecement; + u64 hc_tx_err_sqpdid_enforcement; u64 hc_tx_err_cqpdid_enforcement; u64 hc_tx_err_mtu_violation; u64 hc_tx_err_inval_oob; -- cgit v1.2.3 From 7577bc8249c3fc86096ef1b1c9a8f4b6232231e7 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 28 Nov 2023 18:29:20 -0800 Subject: tcp: Don't pass cookie to __cookie_v[46]_check(). tcp_hdr(skb) and SYN Cookie are passed to __cookie_v[46]_check(), but none of the callers passes cookie other than ntohl(th->ack_seq) - 1. Let's fetch it in __cookie_v[46]_check() instead of passing the cookie over and over. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Simon Horman Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20231129022924.96156-5-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/linux/netfilter_ipv6.h | 8 ++++---- include/net/tcp.h | 6 ++---- 2 files changed, 6 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h index 7834c0be2831..61aa48f46dd7 100644 --- a/include/linux/netfilter_ipv6.h +++ b/include/linux/netfilter_ipv6.h @@ -51,7 +51,7 @@ struct nf_ipv6_ops { u32 (*cookie_init_sequence)(const struct ipv6hdr *iph, const struct tcphdr *th, u16 *mssp); int (*cookie_v6_check)(const struct ipv6hdr *iph, - const struct tcphdr *th, __u32 cookie); + const struct tcphdr *th); #endif void (*route_input)(struct sk_buff *skb); int (*fragment)(struct net *net, struct sock *sk, struct sk_buff *skb, @@ -179,16 +179,16 @@ static inline u32 nf_ipv6_cookie_init_sequence(const struct ipv6hdr *iph, } static inline int nf_cookie_v6_check(const struct ipv6hdr *iph, - const struct tcphdr *th, __u32 cookie) + const struct tcphdr *th) { #if IS_ENABLED(CONFIG_SYN_COOKIES) #if IS_MODULE(CONFIG_IPV6) const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops(); if (v6_ops) - return v6_ops->cookie_v6_check(iph, th, cookie); + return v6_ops->cookie_v6_check(iph, th); #elif IS_BUILTIN(CONFIG_IPV6) - return __cookie_v6_check(iph, th, cookie); + return __cookie_v6_check(iph, th); #endif #endif return 0; diff --git a/include/net/tcp.h b/include/net/tcp.h index d2f0736b76b8..2b2c79c7bbcd 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -491,8 +491,7 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb); struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst, u32 tsoff); -int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th, - u32 cookie); +int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th); struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb); struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, const struct tcp_request_sock_ops *af_ops, @@ -586,8 +585,7 @@ bool cookie_ecn_ok(const struct tcp_options_received *opt, const struct net *net, const struct dst_entry *dst); /* From net/ipv6/syncookies.c */ -int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th, - u32 cookie); +int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th); struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb); u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph, -- cgit v1.2.3 From efce3d1fdff53ef45b11ff407f16bc2f6f292b93 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 28 Nov 2023 18:29:21 -0800 Subject: tcp: Don't initialise tp->tsoffset in tcp_get_cookie_sock(). When we create a full socket from SYN Cookie, we initialise tcp_sk(sk)->tsoffset redundantly in tcp_get_cookie_sock() as the field is inherited from tcp_rsk(req)->ts_off. cookie_v[46]_check |- treq->ts_off = 0 `- tcp_get_cookie_sock |- tcp_v[46]_syn_recv_sock | `- tcp_create_openreq_child | `- newtp->tsoffset = treq->ts_off `- tcp_sk(child)->tsoffset = tsoff Let's initialise tcp_rsk(req)->ts_off with the correct offset and remove the second initialisation of tcp_sk(sk)->tsoffset. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Simon Horman Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20231129022924.96156-6-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 2b2c79c7bbcd..cc7143a781da 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -490,7 +490,7 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb); /* From syncookies.c */ struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, struct request_sock *req, - struct dst_entry *dst, u32 tsoff); + struct dst_entry *dst); int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th); struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb); struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, -- cgit v1.2.3 From 7b0f570f879adecf12329ecd60485e7e6b4783c1 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 28 Nov 2023 18:29:22 -0800 Subject: tcp: Move TCP-AO bits from cookie_v[46]_check() to tcp_ao_syncookie(). We initialise treq->af_specific in cookie_tcp_reqsk_alloc() so that we can look up a key later in tcp_create_openreq_child(). Initially, that change was added for MD5 by commit ba5a4fdd63ae ("tcp: make sure treq->af_specific is initialized"), but it has not been used since commit d0f2b7a9ca0a ("tcp: Disable header prediction for MD5 flow."). Now, treq->af_specific is used only by TCP-AO, so, we can move that initialisation into tcp_ao_syncookie(). In addition to that, l3index in cookie_v[46]_check() is only used for tcp_ao_syncookie(), so let's move it as well. While at it, we move down tcp_ao_syncookie() in cookie_v4_check() so that it will be called after security_inet_conn_request() to make functions order consistent with cookie_v6_check(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Simon Horman Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20231129022924.96156-7-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 1 - include/net/tcp_ao.h | 6 ++---- 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index cc7143a781da..d4d0e9763175 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -494,7 +494,6 @@ struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th); struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb); struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, - const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct sk_buff *skb); #ifdef CONFIG_SYN_COOKIES diff --git a/include/net/tcp_ao.h b/include/net/tcp_ao.h index b56be10838f0..4d900ef8dfc3 100644 --- a/include/net/tcp_ao.h +++ b/include/net/tcp_ao.h @@ -265,8 +265,7 @@ void tcp_ao_established(struct sock *sk); void tcp_ao_finish_connect(struct sock *sk, struct sk_buff *skb); void tcp_ao_connect_init(struct sock *sk); void tcp_ao_syncookie(struct sock *sk, const struct sk_buff *skb, - struct tcp_request_sock *treq, - unsigned short int family, int l3index); + struct request_sock *req, unsigned short int family); #else /* CONFIG_TCP_AO */ static inline int tcp_ao_transmit_skb(struct sock *sk, struct sk_buff *skb, @@ -277,8 +276,7 @@ static inline int tcp_ao_transmit_skb(struct sock *sk, struct sk_buff *skb, } static inline void tcp_ao_syncookie(struct sock *sk, const struct sk_buff *skb, - struct tcp_request_sock *treq, - unsigned short int family, int l3index) + struct request_sock *req, unsigned short int family) { } -- cgit v1.2.3 From 8e7bab6b9652cd07a05ee0380b623c0e00e3eb00 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 28 Nov 2023 18:29:24 -0800 Subject: tcp: Factorise cookie-dependent fields initialisation in cookie_v[46]_check() We will support arbitrary SYN Cookie with BPF, and then kfunc at TC will preallocate reqsk and initialise some fields that should not be overwritten later by cookie_v[46]_check(). To simplify the flow in cookie_v[46]_check(), we move such fields' initialisation to cookie_tcp_reqsk_alloc() and factorise non-BPF SYN Cookie handling into cookie_tcp_check(), where we validate the cookie and allocate reqsk, as done by kfunc later. Note that we set ireq->ecn_ok in two steps, the latter of which will be shared by the BPF case. As cookie_ecn_ok() is one-liner, now it's inlined. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Simon Horman Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20231129022924.96156-9-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index d4d0e9763175..973555cb1d3f 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -494,7 +494,10 @@ struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th); struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb); struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, - struct sock *sk, struct sk_buff *skb); + struct sock *sk, struct sk_buff *skb, + struct tcp_options_received *tcp_opt, + int mss, u32 tsoff); + #ifdef CONFIG_SYN_COOKIES /* Syncookies use a monotonic timer which increments every 60 seconds. @@ -580,8 +583,12 @@ __u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mss); u64 cookie_init_timestamp(struct request_sock *req, u64 now); bool cookie_timestamp_decode(const struct net *net, struct tcp_options_received *opt); -bool cookie_ecn_ok(const struct tcp_options_received *opt, - const struct net *net, const struct dst_entry *dst); + +static inline bool cookie_ecn_ok(const struct net *net, const struct dst_entry *dst) +{ + return READ_ONCE(net->ipv4.sysctl_tcp_ecn) || + dst_feature(dst, RTAX_FEATURE_ECN); +} /* From net/ipv6/syncookies.c */ int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th); -- cgit v1.2.3 From df16c1c51d8166958f533c0c886766f7ee9dd50f Mon Sep 17 00:00:00 2001 From: Andrew Halaney Date: Mon, 27 Nov 2023 15:41:10 -0600 Subject: net: phy: mdio_device: Reset device only when necessary Currently the phy reset sequence is as shown below for a devicetree described mdio phy on boot: 1. Assert the phy_device's reset as part of registering 2. Deassert the phy_device's reset as part of registering 3. Deassert the phy_device's reset as part of phy_probe 4. Deassert the phy_device's reset as part of phy_hw_init The extra two deasserts include waiting the deassert delay afterwards, which is adding unnecessary delay. This applies to both possible types of resets (reset controller reference and a reset gpio) that can be used. Here's some snipped tracing output using the following command line params "trace_event=gpio:* trace_options=stacktrace" illustrating the reset handling and where its coming from: /* Assert */ systemd-udevd-283 [002] ..... 6.780434: gpio_value: 544 set 0 systemd-udevd-283 [002] ..... 6.783849: => gpiod_set_raw_value_commit => gpiod_set_value_nocheck => gpiod_set_value_cansleep => mdio_device_reset => mdiobus_register_device => phy_device_register => fwnode_mdiobus_phy_device_register => fwnode_mdiobus_register_phy => __of_mdiobus_register => stmmac_mdio_register => stmmac_dvr_probe => stmmac_pltfr_probe => devm_stmmac_pltfr_probe => qcom_ethqos_probe => platform_probe /* Deassert */ systemd-udevd-283 [002] ..... 6.802480: gpio_value: 544 set 1 systemd-udevd-283 [002] ..... 6.805886: => gpiod_set_raw_value_commit => gpiod_set_value_nocheck => gpiod_set_value_cansleep => mdio_device_reset => phy_device_register => fwnode_mdiobus_phy_device_register => fwnode_mdiobus_register_phy => __of_mdiobus_register => stmmac_mdio_register => stmmac_dvr_probe => stmmac_pltfr_probe => devm_stmmac_pltfr_probe => qcom_ethqos_probe => platform_probe /* Deassert */ systemd-udevd-283 [002] ..... 6.882601: gpio_value: 544 set 1 systemd-udevd-283 [002] ..... 6.886014: => gpiod_set_raw_value_commit => gpiod_set_value_nocheck => gpiod_set_value_cansleep => mdio_device_reset => phy_probe => really_probe => __driver_probe_device => driver_probe_device => __device_attach_driver => bus_for_each_drv => __device_attach => device_initial_probe => bus_probe_device => device_add => phy_device_register => fwnode_mdiobus_phy_device_register => fwnode_mdiobus_register_phy => __of_mdiobus_register => stmmac_mdio_register => stmmac_dvr_probe => stmmac_pltfr_probe => devm_stmmac_pltfr_probe => qcom_ethqos_probe => platform_probe /* Deassert */ NetworkManager-477 [000] ..... 7.023144: gpio_value: 544 set 1 NetworkManager-477 [000] ..... 7.026596: => gpiod_set_raw_value_commit => gpiod_set_value_nocheck => gpiod_set_value_cansleep => mdio_device_reset => phy_init_hw => phy_attach_direct => phylink_fwnode_phy_connect => __stmmac_open => stmmac_open There's a lot of paths where the device is getting its reset asserted and deasserted. Let's track the state and only actually do the assert/deassert when it changes. Reported-by: Sagar Cheluvegowda Signed-off-by: Andrew Halaney Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20231127-net-phy-reset-once-v2-1-448e8658779e@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/mdio.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 007fd9c3e4b6..79ceee3c8673 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -38,6 +38,7 @@ struct mdio_device { /* Bus address of the MDIO device (0-31) */ int addr; int flags; + int reset_state; struct gpio_desc *reset_gpio; struct reset_control *reset_ctrl; unsigned int reset_assert_delay; -- cgit v1.2.3 From 45b5623f2d721c25d1a2fdc8c4600fb4b7b61c75 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 2 Dec 2023 09:56:55 -0800 Subject: bpf: rearrange bpf_func_state fields to save a bit of memory It's a trivial rearrangement saving 8 bytes. We have 4 bytes of padding at the end which can be filled with another field without increasing struct bpf_func_state. copy_func_state() logic remains correct without any further changes. BEFORE ====== struct bpf_func_state { struct bpf_reg_state regs[11]; /* 0 1320 */ /* --- cacheline 20 boundary (1280 bytes) was 40 bytes ago --- */ int callsite; /* 1320 4 */ u32 frameno; /* 1324 4 */ u32 subprogno; /* 1328 4 */ u32 async_entry_cnt; /* 1332 4 */ bool in_callback_fn; /* 1336 1 */ /* XXX 7 bytes hole, try to pack */ /* --- cacheline 21 boundary (1344 bytes) --- */ struct tnum callback_ret_range; /* 1344 16 */ bool in_async_callback_fn; /* 1360 1 */ bool in_exception_callback_fn; /* 1361 1 */ /* XXX 2 bytes hole, try to pack */ int acquired_refs; /* 1364 4 */ struct bpf_reference_state * refs; /* 1368 8 */ int allocated_stack; /* 1376 4 */ /* XXX 4 bytes hole, try to pack */ struct bpf_stack_state * stack; /* 1384 8 */ /* size: 1392, cachelines: 22, members: 13 */ /* sum members: 1379, holes: 3, sum holes: 13 */ /* last cacheline: 48 bytes */ }; AFTER ===== struct bpf_func_state { struct bpf_reg_state regs[11]; /* 0 1320 */ /* --- cacheline 20 boundary (1280 bytes) was 40 bytes ago --- */ int callsite; /* 1320 4 */ u32 frameno; /* 1324 4 */ u32 subprogno; /* 1328 4 */ u32 async_entry_cnt; /* 1332 4 */ struct tnum callback_ret_range; /* 1336 16 */ /* --- cacheline 21 boundary (1344 bytes) was 8 bytes ago --- */ bool in_callback_fn; /* 1352 1 */ bool in_async_callback_fn; /* 1353 1 */ bool in_exception_callback_fn; /* 1354 1 */ /* XXX 1 byte hole, try to pack */ int acquired_refs; /* 1356 4 */ struct bpf_reference_state * refs; /* 1360 8 */ struct bpf_stack_state * stack; /* 1368 8 */ int allocated_stack; /* 1376 4 */ /* size: 1384, cachelines: 22, members: 13 */ /* sum members: 1379, holes: 1, sum holes: 1 */ /* padding: 4 */ /* last cacheline: 40 bytes */ }; Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231202175705.885270-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index d99a636d36a7..0c0e1bccad45 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -297,8 +297,8 @@ struct bpf_func_state { * void foo(void) { bpf_timer_set_callback(,foo); } */ u32 async_entry_cnt; - bool in_callback_fn; struct tnum callback_ret_range; + bool in_callback_fn; bool in_async_callback_fn; bool in_exception_callback_fn; /* For callback calling functions that limit number of possible @@ -316,8 +316,8 @@ struct bpf_func_state { /* The following fields should be last. See copy_func_state() */ int acquired_refs; struct bpf_reference_state *refs; - int allocated_stack; struct bpf_stack_state *stack; + int allocated_stack; }; struct bpf_idx_pair { -- cgit v1.2.3 From 8fa4ecd49b81ccd9d1d87f1c8b2260e218644878 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 2 Dec 2023 09:56:58 -0800 Subject: bpf: enforce exact retval range on subprog/callback exit Instead of relying on potentially imprecise tnum representation of expected return value range for callbacks and subprogs, validate that smin/smax range satisfy exact expected range of return values. E.g., if callback would need to return [0, 2] range, tnum can't represent this precisely and instead will allow [0, 3] range. By checking smin/smax range, we can make sure that subprog/callback indeed returns only valid [0, 2] range. Acked-by: Eduard Zingerman Acked-by: Shung-Hsi Yu Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231202175705.885270-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 0c0e1bccad45..3378cc753061 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -275,6 +275,11 @@ struct bpf_reference_state { int callback_ref; }; +struct bpf_retval_range { + s32 minval; + s32 maxval; +}; + /* state of the program: * type of all registers and stack info */ @@ -297,7 +302,7 @@ struct bpf_func_state { * void foo(void) { bpf_timer_set_callback(,foo); } */ u32 async_entry_cnt; - struct tnum callback_ret_range; + struct bpf_retval_range callback_ret_range; bool in_callback_fn; bool in_async_callback_fn; bool in_exception_callback_fn; -- cgit v1.2.3 From aeb9ce058d7c6193dc41e06b3a5b29d22c446b14 Mon Sep 17 00:00:00 2001 From: Coco Li Date: Wed, 29 Nov 2023 07:27:53 +0000 Subject: cache: enforce cache groups Set up build time warnings to safeguard against future header changes of organized structs. Warning includes: 1) whether all variables are still in the same cache group 2) whether all the cache groups have the sum of the members size (in the maximum condition, including all members defined in configs) The __cache_group* variables are ignored in kernel-doc check in the various header files they appear in to enforce the cache groups. Suggested-by: Daniel Borkmann Acked-by: Daniel Borkmann Signed-off-by: Coco Li Reviewed-by: Eric Dumazet Reviewed-by: Shakeel Butt Signed-off-by: David S. Miller --- include/linux/cache.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include') diff --git a/include/linux/cache.h b/include/linux/cache.h index 9900d20b76c2..0ecb17bb6883 100644 --- a/include/linux/cache.h +++ b/include/linux/cache.h @@ -85,6 +85,31 @@ #define cache_line_size() L1_CACHE_BYTES #endif +#ifndef __cacheline_group_begin +#define __cacheline_group_begin(GROUP) \ + __u8 __cacheline_group_begin__##GROUP[0] +#endif + +#ifndef __cacheline_group_end +#define __cacheline_group_end(GROUP) \ + __u8 __cacheline_group_end__##GROUP[0] +#endif + +#ifndef CACHELINE_ASSERT_GROUP_MEMBER +#define CACHELINE_ASSERT_GROUP_MEMBER(TYPE, GROUP, MEMBER) \ + BUILD_BUG_ON(!(offsetof(TYPE, MEMBER) >= \ + offsetofend(TYPE, __cacheline_group_begin__##GROUP) && \ + offsetofend(TYPE, MEMBER) <= \ + offsetof(TYPE, __cacheline_group_end__##GROUP))) +#endif + +#ifndef CACHELINE_ASSERT_GROUP_SIZE +#define CACHELINE_ASSERT_GROUP_SIZE(TYPE, GROUP, SIZE) \ + BUILD_BUG_ON(offsetof(TYPE, __cacheline_group_end__##GROUP) - \ + offsetofend(TYPE, __cacheline_group_begin__##GROUP) > \ + SIZE) +#endif + /* * Helper to add padding within a struct to ensure data fall into separate * cachelines. -- cgit v1.2.3 From 18fd64d2542292713b0322e6815be059bdee440c Mon Sep 17 00:00:00 2001 From: Coco Li Date: Wed, 29 Nov 2023 07:27:54 +0000 Subject: netns-ipv4: reorganize netns_ipv4 fast path variables Reorganize fast path variables on tx-txrx-rx order. Fastpath cacheline ends after sysctl_tcp_rmem. There are only read-only variables here. (write is on the control path and not considered in this case) Below data generated with pahole on x86 architecture. Fast path variables span cache lines before change: 4 Fast path variables span cache lines after change: 2 Suggested-by: Eric Dumazet Reviewed-by: Wei Wang Reviewed-by: David Ahern Signed-off-by: Coco Li Reviewed-by: Eric Dumazet Reviewed-by: Shakeel Butt Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 47 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 73f43f699199..ea882964c71e 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -42,6 +42,38 @@ struct inet_timewait_death_row { struct tcp_fastopen_context; struct netns_ipv4 { + /* Cacheline organization can be found documented in + * Documentation/networking/net_cachelines/netns_ipv4_sysctl.rst. + * Please update the document when adding new fields. + */ + + /* TX readonly hotpath cache lines */ + __cacheline_group_begin(netns_ipv4_read_tx); + u8 sysctl_tcp_early_retrans; + u8 sysctl_tcp_tso_win_divisor; + u8 sysctl_tcp_tso_rtt_log; + u8 sysctl_tcp_autocorking; + int sysctl_tcp_min_snd_mss; + unsigned int sysctl_tcp_notsent_lowat; + int sysctl_tcp_limit_output_bytes; + int sysctl_tcp_min_rtt_wlen; + int sysctl_tcp_wmem[3]; + u8 sysctl_ip_fwd_use_pmtu; + __cacheline_group_end(netns_ipv4_read_tx); + + /* TXRX readonly hotpath cache lines */ + __cacheline_group_begin(netns_ipv4_read_txrx); + u8 sysctl_tcp_moderate_rcvbuf; + __cacheline_group_end(netns_ipv4_read_txrx); + + /* RX readonly hotpath cache line */ + __cacheline_group_begin(netns_ipv4_read_rx); + u8 sysctl_ip_early_demux; + u8 sysctl_tcp_early_demux; + int sysctl_tcp_reordering; + int sysctl_tcp_rmem[3]; + __cacheline_group_end(netns_ipv4_read_rx); + struct inet_timewait_death_row tcp_death_row; struct udp_table *udp_table; @@ -96,17 +128,14 @@ struct netns_ipv4 { u8 sysctl_ip_default_ttl; u8 sysctl_ip_no_pmtu_disc; - u8 sysctl_ip_fwd_use_pmtu; u8 sysctl_ip_fwd_update_priority; u8 sysctl_ip_nonlocal_bind; u8 sysctl_ip_autobind_reuse; /* Shall we try to damage output packets if routing dev changes? */ u8 sysctl_ip_dynaddr; - u8 sysctl_ip_early_demux; #ifdef CONFIG_NET_L3_MASTER_DEV u8 sysctl_raw_l3mdev_accept; #endif - u8 sysctl_tcp_early_demux; u8 sysctl_udp_early_demux; u8 sysctl_nexthop_compat_mode; @@ -119,7 +148,6 @@ struct netns_ipv4 { u8 sysctl_tcp_mtu_probing; int sysctl_tcp_mtu_probe_floor; int sysctl_tcp_base_mss; - int sysctl_tcp_min_snd_mss; int sysctl_tcp_probe_threshold; u32 sysctl_tcp_probe_interval; @@ -135,17 +163,14 @@ struct netns_ipv4 { u8 sysctl_tcp_backlog_ack_defer; u8 sysctl_tcp_pingpong_thresh; - int sysctl_tcp_reordering; u8 sysctl_tcp_retries1; u8 sysctl_tcp_retries2; u8 sysctl_tcp_orphan_retries; u8 sysctl_tcp_tw_reuse; int sysctl_tcp_fin_timeout; - unsigned int sysctl_tcp_notsent_lowat; u8 sysctl_tcp_sack; u8 sysctl_tcp_window_scaling; u8 sysctl_tcp_timestamps; - u8 sysctl_tcp_early_retrans; u8 sysctl_tcp_recovery; u8 sysctl_tcp_thin_linear_timeouts; u8 sysctl_tcp_slow_start_after_idle; @@ -161,21 +186,13 @@ struct netns_ipv4 { u8 sysctl_tcp_frto; u8 sysctl_tcp_nometrics_save; u8 sysctl_tcp_no_ssthresh_metrics_save; - u8 sysctl_tcp_moderate_rcvbuf; - u8 sysctl_tcp_tso_win_divisor; u8 sysctl_tcp_workaround_signed_windows; - int sysctl_tcp_limit_output_bytes; int sysctl_tcp_challenge_ack_limit; - int sysctl_tcp_min_rtt_wlen; u8 sysctl_tcp_min_tso_segs; - u8 sysctl_tcp_tso_rtt_log; - u8 sysctl_tcp_autocorking; u8 sysctl_tcp_reflect_tos; int sysctl_tcp_invalid_ratelimit; int sysctl_tcp_pacing_ss_ratio; int sysctl_tcp_pacing_ca_ratio; - int sysctl_tcp_wmem[3]; - int sysctl_tcp_rmem[3]; unsigned int sysctl_tcp_child_ehash_entries; unsigned long sysctl_tcp_comp_sack_delay_ns; unsigned long sysctl_tcp_comp_sack_slack_ns; -- cgit v1.2.3 From 91051f003948432f83b5d2766eeb83b2b4993649 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 1 Dec 2023 15:49:52 +0100 Subject: tcp: Dump bound-only sockets in inet_diag. Walk the hashinfo->bhash2 table so that inet_diag can dump TCP sockets that are bound but haven't yet called connect() or listen(). The code is inspired by the ->lhash2 loop. However there's no manual test of the source port, since this kind of filtering is already handled by inet_diag_bc_sk(). Also, a maximum of 16 sockets are dumped at a time, to avoid running with bh disabled for too long. There's no TCP state for bound but otherwise inactive sockets. Such sockets normally map to TCP_CLOSE. However, "ss -l", which is supposed to only dump listening sockets, actually requests the kernel to dump sockets in either the TCP_LISTEN or TCP_CLOSE states. To avoid dumping bound-only sockets with "ss -l", we therefore need to define a new pseudo-state (TCP_BOUND_INACTIVE) that user space will be able to set explicitly. With an IPv4, an IPv6 and an IPv6-only socket, bound respectively to 40000, 64000, 60000, an updated version of iproute2 could work as follow: $ ss -t state bound-inactive Recv-Q Send-Q Local Address:Port Peer Address:Port Process 0 0 0.0.0.0:40000 0.0.0.0:* 0 0 [::]:60000 [::]:* 0 0 *:64000 *:* Reviewed-by: Eric Dumazet Signed-off-by: Guillaume Nault Reviewed-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/b3a84ae61e19c06806eea9c602b3b66e8f0cfc81.1701362867.git.gnault@redhat.com Signed-off-by: Jakub Kicinski --- include/net/tcp_states.h | 2 ++ include/uapi/linux/bpf.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/tcp_states.h b/include/net/tcp_states.h index cc00118acca1..d60e8148ff4c 100644 --- a/include/net/tcp_states.h +++ b/include/net/tcp_states.h @@ -22,6 +22,7 @@ enum { TCP_LISTEN, TCP_CLOSING, /* Now a valid state */ TCP_NEW_SYN_RECV, + TCP_BOUND_INACTIVE, /* Pseudo-state for inet_diag */ TCP_MAX_STATES /* Leave at the end! */ }; @@ -43,6 +44,7 @@ enum { TCPF_LISTEN = (1 << TCP_LISTEN), TCPF_CLOSING = (1 << TCP_CLOSING), TCPF_NEW_SYN_RECV = (1 << TCP_NEW_SYN_RECV), + TCPF_BOUND_INACTIVE = (1 << TCP_BOUND_INACTIVE), }; #endif /* _LINUX_TCP_STATES_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e88746ba7d21..b1e8c5bdfc82 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6902,6 +6902,7 @@ enum { BPF_TCP_LISTEN, BPF_TCP_CLOSING, /* Now a valid state */ BPF_TCP_NEW_SYN_RECV, + BPF_TCP_BOUND_INACTIVE, BPF_TCP_MAX_STATES /* Leave at the end! */ }; -- cgit v1.2.3 From 20c20bd11a0702ce4dc9300c3da58acf551d9725 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Mon, 4 Dec 2023 22:04:20 +0800 Subject: bpf: Add map and need_defer parameters to .map_fd_put_ptr() map is the pointer of outer map, and need_defer needs some explanation. need_defer tells the implementation to defer the reference release of the passed element and ensure that the element is still alive before the bpf program, which may manipulate it, exits. The following three cases will invoke map_fd_put_ptr() and different need_defer values will be passed to these callers: 1) release the reference of the old element in the map during map update or map deletion. The release must be deferred, otherwise the bpf program may incur use-after-free problem, so need_defer needs to be true. 2) release the reference of the to-be-added element in the error path of map update. The to-be-added element is not visible to any bpf program, so it is OK to pass false for need_defer parameter. 3) release the references of all elements in the map during map release. Any bpf program which has access to the map must have been exited and released, so need_defer=false will be OK. These two parameters will be used by the following patches to fix the potential use-after-free problem for map-in-map. Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20231204140425.1480317-3-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index eb447b0a9423..d273348cfb2f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -106,7 +106,11 @@ struct bpf_map_ops { /* funcs called by prog_array and perf_event_array map */ void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file, int fd); - void (*map_fd_put_ptr)(void *ptr); + /* If need_defer is true, the implementation should guarantee that + * the to-be-put element is still alive before the bpf program, which + * may manipulate it, exists. + */ + void (*map_fd_put_ptr)(struct bpf_map *map, void *ptr, bool need_defer); int (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf); u32 (*map_fd_sys_lookup_elem)(void *ptr); void (*map_seq_show_elem)(struct bpf_map *map, void *key, -- cgit v1.2.3 From 876673364161da50eed6b472d746ef88242b2368 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Mon, 4 Dec 2023 22:04:22 +0800 Subject: bpf: Defer the free of inner map when necessary When updating or deleting an inner map in map array or map htab, the map may still be accessed by non-sleepable program or sleepable program. However bpf_map_fd_put_ptr() decreases the ref-counter of the inner map directly through bpf_map_put(), if the ref-counter is the last one (which is true for most cases), the inner map will be freed by ops->map_free() in a kworker. But for now, most .map_free() callbacks don't use synchronize_rcu() or its variants to wait for the elapse of a RCU grace period, so after the invocation of ops->map_free completes, the bpf program which is accessing the inner map may incur use-after-free problem. Fix the free of inner map by invoking bpf_map_free_deferred() after both one RCU grace period and one tasks trace RCU grace period if the inner map has been removed from the outer map before. The deferment is accomplished by using call_rcu() or call_rcu_tasks_trace() when releasing the last ref-counter of bpf map. The newly-added rcu_head field in bpf_map shares the same storage space with work field to reduce the size of bpf_map. Fixes: bba1dc0b55ac ("bpf: Remove redundant synchronize_rcu.") Fixes: 638e4b825d52 ("bpf: Allows per-cpu maps and map-in-map in sleepable programs") Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20231204140425.1480317-5-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d273348cfb2f..de3bd03cbeea 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -276,7 +276,11 @@ struct bpf_map { */ atomic64_t refcnt ____cacheline_aligned; atomic64_t usercnt; - struct work_struct work; + /* rcu is used before freeing and work is only used during freeing */ + union { + struct work_struct work; + struct rcu_head rcu; + }; struct mutex freeze_mutex; atomic64_t writecnt; /* 'Ownership' of program-containing map is claimed by the first program @@ -292,6 +296,7 @@ struct bpf_map { } owner; bool bypass_spec_v1; bool frozen; /* write-once; write-protected by freeze_mutex */ + bool free_after_mult_rcu_gp; s64 __percpu *elem_count; }; -- cgit v1.2.3 From af66bfd3c8538ed21cf72af18426fc4a408665cf Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Mon, 4 Dec 2023 22:04:23 +0800 Subject: bpf: Optimize the free of inner map When removing the inner map from the outer map, the inner map will be freed after one RCU grace period and one RCU tasks trace grace period, so it is certain that the bpf program, which may access the inner map, has exited before the inner map is freed. However there is no need to wait for one RCU tasks trace grace period if the outer map is only accessed by non-sleepable program. So adding sleepable_refcnt in bpf_map and increasing sleepable_refcnt when adding the outer map into env->used_maps for sleepable program. Although the max number of bpf program is INT_MAX - 1, the number of bpf programs which are being loaded may be greater than INT_MAX, so using atomic64_t instead of atomic_t for sleepable_refcnt. When removing the inner map from the outer map, using sleepable_refcnt to decide whether or not a RCU tasks trace grace period is needed before freeing the inner map. Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20231204140425.1480317-6-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index de3bd03cbeea..10e5e4d8a00f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -297,6 +297,8 @@ struct bpf_map { bool bypass_spec_v1; bool frozen; /* write-once; write-protected by freeze_mutex */ bool free_after_mult_rcu_gp; + bool free_after_rcu_gp; + atomic64_t sleepable_refcnt; s64 __percpu *elem_count; }; -- cgit v1.2.3 From bc877956272f0521fef107838555817112a450dc Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Fri, 1 Dec 2023 15:28:29 -0800 Subject: netdev-genl: spec: Extend netdev netlink spec in YAML for queue Add support in netlink spec(netdev.yaml) for queue information. Add code generated from the spec. Note: The "queue-type" attribute takes values 0 and 1 for rx and tx queue type respectively. Signed-off-by: Amritha Nambiar Reviewed-by: Sridhar Samudrala Link: https://lore.kernel.org/r/170147330963.5260.2576294626647300472.stgit@anambiarhost.jf.intel.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/netdev.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 6244c0164976..f857960a7f06 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -62,6 +62,11 @@ enum netdev_xsk_flags { NETDEV_XSK_FLAGS_TX_CHECKSUM = 2, }; +enum netdev_queue_type { + NETDEV_QUEUE_TYPE_RX, + NETDEV_QUEUE_TYPE_TX, +}; + enum { NETDEV_A_DEV_IFINDEX = 1, NETDEV_A_DEV_PAD, @@ -104,6 +109,16 @@ enum { NETDEV_A_PAGE_POOL_STATS_MAX = (__NETDEV_A_PAGE_POOL_STATS_MAX - 1) }; +enum { + NETDEV_A_QUEUE_ID = 1, + NETDEV_A_QUEUE_IFINDEX, + NETDEV_A_QUEUE_TYPE, + NETDEV_A_QUEUE_NAPI_ID, + + __NETDEV_A_QUEUE_MAX, + NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1) +}; + enum { NETDEV_CMD_DEV_GET = 1, NETDEV_CMD_DEV_ADD_NTF, @@ -114,6 +129,7 @@ enum { NETDEV_CMD_PAGE_POOL_DEL_NTF, NETDEV_CMD_PAGE_POOL_CHANGE_NTF, NETDEV_CMD_PAGE_POOL_STATS_GET, + NETDEV_CMD_QUEUE_GET, __NETDEV_CMD_MAX, NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1) -- cgit v1.2.3 From 2a502ff0c4e42a739b5aa550c901bf3852795532 Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Fri, 1 Dec 2023 15:28:34 -0800 Subject: net: Add queue and napi association Add the napi pointer in netdev queue for tracking the napi instance for each queue. This achieves the queue<->napi mapping. Signed-off-by: Amritha Nambiar Reviewed-by: Sridhar Samudrala Link: https://lore.kernel.org/r/170147331483.5260.15723438819994285695.stgit@anambiarhost.jf.intel.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 8 ++++++++ include/net/netdev_rx_queue.h | 4 ++++ 2 files changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c2d74bc112dd..5ddff11cbe26 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -665,6 +665,10 @@ struct netdev_queue { #ifdef CONFIG_XDP_SOCKETS struct xsk_buff_pool *pool; #endif + /* NAPI instance for the queue + * Readers and writers must hold RTNL + */ + struct napi_struct *napi; /* * write-mostly part */ @@ -2657,6 +2661,10 @@ static inline void *netdev_priv(const struct net_device *dev) */ #define SET_NETDEV_DEVTYPE(net, devtype) ((net)->dev.type = (devtype)) +void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index, + enum netdev_queue_type type, + struct napi_struct *napi); + /* Default NAPI poll() weight * Device drivers are strongly advised to not use bigger value */ diff --git a/include/net/netdev_rx_queue.h b/include/net/netdev_rx_queue.h index cdcafb30d437..aa1716fb0e53 100644 --- a/include/net/netdev_rx_queue.h +++ b/include/net/netdev_rx_queue.h @@ -21,6 +21,10 @@ struct netdev_rx_queue { #ifdef CONFIG_XDP_SOCKETS struct xsk_buff_pool *pool; #endif + /* NAPI instance for the queue + * Readers and writers must hold RTNL + */ + struct napi_struct *napi; } ____cacheline_aligned_in_smp; /* -- cgit v1.2.3 From ff9991499fb53575c45eb92cd064bcd7141bb572 Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Fri, 1 Dec 2023 15:28:51 -0800 Subject: netdev-genl: spec: Extend netdev netlink spec in YAML for NAPI Add support in netlink spec(netdev.yaml) for napi related information. Add code generated from the spec. Signed-off-by: Amritha Nambiar Reviewed-by: Sridhar Samudrala Link: https://lore.kernel.org/r/170147333119.5260.7050639053080529108.stgit@anambiarhost.jf.intel.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/netdev.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index f857960a7f06..e7bdbcb01f22 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -109,6 +109,14 @@ enum { NETDEV_A_PAGE_POOL_STATS_MAX = (__NETDEV_A_PAGE_POOL_STATS_MAX - 1) }; +enum { + NETDEV_A_NAPI_IFINDEX = 1, + NETDEV_A_NAPI_ID, + + __NETDEV_A_NAPI_MAX, + NETDEV_A_NAPI_MAX = (__NETDEV_A_NAPI_MAX - 1) +}; + enum { NETDEV_A_QUEUE_ID = 1, NETDEV_A_QUEUE_IFINDEX, @@ -130,6 +138,7 @@ enum { NETDEV_CMD_PAGE_POOL_CHANGE_NTF, NETDEV_CMD_PAGE_POOL_STATS_GET, NETDEV_CMD_QUEUE_GET, + NETDEV_CMD_NAPI_GET, __NETDEV_CMD_MAX, NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1) -- cgit v1.2.3 From 5a5131d66fe02337de0b1b2e021b58f0f55c6df5 Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Fri, 1 Dec 2023 15:29:02 -0800 Subject: netdev-genl: spec: Add irq in netdev netlink YAML spec Add support in netlink spec(netdev.yaml) for interrupt number among the NAPI attributes. Add code generated from the spec. Signed-off-by: Amritha Nambiar Reviewed-by: Sridhar Samudrala Link: https://lore.kernel.org/r/170147334210.5260.18178387869057516983.stgit@anambiarhost.jf.intel.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/netdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index e7bdbcb01f22..30fea409b71e 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -112,6 +112,7 @@ enum { enum { NETDEV_A_NAPI_IFINDEX = 1, NETDEV_A_NAPI_ID, + NETDEV_A_NAPI_IRQ, __NETDEV_A_NAPI_MAX, NETDEV_A_NAPI_MAX = (__NETDEV_A_NAPI_MAX - 1) -- cgit v1.2.3 From 26793bfb5d6072326d1465343e7cbf6156abca4f Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Fri, 1 Dec 2023 15:29:07 -0800 Subject: net: Add NAPI IRQ support Add support to associate the interrupt vector number for a NAPI instance. Signed-off-by: Amritha Nambiar Reviewed-by: Sridhar Samudrala Link: https://lore.kernel.org/r/170147334728.5260.13221803396905901904.stgit@anambiarhost.jf.intel.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5ddff11cbe26..5551177e024e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -382,6 +382,7 @@ struct napi_struct { /* control-path-only fields follow */ struct list_head dev_list; struct hlist_node napi_hash_node; + int irq; }; enum { @@ -2665,6 +2666,11 @@ void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index, enum netdev_queue_type type, struct napi_struct *napi); +static inline void netif_napi_set_irq(struct napi_struct *napi, int irq) +{ + napi->irq = irq; +} + /* Default NAPI poll() weight * Device drivers are strongly advised to not use bigger value */ -- cgit v1.2.3 From 8481a249a0eaf0000dbb18f7689ccd50ea9835cd Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Fri, 1 Dec 2023 15:29:13 -0800 Subject: netdev-genl: spec: Add PID in netdev netlink YAML spec Add support in netlink spec(netdev.yaml) for PID of the NAPI thread. Add code generated from the spec. Signed-off-by: Amritha Nambiar Reviewed-by: Sridhar Samudrala Link: https://lore.kernel.org/r/170147335301.5260.11872351477120434501.stgit@anambiarhost.jf.intel.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/netdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 30fea409b71e..424c5e28f495 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -113,6 +113,7 @@ enum { NETDEV_A_NAPI_IFINDEX = 1, NETDEV_A_NAPI_ID, NETDEV_A_NAPI_IRQ, + NETDEV_A_NAPI_PID, __NETDEV_A_NAPI_MAX, NETDEV_A_NAPI_MAX = (__NETDEV_A_NAPI_MAX - 1) -- cgit v1.2.3 From 8ebe06611666a399162de31cdd6f2f48ffa87748 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 1 Dec 2023 16:19:42 +0800 Subject: net: bridge: add document for IFLA_BR enum Add document for IFLA_BR enum so we can use it in Documentation/networking/bridge.rst. Signed-off-by: Hangbin Liu Acked-by: Nikolay Aleksandrov Signed-off-by: Paolo Abeni --- include/uapi/linux/if_link.h | 280 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 280 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 8181ef23a7a2..a5f873c85a72 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -461,6 +461,286 @@ enum in6_addr_gen_mode { /* Bridge section */ +/** + * DOC: Bridge enum definition + * + * Please *note* that the timer values in the following section are expected + * in clock_t format, which is seconds multiplied by USER_HZ (generally + * defined as 100). + * + * @IFLA_BR_FORWARD_DELAY + * The bridge forwarding delay is the time spent in LISTENING state + * (before moving to LEARNING) and in LEARNING state (before moving + * to FORWARDING). Only relevant if STP is enabled. + * + * The valid values are between (2 * USER_HZ) and (30 * USER_HZ). + * The default value is (15 * USER_HZ). + * + * @IFLA_BR_HELLO_TIME + * The time between hello packets sent by the bridge, when it is a root + * bridge or a designated bridge. Only relevant if STP is enabled. + * + * The valid values are between (1 * USER_HZ) and (10 * USER_HZ). + * The default value is (2 * USER_HZ). + * + * @IFLA_BR_MAX_AGE + * The hello packet timeout is the time until another bridge in the + * spanning tree is assumed to be dead, after reception of its last hello + * message. Only relevant if STP is enabled. + * + * The valid values are between (6 * USER_HZ) and (40 * USER_HZ). + * The default value is (20 * USER_HZ). + * + * @IFLA_BR_AGEING_TIME + * Configure the bridge's FDB entries aging time. It is the time a MAC + * address will be kept in the FDB after a packet has been received from + * that address. After this time has passed, entries are cleaned up. + * Allow values outside the 802.1 standard specification for special cases: + * + * * 0 - entry never ages (all permanent) + * * 1 - entry disappears (no persistence) + * + * The default value is (300 * USER_HZ). + * + * @IFLA_BR_STP_STATE + * Turn spanning tree protocol on (*IFLA_BR_STP_STATE* > 0) or off + * (*IFLA_BR_STP_STATE* == 0) for this bridge. + * + * The default value is 0 (disabled). + * + * @IFLA_BR_PRIORITY + * Set this bridge's spanning tree priority, used during STP root bridge + * election. + * + * The valid values are between 0 and 65535. + * + * @IFLA_BR_VLAN_FILTERING + * Turn VLAN filtering on (*IFLA_BR_VLAN_FILTERING* > 0) or off + * (*IFLA_BR_VLAN_FILTERING* == 0). When disabled, the bridge will not + * consider the VLAN tag when handling packets. + * + * The default value is 0 (disabled). + * + * @IFLA_BR_VLAN_PROTOCOL + * Set the protocol used for VLAN filtering. + * + * The valid values are 0x8100(802.1Q) or 0x88A8(802.1AD). The default value + * is 0x8100(802.1Q). + * + * @IFLA_BR_GROUP_FWD_MASK + * The group forwarding mask. This is the bitmask that is applied to + * decide whether to forward incoming frames destined to link-local + * addresses (of the form 01:80:C2:00:00:0X). + * + * The default value is 0, which means the bridge does not forward any + * link-local frames coming on this port. + * + * @IFLA_BR_ROOT_ID + * The bridge root id, read only. + * + * @IFLA_BR_BRIDGE_ID + * The bridge id, read only. + * + * @IFLA_BR_ROOT_PORT + * The bridge root port, read only. + * + * @IFLA_BR_ROOT_PATH_COST + * The bridge root path cost, read only. + * + * @IFLA_BR_TOPOLOGY_CHANGE + * The bridge topology change, read only. + * + * @IFLA_BR_TOPOLOGY_CHANGE_DETECTED + * The bridge topology change detected, read only. + * + * @IFLA_BR_HELLO_TIMER + * The bridge hello timer, read only. + * + * @IFLA_BR_TCN_TIMER + * The bridge tcn timer, read only. + * + * @IFLA_BR_TOPOLOGY_CHANGE_TIMER + * The bridge topology change timer, read only. + * + * @IFLA_BR_GC_TIMER + * The bridge gc timer, read only. + * + * @IFLA_BR_GROUP_ADDR + * Set the MAC address of the multicast group this bridge uses for STP. + * The address must be a link-local address in standard Ethernet MAC address + * format. It is an address of the form 01:80:C2:00:00:0X, with X in [0, 4..f]. + * + * The default value is 0. + * + * @IFLA_BR_FDB_FLUSH + * Flush bridge's fdb dynamic entries. + * + * @IFLA_BR_MCAST_ROUTER + * Set bridge's multicast router if IGMP snooping is enabled. + * The valid values are: + * + * * 0 - disabled. + * * 1 - automatic (queried). + * * 2 - permanently enabled. + * + * The default value is 1. + * + * @IFLA_BR_MCAST_SNOOPING + * Turn multicast snooping on (*IFLA_BR_MCAST_SNOOPING* > 0) or off + * (*IFLA_BR_MCAST_SNOOPING* == 0). + * + * The default value is 1. + * + * @IFLA_BR_MCAST_QUERY_USE_IFADDR + * If enabled use the bridge's own IP address as source address for IGMP + * queries (*IFLA_BR_MCAST_QUERY_USE_IFADDR* > 0) or the default of 0.0.0.0 + * (*IFLA_BR_MCAST_QUERY_USE_IFADDR* == 0). + * + * The default value is 0 (disabled). + * + * @IFLA_BR_MCAST_QUERIER + * Enable (*IFLA_BR_MULTICAST_QUERIER* > 0) or disable + * (*IFLA_BR_MULTICAST_QUERIER* == 0) IGMP querier, ie sending of multicast + * queries by the bridge. + * + * The default value is 0 (disabled). + * + * @IFLA_BR_MCAST_HASH_ELASTICITY + * Set multicast database hash elasticity, It is the maximum chain length in + * the multicast hash table. This attribute is *deprecated* and the value + * is always 16. + * + * @IFLA_BR_MCAST_HASH_MAX + * Set maximum size of the multicast hash table + * + * The default value is 4096, the value must be a power of 2. + * + * @IFLA_BR_MCAST_LAST_MEMBER_CNT + * The Last Member Query Count is the number of Group-Specific Queries + * sent before the router assumes there are no local members. The Last + * Member Query Count is also the number of Group-and-Source-Specific + * Queries sent before the router assumes there are no listeners for a + * particular source. + * + * The default value is 2. + * + * @IFLA_BR_MCAST_STARTUP_QUERY_CNT + * The Startup Query Count is the number of Queries sent out on startup, + * separated by the Startup Query Interval. + * + * The default value is 2. + * + * @IFLA_BR_MCAST_LAST_MEMBER_INTVL + * The Last Member Query Interval is the Max Response Time inserted into + * Group-Specific Queries sent in response to Leave Group messages, and + * is also the amount of time between Group-Specific Query messages. + * + * The default value is (1 * USER_HZ). + * + * @IFLA_BR_MCAST_MEMBERSHIP_INTVL + * The interval after which the bridge will leave a group, if no membership + * reports for this group are received. + * + * The default value is (260 * USER_HZ). + * + * @IFLA_BR_MCAST_QUERIER_INTVL + * The interval between queries sent by other routers. if no queries are + * seen after this delay has passed, the bridge will start to send its own + * queries (as if *IFLA_BR_MCAST_QUERIER_INTVL* was enabled). + * + * The default value is (255 * USER_HZ). + * + * @IFLA_BR_MCAST_QUERY_INTVL + * The Query Interval is the interval between General Queries sent by + * the Querier. + * + * The default value is (125 * USER_HZ). The minimum value is (1 * USER_HZ). + * + * @IFLA_BR_MCAST_QUERY_RESPONSE_INTVL + * The Max Response Time used to calculate the Max Resp Code inserted + * into the periodic General Queries. + * + * The default value is (10 * USER_HZ). + * + * @IFLA_BR_MCAST_STARTUP_QUERY_INTVL + * The interval between queries in the startup phase. + * + * The default value is (125 * USER_HZ) / 4. The minimum value is (1 * USER_HZ). + * + * @IFLA_BR_NF_CALL_IPTABLES + * Enable (*NF_CALL_IPTABLES* > 0) or disable (*NF_CALL_IPTABLES* == 0) + * iptables hooks on the bridge. + * + * The default value is 0 (disabled). + * + * @IFLA_BR_NF_CALL_IP6TABLES + * Enable (*NF_CALL_IP6TABLES* > 0) or disable (*NF_CALL_IP6TABLES* == 0) + * ip6tables hooks on the bridge. + * + * The default value is 0 (disabled). + * + * @IFLA_BR_NF_CALL_ARPTABLES + * Enable (*NF_CALL_ARPTABLES* > 0) or disable (*NF_CALL_ARPTABLES* == 0) + * arptables hooks on the bridge. + * + * The default value is 0 (disabled). + * + * @IFLA_BR_VLAN_DEFAULT_PVID + * VLAN ID applied to untagged and priority-tagged incoming packets. + * + * The default value is 1. Setting to the special value 0 makes all ports of + * this bridge not have a PVID by default, which means that they will + * not accept VLAN-untagged traffic. + * + * @IFLA_BR_PAD + * Bridge attribute padding type for netlink message. + * + * @IFLA_BR_VLAN_STATS_ENABLED + * Enable (*IFLA_BR_VLAN_STATS_ENABLED* == 1) or disable + * (*IFLA_BR_VLAN_STATS_ENABLED* == 0) per-VLAN stats accounting. + * + * The default value is 0 (disabled). + * + * @IFLA_BR_MCAST_STATS_ENABLED + * Enable (*IFLA_BR_MCAST_STATS_ENABLED* > 0) or disable + * (*IFLA_BR_MCAST_STATS_ENABLED* == 0) multicast (IGMP/MLD) stats + * accounting. + * + * The default value is 0 (disabled). + * + * @IFLA_BR_MCAST_IGMP_VERSION + * Set the IGMP version. + * + * The valid values are 2 and 3. The default value is 2. + * + * @IFLA_BR_MCAST_MLD_VERSION + * Set the MLD version. + * + * The valid values are 1 and 2. The default value is 1. + * + * @IFLA_BR_VLAN_STATS_PER_PORT + * Enable (*IFLA_BR_VLAN_STATS_PER_PORT* == 1) or disable + * (*IFLA_BR_VLAN_STATS_PER_PORT* == 0) per-VLAN per-port stats accounting. + * Can be changed only when there are no port VLANs configured. + * + * The default value is 0 (disabled). + * + * @IFLA_BR_MULTI_BOOLOPT + * The multi_boolopt is used to control new boolean options to avoid adding + * new netlink attributes. You can look at ``enum br_boolopt_id`` for those + * options. + * + * @IFLA_BR_MCAST_QUERIER_STATE + * Bridge mcast querier states, read only. + * + * @IFLA_BR_FDB_N_LEARNED + * The number of dynamically learned FDB entries for the current bridge, + * read only. + * + * @IFLA_BR_FDB_MAX_LEARNED + * Set the number of max dynamically learned FDB entries for the current + * bridge. + */ enum { IFLA_BR_UNSPEC, IFLA_BR_FORWARD_DELAY, -- cgit v1.2.3 From 8c4bafdb01cc7809903aced4981f563e3708ea37 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 1 Dec 2023 16:19:43 +0800 Subject: net: bridge: add document for IFLA_BRPORT enum Add document for IFLA_BRPORT enum so we can use it in Documentation/networking/bridge.rst. Signed-off-by: Hangbin Liu Acked-by: Nikolay Aleksandrov Signed-off-by: Paolo Abeni --- include/uapi/linux/if_link.h | 241 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 241 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index a5f873c85a72..ab9bcff96e4d 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -802,11 +802,252 @@ struct ifla_bridge_id { __u8 addr[6]; /* ETH_ALEN */ }; +/** + * DOC: Bridge mode enum definition + * + * @BRIDGE_MODE_HAIRPIN + * Controls whether traffic may be sent back out of the port on which it + * was received. This option is also called reflective relay mode, and is + * used to support basic VEPA (Virtual Ethernet Port Aggregator) + * capabilities. By default, this flag is turned off and the bridge will + * not forward traffic back out of the receiving port. + */ enum { BRIDGE_MODE_UNSPEC, BRIDGE_MODE_HAIRPIN, }; +/** + * DOC: Bridge port enum definition + * + * @IFLA_BRPORT_STATE + * The operation state of the port. Here are the valid values. + * + * * 0 - port is in STP *DISABLED* state. Make this port completely + * inactive for STP. This is also called BPDU filter and could be used + * to disable STP on an untrusted port, like a leaf virtual device. + * The traffic forwarding is also stopped on this port. + * * 1 - port is in STP *LISTENING* state. Only valid if STP is enabled + * on the bridge. In this state the port listens for STP BPDUs and + * drops all other traffic frames. + * * 2 - port is in STP *LEARNING* state. Only valid if STP is enabled on + * the bridge. In this state the port will accept traffic only for the + * purpose of updating MAC address tables. + * * 3 - port is in STP *FORWARDING* state. Port is fully active. + * * 4 - port is in STP *BLOCKING* state. Only valid if STP is enabled on + * the bridge. This state is used during the STP election process. + * In this state, port will only process STP BPDUs. + * + * @IFLA_BRPORT_PRIORITY + * The STP port priority. The valid values are between 0 and 255. + * + * @IFLA_BRPORT_COST + * The STP path cost of the port. The valid values are between 1 and 65535. + * + * @IFLA_BRPORT_MODE + * Set the bridge port mode. See *BRIDGE_MODE_HAIRPIN* for more details. + * + * @IFLA_BRPORT_GUARD + * Controls whether STP BPDUs will be processed by the bridge port. By + * default, the flag is turned off to allow BPDU processing. Turning this + * flag on will disable the bridge port if a STP BPDU packet is received. + * + * If the bridge has Spanning Tree enabled, hostile devices on the network + * may send BPDU on a port and cause network failure. Setting *guard on* + * will detect and stop this by disabling the port. The port will be + * restarted if the link is brought down, or removed and reattached. + * + * @IFLA_BRPORT_PROTECT + * Controls whether a given port is allowed to become a root port or not. + * Only used when STP is enabled on the bridge. By default the flag is off. + * + * This feature is also called root port guard. If BPDU is received from a + * leaf (edge) port, it should not be elected as root port. This could + * be used if using STP on a bridge and the downstream bridges are not fully + * trusted; this prevents a hostile guest from rerouting traffic. + * + * @IFLA_BRPORT_FAST_LEAVE + * This flag allows the bridge to immediately stop multicast traffic + * forwarding on a port that receives an IGMP Leave message. It is only used + * when IGMP snooping is enabled on the bridge. By default the flag is off. + * + * @IFLA_BRPORT_LEARNING + * Controls whether a given port will learn *source* MAC addresses from + * received traffic or not. Also controls whether dynamic FDB entries + * (which can also be added by software) will be refreshed by incoming + * traffic. By default this flag is on. + * + * @IFLA_BRPORT_UNICAST_FLOOD + * Controls whether unicast traffic for which there is no FDB entry will + * be flooded towards this port. By default this flag is on. + * + * @IFLA_BRPORT_PROXYARP + * Enable proxy ARP on this port. + * + * @IFLA_BRPORT_LEARNING_SYNC + * Controls whether a given port will sync MAC addresses learned on device + * port to bridge FDB. + * + * @IFLA_BRPORT_PROXYARP_WIFI + * Enable proxy ARP on this port which meets extended requirements by + * IEEE 802.11 and Hotspot 2.0 specifications. + * + * @IFLA_BRPORT_ROOT_ID + * + * @IFLA_BRPORT_BRIDGE_ID + * + * @IFLA_BRPORT_DESIGNATED_PORT + * + * @IFLA_BRPORT_DESIGNATED_COST + * + * @IFLA_BRPORT_ID + * + * @IFLA_BRPORT_NO + * + * @IFLA_BRPORT_TOPOLOGY_CHANGE_ACK + * + * @IFLA_BRPORT_CONFIG_PENDING + * + * @IFLA_BRPORT_MESSAGE_AGE_TIMER + * + * @IFLA_BRPORT_FORWARD_DELAY_TIMER + * + * @IFLA_BRPORT_HOLD_TIMER + * + * @IFLA_BRPORT_FLUSH + * Flush bridge ports' fdb dynamic entries. + * + * @IFLA_BRPORT_MULTICAST_ROUTER + * Configure the port's multicast router presence. A port with + * a multicast router will receive all multicast traffic. + * The valid values are: + * + * * 0 disable multicast routers on this port + * * 1 let the system detect the presence of routers (default) + * * 2 permanently enable multicast traffic forwarding on this port + * * 3 enable multicast routers temporarily on this port, not depending + * on incoming queries. + * + * @IFLA_BRPORT_PAD + * + * @IFLA_BRPORT_MCAST_FLOOD + * Controls whether a given port will flood multicast traffic for which + * there is no MDB entry. By default this flag is on. + * + * @IFLA_BRPORT_MCAST_TO_UCAST + * Controls whether a given port will replicate packets using unicast + * instead of multicast. By default this flag is off. + * + * This is done by copying the packet per host and changing the multicast + * destination MAC to a unicast one accordingly. + * + * *mcast_to_unicast* works on top of the multicast snooping feature of the + * bridge. Which means unicast copies are only delivered to hosts which + * are interested in unicast and signaled this via IGMP/MLD reports previously. + * + * This feature is intended for interface types which have a more reliable + * and/or efficient way to deliver unicast packets than broadcast ones + * (e.g. WiFi). + * + * However, it should only be enabled on interfaces where no IGMPv2/MLDv1 + * report suppression takes place. IGMP/MLD report suppression issue is + * usually overcome by the network daemon (supplicant) enabling AP isolation + * and by that separating all STAs. + * + * Delivery of STA-to-STA IP multicast is made possible again by enabling + * and utilizing the bridge hairpin mode, which considers the incoming port + * as a potential outgoing port, too (see *BRIDGE_MODE_HAIRPIN* option). + * Hairpin mode is performed after multicast snooping, therefore leading + * to only deliver reports to STAs running a multicast router. + * + * @IFLA_BRPORT_VLAN_TUNNEL + * Controls whether vlan to tunnel mapping is enabled on the port. + * By default this flag is off. + * + * @IFLA_BRPORT_BCAST_FLOOD + * Controls flooding of broadcast traffic on the given port. By default + * this flag is on. + * + * @IFLA_BRPORT_GROUP_FWD_MASK + * Set the group forward mask. This is a bitmask that is applied to + * decide whether to forward incoming frames destined to link-local + * addresses. The addresses of the form are 01:80:C2:00:00:0X (defaults + * to 0, which means the bridge does not forward any link-local frames + * coming on this port). + * + * @IFLA_BRPORT_NEIGH_SUPPRESS + * Controls whether neighbor discovery (arp and nd) proxy and suppression + * is enabled on the port. By default this flag is off. + * + * @IFLA_BRPORT_ISOLATED + * Controls whether a given port will be isolated, which means it will be + * able to communicate with non-isolated ports only. By default this + * flag is off. + * + * @IFLA_BRPORT_BACKUP_PORT + * Set a backup port. If the port loses carrier all traffic will be + * redirected to the configured backup port. Set the value to 0 to disable + * it. + * + * @IFLA_BRPORT_MRP_RING_OPEN + * + * @IFLA_BRPORT_MRP_IN_OPEN + * + * @IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT + * The number of per-port EHT hosts limit. The default value is 512. + * Setting to 0 is not allowed. + * + * @IFLA_BRPORT_MCAST_EHT_HOSTS_CNT + * The current number of tracked hosts, read only. + * + * @IFLA_BRPORT_LOCKED + * Controls whether a port will be locked, meaning that hosts behind the + * port will not be able to communicate through the port unless an FDB + * entry with the unit's MAC address is in the FDB. The common use case is + * that hosts are allowed access through authentication with the IEEE 802.1X + * protocol or based on whitelists. By default this flag is off. + * + * Please note that secure 802.1X deployments should always use the + * *BR_BOOLOPT_NO_LL_LEARN* flag, to not permit the bridge to populate its + * FDB based on link-local (EAPOL) traffic received on the port. + * + * @IFLA_BRPORT_MAB + * Controls whether a port will use MAC Authentication Bypass (MAB), a + * technique through which select MAC addresses may be allowed on a locked + * port, without using 802.1X authentication. Packets with an unknown source + * MAC address generates a "locked" FDB entry on the incoming bridge port. + * The common use case is for user space to react to these bridge FDB + * notifications and optionally replace the locked FDB entry with a normal + * one, allowing traffic to pass for whitelisted MAC addresses. + * + * Setting this flag also requires *IFLA_BRPORT_LOCKED* and + * *IFLA_BRPORT_LEARNING*. *IFLA_BRPORT_LOCKED* ensures that unauthorized + * data packets are dropped, and *IFLA_BRPORT_LEARNING* allows the dynamic + * FDB entries installed by user space (as replacements for the locked FDB + * entries) to be refreshed and/or aged out. + * + * @IFLA_BRPORT_MCAST_N_GROUPS + * + * @IFLA_BRPORT_MCAST_MAX_GROUPS + * Sets the maximum number of MDB entries that can be registered for a + * given port. Attempts to register more MDB entries at the port than this + * limit allows will be rejected, whether they are done through netlink + * (e.g. the bridge tool), or IGMP or MLD membership reports. Setting a + * limit of 0 disables the limit. The default value is 0. + * + * @IFLA_BRPORT_NEIGH_VLAN_SUPPRESS + * Controls whether neighbor discovery (arp and nd) proxy and suppression is + * enabled for a given port. By default this flag is off. + * + * Note that this option only takes effect when *IFLA_BRPORT_NEIGH_SUPPRESS* + * is enabled for a given port. + * + * @IFLA_BRPORT_BACKUP_NHID + * The FDB nexthop object ID to attach to packets being redirected to a + * backup port that has VLAN tunnel mapping enabled (via the + * *IFLA_BRPORT_VLAN_TUNNEL* option). Setting a value of 0 (default) has + * the effect of not attaching any ID. + */ enum { IFLA_BRPORT_UNSPEC, IFLA_BRPORT_STATE, /* Spanning tree state */ -- cgit v1.2.3 From 41f6f64e6999a837048b1bd13a2f8742964eca6b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 5 Dec 2023 10:42:39 -0800 Subject: bpf: support non-r10 register spill/fill to/from stack in precision tracking Use instruction (jump) history to record instructions that performed register spill/fill to/from stack, regardless if this was done through read-only r10 register, or any other register after copying r10 into it *and* potentially adjusting offset. To make this work reliably, we push extra per-instruction flags into instruction history, encoding stack slot index (spi) and stack frame number in extra 10 bit flags we take away from prev_idx in instruction history. We don't touch idx field for maximum performance, as it's checked most frequently during backtracking. This change removes basically the last remaining practical limitation of precision backtracking logic in BPF verifier. It fixes known deficiencies, but also opens up new opportunities to reduce number of verified states, explored in the subsequent patches. There are only three differences in selftests' BPF object files according to veristat, all in the positive direction (less states). File Program Insns (A) Insns (B) Insns (DIFF) States (A) States (B) States (DIFF) -------------------------------------- ------------- --------- --------- ------------- ---------- ---------- ------------- test_cls_redirect_dynptr.bpf.linked3.o cls_redirect 2987 2864 -123 (-4.12%) 240 231 -9 (-3.75%) xdp_synproxy_kern.bpf.linked3.o syncookie_tc 82848 82661 -187 (-0.23%) 5107 5073 -34 (-0.67%) xdp_synproxy_kern.bpf.linked3.o syncookie_xdp 85116 84964 -152 (-0.18%) 5162 5130 -32 (-0.62%) Note, I avoided renaming jmp_history to more generic insn_hist to minimize number of lines changed and potential merge conflicts between bpf and bpf-next trees. Notice also cur_hist_entry pointer reset to NULL at the beginning of instruction verification loop. This pointer avoids the problem of relying on last jump history entry's insn_idx to determine whether we already have entry for current instruction or not. It can happen that we added jump history entry because current instruction is_jmp_point(), but also we need to add instruction flags for stack access. In this case, we don't want to entries, so we need to reuse last added entry, if it is present. Relying on insn_idx comparison has the same ambiguity problem as the one that was fixed recently in [0], so we avoid that. [0] https://patchwork.kernel.org/project/netdevbpf/patch/20231110002638.4168352-3-andrii@kernel.org/ Acked-by: Eduard Zingerman Reported-by: Tao Lyu Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231205184248.1502704-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 3378cc753061..bada59812e00 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -325,12 +325,34 @@ struct bpf_func_state { int allocated_stack; }; -struct bpf_idx_pair { - u32 prev_idx; +#define MAX_CALL_FRAMES 8 + +/* instruction history flags, used in bpf_jmp_history_entry.flags field */ +enum { + /* instruction references stack slot through PTR_TO_STACK register; + * we also store stack's frame number in lower 3 bits (MAX_CALL_FRAMES is 8) + * and accessed stack slot's index in next 6 bits (MAX_BPF_STACK is 512, + * 8 bytes per slot, so slot index (spi) is [0, 63]) + */ + INSN_F_FRAMENO_MASK = 0x7, /* 3 bits */ + + INSN_F_SPI_MASK = 0x3f, /* 6 bits */ + INSN_F_SPI_SHIFT = 3, /* shifted 3 bits to the left */ + + INSN_F_STACK_ACCESS = BIT(9), /* we need 10 bits total */ +}; + +static_assert(INSN_F_FRAMENO_MASK + 1 >= MAX_CALL_FRAMES); +static_assert(INSN_F_SPI_MASK + 1 >= MAX_BPF_STACK / 8); + +struct bpf_jmp_history_entry { u32 idx; + /* insn idx can't be bigger than 1 million */ + u32 prev_idx : 22; + /* special flags, e.g., whether insn is doing register stack spill/load */ + u32 flags : 10; }; -#define MAX_CALL_FRAMES 8 /* Maximum number of register states that can exist at once */ #define BPF_ID_MAP_SIZE ((MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE) * MAX_CALL_FRAMES) struct bpf_verifier_state { @@ -413,7 +435,7 @@ struct bpf_verifier_state { * For most states jmp_history_cnt is [0-3]. * For loops can go up to ~40. */ - struct bpf_idx_pair *jmp_history; + struct bpf_jmp_history_entry *jmp_history; u32 jmp_history_cnt; u32 dfs_depth; u32 callback_unroll_depth; @@ -656,6 +678,7 @@ struct bpf_verifier_env { int cur_stack; } cfg; struct backtrack_state bt; + struct bpf_jmp_history_entry *cur_hist_ent; u32 pass_cnt; /* number of times do_check() was called */ u32 subprog_cnt; /* number of instructions analyzed by the verifier */ -- cgit v1.2.3 From 43a71cd66b9c0a4af3d15d8644359fde35bdbed0 Mon Sep 17 00:00:00 2001 From: Coco Li Date: Mon, 4 Dec 2023 20:12:30 +0000 Subject: net-device: reorganize net_device fast path variables Reorganize fast path variables on tx-txrx-rx order Fastpath variables end after npinfo. Below data generated with pahole on x86 architecture. Fast path variables span cache lines before change: 12 Fast path variables span cache lines after change: 4 Suggested-by: Eric Dumazet Signed-off-by: Coco Li Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20231204201232.520025-2-lixiaoyan@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 117 +++++++++++++++++++++++++--------------------- 1 file changed, 64 insertions(+), 53 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5551177e024e..cb96aad6a6ee 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2097,6 +2097,70 @@ enum netdev_stat_type { */ struct net_device { + /* Cacheline organization can be found documented in + * Documentation/networking/net_cachelines/net_device.rst. + * Please update the document when adding new fields. + */ + + /* TX read-mostly hotpath */ + __cacheline_group_begin(net_device_read_tx); + unsigned long long priv_flags; + const struct net_device_ops *netdev_ops; + const struct header_ops *header_ops; + struct netdev_queue *_tx; + unsigned int real_num_tx_queues; + unsigned int gso_max_size; + unsigned int gso_ipv4_max_size; + u16 gso_max_segs; + s16 num_tc; + /* Note : dev->mtu is often read without holding a lock. + * Writers usually hold RTNL. + * It is recommended to use READ_ONCE() to annotate the reads, + * and to use WRITE_ONCE() to annotate the writes. + */ + unsigned int mtu; + unsigned short needed_headroom; + struct netdev_tc_txq tc_to_txq[TC_MAX_QUEUE]; +#ifdef CONFIG_XPS + struct xps_dev_maps __rcu *xps_maps[XPS_MAPS_MAX]; +#endif +#ifdef CONFIG_NETFILTER_EGRESS + struct nf_hook_entries __rcu *nf_hooks_egress; +#endif +#ifdef CONFIG_NET_XGRESS + struct bpf_mprog_entry __rcu *tcx_egress; +#endif + __cacheline_group_end(net_device_read_tx); + + /* TXRX read-mostly hotpath */ + __cacheline_group_begin(net_device_read_txrx); + unsigned int flags; + unsigned short hard_header_len; + netdev_features_t features; + struct inet6_dev __rcu *ip6_ptr; + __cacheline_group_end(net_device_read_txrx); + + /* RX read-mostly hotpath */ + __cacheline_group_begin(net_device_read_rx); + struct list_head ptype_specific; + int ifindex; + unsigned int real_num_rx_queues; + struct netdev_rx_queue *_rx; + unsigned long gro_flush_timeout; + int napi_defer_hard_irqs; + unsigned int gro_max_size; + unsigned int gro_ipv4_max_size; + rx_handler_func_t __rcu *rx_handler; + void __rcu *rx_handler_data; + possible_net_t nd_net; +#ifdef CONFIG_NETPOLL + struct netpoll_info __rcu *npinfo; +#endif +#ifdef CONFIG_NET_XGRESS + struct bpf_mprog_entry __rcu *tcx_ingress; +#endif + __cacheline_group_end(net_device_read_rx); + char name[IFNAMSIZ]; struct netdev_name_node *name_node; struct dev_ifalias __rcu *ifalias; @@ -2121,7 +2185,6 @@ struct net_device { struct list_head unreg_list; struct list_head close_list; struct list_head ptype_all; - struct list_head ptype_specific; struct { struct list_head upper; @@ -2129,26 +2192,13 @@ struct net_device { } adj_list; /* Read-mostly cache-line for fast-path access */ - unsigned int flags; xdp_features_t xdp_features; - unsigned long long priv_flags; - const struct net_device_ops *netdev_ops; const struct xdp_metadata_ops *xdp_metadata_ops; const struct xsk_tx_metadata_ops *xsk_tx_metadata_ops; - int ifindex; unsigned short gflags; - unsigned short hard_header_len; - /* Note : dev->mtu is often read without holding a lock. - * Writers usually hold RTNL. - * It is recommended to use READ_ONCE() to annotate the reads, - * and to use WRITE_ONCE() to annotate the writes. - */ - unsigned int mtu; - unsigned short needed_headroom; unsigned short needed_tailroom; - netdev_features_t features; netdev_features_t hw_features; netdev_features_t wanted_features; netdev_features_t vlan_features; @@ -2192,8 +2242,6 @@ struct net_device { const struct tlsdev_ops *tlsdev_ops; #endif - const struct header_ops *header_ops; - unsigned char operstate; unsigned char link_mode; @@ -2234,9 +2282,7 @@ struct net_device { /* Protocol-specific pointers */ - struct in_device __rcu *ip_ptr; - struct inet6_dev __rcu *ip6_ptr; #if IS_ENABLED(CONFIG_VLAN_8021Q) struct vlan_info __rcu *vlan_info; #endif @@ -2271,26 +2317,14 @@ struct net_device { /* Interface address info used in eth_type_trans() */ const unsigned char *dev_addr; - struct netdev_rx_queue *_rx; unsigned int num_rx_queues; - unsigned int real_num_rx_queues; - struct bpf_prog __rcu *xdp_prog; - unsigned long gro_flush_timeout; - int napi_defer_hard_irqs; #define GRO_LEGACY_MAX_SIZE 65536u /* TCP minimal MSS is 8 (TCP_MIN_GSO_SIZE), * and shinfo->gso_segs is a 16bit field. */ #define GRO_MAX_SIZE (8 * 65535u) - unsigned int gro_max_size; - unsigned int gro_ipv4_max_size; unsigned int xdp_zc_max_segs; - rx_handler_func_t __rcu *rx_handler; - void __rcu *rx_handler_data; -#ifdef CONFIG_NET_XGRESS - struct bpf_mprog_entry __rcu *tcx_ingress; -#endif struct netdev_queue __rcu *ingress_queue; #ifdef CONFIG_NETFILTER_INGRESS struct nf_hook_entries __rcu *nf_hooks_ingress; @@ -2305,25 +2339,13 @@ struct net_device { /* * Cache lines mostly used on transmit path */ - struct netdev_queue *_tx ____cacheline_aligned_in_smp; unsigned int num_tx_queues; - unsigned int real_num_tx_queues; struct Qdisc __rcu *qdisc; unsigned int tx_queue_len; spinlock_t tx_global_lock; struct xdp_dev_bulk_queue __percpu *xdp_bulkq; -#ifdef CONFIG_XPS - struct xps_dev_maps __rcu *xps_maps[XPS_MAPS_MAX]; -#endif -#ifdef CONFIG_NET_XGRESS - struct bpf_mprog_entry __rcu *tcx_egress; -#endif -#ifdef CONFIG_NETFILTER_EGRESS - struct nf_hook_entries __rcu *nf_hooks_egress; -#endif - #ifdef CONFIG_NET_SCHED DECLARE_HASHTABLE (qdisc_hash, 4); #endif @@ -2362,12 +2384,6 @@ struct net_device { bool needs_free_netdev; void (*priv_destructor)(struct net_device *dev); -#ifdef CONFIG_NETPOLL - struct netpoll_info __rcu *npinfo; -#endif - - possible_net_t nd_net; - /* mid-layer private */ void *ml_priv; enum netdev_ml_priv_type ml_priv_type; @@ -2402,20 +2418,15 @@ struct net_device { */ #define GSO_MAX_SIZE (8 * GSO_MAX_SEGS) - unsigned int gso_max_size; #define TSO_LEGACY_MAX_SIZE 65536 #define TSO_MAX_SIZE UINT_MAX unsigned int tso_max_size; - u16 gso_max_segs; #define TSO_MAX_SEGS U16_MAX u16 tso_max_segs; - unsigned int gso_ipv4_max_size; #ifdef CONFIG_DCB const struct dcbnl_rtnl_ops *dcbnl_ops; #endif - s16 num_tc; - struct netdev_tc_txq tc_to_txq[TC_MAX_QUEUE]; u8 prio_tc_map[TC_BITMASK + 1]; #if IS_ENABLED(CONFIG_FCOE) -- cgit v1.2.3 From d5fed5addb2b6bc13035de4338b7ea2052a2e006 Mon Sep 17 00:00:00 2001 From: Coco Li Date: Mon, 4 Dec 2023 20:12:31 +0000 Subject: tcp: reorganize tcp_sock fast path variables The variables are organized according in the following way: - TX read-mostly hotpath cache lines - TXRX read-mostly hotpath cache lines - RX read-mostly hotpath cache lines - TX read-write hotpath cache line - TXRX read-write hotpath cache line - RX read-write hotpath cache line Fastpath cachelines end after rcvq_space. Cache line boundaries are enforced only between read-mostly and read-write. That is, if read-mostly tx cachelines bleed into read-mostly txrx cachelines, we do not care. We care about the boundaries between read and write cachelines because we want to prevent false sharing. Fast path variables span cache lines before change: 12 Fast path variables span cache lines after change: 8 Suggested-by: Eric Dumazet Reviewed-by: Wei Wang Signed-off-by: Coco Li Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20231204201232.520025-3-lixiaoyan@google.com Signed-off-by: Jakub Kicinski --- include/linux/tcp.h | 248 ++++++++++++++++++++++++++++------------------------ 1 file changed, 134 insertions(+), 114 deletions(-) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 68f3d315d2e1..f55ec155f5b7 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -194,23 +194,121 @@ static inline bool tcp_rsk_used_ao(const struct request_sock *req) #define TCP_RMEM_TO_WIN_SCALE 8 struct tcp_sock { + /* Cacheline organization can be found documented in + * Documentation/networking/net_cachelines/tcp_sock.rst. + * Please update the document when adding new fields. + */ + /* inet_connection_sock has to be the first member of tcp_sock */ struct inet_connection_sock inet_conn; - u16 tcp_header_len; /* Bytes of tcp header to send */ + + /* TX read-mostly hotpath cache lines */ + __cacheline_group_begin(tcp_sock_read_tx); + /* timestamp of last sent data packet (for restart window) */ + u32 max_window; /* Maximal window ever seen from peer */ + u32 rcv_ssthresh; /* Current window clamp */ + u32 reordering; /* Packet reordering metric. */ + u32 notsent_lowat; /* TCP_NOTSENT_LOWAT */ u16 gso_segs; /* Max number of segs per GSO packet */ + /* from STCP, retrans queue hinting */ + struct sk_buff *lost_skb_hint; + struct sk_buff *retransmit_skb_hint; + __cacheline_group_end(tcp_sock_read_tx); + + /* TXRX read-mostly hotpath cache lines */ + __cacheline_group_begin(tcp_sock_read_txrx); + u32 tsoffset; /* timestamp offset */ + u32 snd_wnd; /* The window we expect to receive */ + u32 mss_cache; /* Cached effective mss, not including SACKS */ + u32 snd_cwnd; /* Sending congestion window */ + u32 prr_out; /* Total number of pkts sent during Recovery. */ + u32 lost_out; /* Lost packets */ + u32 sacked_out; /* SACK'd packets */ + u16 tcp_header_len; /* Bytes of tcp header to send */ + u8 chrono_type : 2, /* current chronograph type */ + repair : 1, + is_sack_reneg:1, /* in recovery from loss with SACK reneg? */ + is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */ + __cacheline_group_end(tcp_sock_read_txrx); + + /* RX read-mostly hotpath cache lines */ + __cacheline_group_begin(tcp_sock_read_rx); + u32 copied_seq; /* Head of yet unread data */ + u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ + u32 snd_wl1; /* Sequence for window update */ + u32 tlp_high_seq; /* snd_nxt at the time of TLP */ + u32 rttvar_us; /* smoothed mdev_max */ + u32 retrans_out; /* Retransmitted packets out */ + u16 advmss; /* Advertised MSS */ + u16 urg_data; /* Saved octet of OOB data and control flags */ + u32 lost; /* Total data packets lost incl. rexmits */ + struct minmax rtt_min; + /* OOO segments go in this rbtree. Socket lock must be held. */ + struct rb_root out_of_order_queue; + u32 snd_ssthresh; /* Slow start size threshold */ + __cacheline_group_end(tcp_sock_read_rx); + /* TX read-write hotpath cache lines */ + __cacheline_group_begin(tcp_sock_write_tx) ____cacheline_aligned; + u32 segs_out; /* RFC4898 tcpEStatsPerfSegsOut + * The total number of segments sent. + */ + u32 data_segs_out; /* RFC4898 tcpEStatsPerfDataSegsOut + * total number of data segments sent. + */ + u64 bytes_sent; /* RFC4898 tcpEStatsPerfHCDataOctetsOut + * total number of data bytes sent. + */ + u32 snd_sml; /* Last byte of the most recently transmitted small packet */ + u32 chrono_start; /* Start time in jiffies of a TCP chrono */ + u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */ + u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ + u32 pushed_seq; /* Last pushed seq, required to talk to windows */ + u32 lsndtime; + u32 mdev_us; /* medium deviation */ + u64 tcp_wstamp_ns; /* departure time for next sent data packet */ + u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */ + u64 tcp_mstamp; /* most recent packet received/sent */ + u32 rtt_seq; /* sequence number to update rttvar */ + struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */ + struct sk_buff *highest_sack; /* skb just after the highest + * skb with SACKed bit set + * (validity guaranteed only if + * sacked_out > 0) + */ + u8 ecn_flags; /* ECN status bits. */ + __cacheline_group_end(tcp_sock_write_tx); + + /* TXRX read-write hotpath cache lines */ + __cacheline_group_begin(tcp_sock_write_txrx); /* * Header prediction flags * 0x5?10 << 16 + snd_wnd in net byte order */ __be32 pred_flags; - + u32 rcv_nxt; /* What we want to receive next */ + u32 snd_nxt; /* Next sequence we send */ + u32 snd_una; /* First byte we want an ack for */ + u32 window_clamp; /* Maximal window to advertise */ + u32 srtt_us; /* smoothed round trip time << 3 in usecs */ + u32 packets_out; /* Packets which are "in flight" */ + u32 snd_up; /* Urgent pointer */ + u32 delivered; /* Total data packets delivered incl. rexmits */ + u32 delivered_ce; /* Like the above but only ECE marked packets */ + u32 app_limited; /* limited until "delivered" reaches this val */ + u32 rcv_wnd; /* Current receiver window */ /* - * RFC793 variables by their proper names. This means you can - * read the code and the spec side by side (and laugh ...) - * See RFC793 and RFC1122. The RFC writes these in capitals. + * Options received (usually on last packet, some only on SYN packets). */ - u64 bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived + struct tcp_options_received rx_opt; + u8 nonagle : 4,/* Disable Nagle algorithm? */ + rate_app_limited:1; /* rate_{delivered,interval_us} limited? */ + __cacheline_group_end(tcp_sock_write_txrx); + + /* RX read-write hotpath cache lines */ + __cacheline_group_begin(tcp_sock_write_rx); + u64 bytes_received; + /* RFC4898 tcpEStatsAppHCThruOctetsReceived * sum(delta(rcv_nxt)), or how many bytes * were acked. */ @@ -220,45 +318,44 @@ struct tcp_sock { u32 data_segs_in; /* RFC4898 tcpEStatsPerfDataSegsIn * total number of data segments in. */ - u32 rcv_nxt; /* What we want to receive next */ - u32 copied_seq; /* Head of yet unread data */ u32 rcv_wup; /* rcv_nxt on last window update sent */ - u32 snd_nxt; /* Next sequence we send */ - u32 segs_out; /* RFC4898 tcpEStatsPerfSegsOut - * The total number of segments sent. - */ - u32 data_segs_out; /* RFC4898 tcpEStatsPerfDataSegsOut - * total number of data segments sent. - */ - u64 bytes_sent; /* RFC4898 tcpEStatsPerfHCDataOctetsOut - * total number of data bytes sent. - */ + u32 max_packets_out; /* max packets_out in last window */ + u32 cwnd_usage_seq; /* right edge of cwnd usage tracking flight */ + u32 rate_delivered; /* saved rate sample: packets delivered */ + u32 rate_interval_us; /* saved rate sample: time elapsed */ + u32 rcv_rtt_last_tsecr; + u64 first_tx_mstamp; /* start of window send phase */ + u64 delivered_mstamp; /* time we reached "delivered" */ u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked * sum(delta(snd_una)), or how many bytes * were acked. */ + struct { + u32 rtt_us; + u32 seq; + u64 time; + } rcv_rtt_est; +/* Receiver queue space */ + struct { + u32 space; + u32 seq; + u64 time; + } rcvq_space; + __cacheline_group_end(tcp_sock_write_rx); + /* End of Hot Path */ + +/* + * RFC793 variables by their proper names. This means you can + * read the code and the spec side by side (and laugh ...) + * See RFC793 and RFC1122. The RFC writes these in capitals. + */ u32 dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups * total number of DSACK blocks received */ - u32 snd_una; /* First byte we want an ack for */ - u32 snd_sml; /* Last byte of the most recently transmitted small packet */ - u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ - u32 lsndtime; /* timestamp of last sent data packet (for restart window) */ u32 last_oow_ack_time; /* timestamp of last out-of-window ACK */ u32 compressed_ack_rcv_nxt; - - u32 tsoffset; /* timestamp offset */ - struct list_head tsq_node; /* anchor in tsq_tasklet.head list */ - struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */ - - u32 snd_wl1; /* Sequence for window update */ - u32 snd_wnd; /* The window we expect to receive */ - u32 max_window; /* Maximal window ever seen from peer */ - u32 mss_cache; /* Cached effective mss, not including SACKS */ - u32 window_clamp; /* Maximal window to advertise */ - u32 rcv_ssthresh; /* Current window clamp */ u8 scaling_ratio; /* see tcp_win_from_space() */ /* Information of the most recently (s)acked skb */ struct tcp_rack { @@ -272,24 +369,16 @@ struct tcp_sock { dsack_seen:1, /* Whether DSACK seen after last adj */ advanced:1; /* mstamp advanced since last lost marking */ } rack; - u16 advmss; /* Advertised MSS */ u8 compressed_ack; u8 dup_ack_counter:2, tlp_retrans:1, /* TLP is a retransmission */ tcp_usec_ts:1, /* TSval values in usec */ unused:4; - u32 chrono_start; /* Start time in jiffies of a TCP chrono */ - u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */ - u8 chrono_type:2, /* current chronograph type */ - rate_app_limited:1, /* rate_{delivered,interval_us} limited? */ + u8 thin_lto : 1,/* Use linear timeouts for thin streams */ + recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */ fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */ fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */ - is_sack_reneg:1, /* in recovery from loss with SACK reneg? */ - fastopen_client_fail:2; /* reason why fastopen failed */ - u8 nonagle : 4,/* Disable Nagle algorithm? */ - thin_lto : 1,/* Use linear timeouts for thin streams */ - recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */ - repair : 1, + fastopen_client_fail:2, /* reason why fastopen failed */ frto : 1;/* F-RTO (RFC5682) activated in CA_Loss */ u8 repair_queue; u8 save_syn:2, /* Save headers of SYN packet */ @@ -297,45 +386,19 @@ struct tcp_sock { syn_fastopen:1, /* SYN includes Fast Open option */ syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */ syn_fastopen_ch:1, /* Active TFO re-enabling probe */ - syn_data_acked:1,/* data in SYN is acked by SYN-ACK */ - is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */ - u32 tlp_high_seq; /* snd_nxt at the time of TLP */ + syn_data_acked:1;/* data in SYN is acked by SYN-ACK */ u32 tcp_tx_delay; /* delay (in usec) added to TX packets */ - u64 tcp_wstamp_ns; /* departure time for next sent data packet */ - u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */ /* RTT measurement */ - u64 tcp_mstamp; /* most recent packet received/sent */ - u32 srtt_us; /* smoothed round trip time << 3 in usecs */ - u32 mdev_us; /* medium deviation */ u32 mdev_max_us; /* maximal mdev for the last rtt period */ - u32 rttvar_us; /* smoothed mdev_max */ - u32 rtt_seq; /* sequence number to update rttvar */ - struct minmax rtt_min; - u32 packets_out; /* Packets which are "in flight" */ - u32 retrans_out; /* Retransmitted packets out */ - u32 max_packets_out; /* max packets_out in last window */ - u32 cwnd_usage_seq; /* right edge of cwnd usage tracking flight */ - - u16 urg_data; /* Saved octet of OOB data and control flags */ - u8 ecn_flags; /* ECN status bits. */ u8 keepalive_probes; /* num of allowed keep alive probes */ - u32 reordering; /* Packet reordering metric. */ u32 reord_seen; /* number of data packet reordering events */ - u32 snd_up; /* Urgent pointer */ - -/* - * Options received (usually on last packet, some only on SYN packets). - */ - struct tcp_options_received rx_opt; /* * Slow start and congestion control (see also Nagle, and Karn & Partridge) */ - u32 snd_ssthresh; /* Slow start size threshold */ - u32 snd_cwnd; /* Sending congestion window */ u32 snd_cwnd_cnt; /* Linear increase counter */ u32 snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */ u32 snd_cwnd_used; @@ -343,32 +406,10 @@ struct tcp_sock { u32 prior_cwnd; /* cwnd right before starting loss recovery */ u32 prr_delivered; /* Number of newly delivered packets to * receiver in Recovery. */ - u32 prr_out; /* Total number of pkts sent during Recovery. */ - u32 delivered; /* Total data packets delivered incl. rexmits */ - u32 delivered_ce; /* Like the above but only ECE marked packets */ - u32 lost; /* Total data packets lost incl. rexmits */ - u32 app_limited; /* limited until "delivered" reaches this val */ - u64 first_tx_mstamp; /* start of window send phase */ - u64 delivered_mstamp; /* time we reached "delivered" */ - u32 rate_delivered; /* saved rate sample: packets delivered */ - u32 rate_interval_us; /* saved rate sample: time elapsed */ - - u32 rcv_wnd; /* Current receiver window */ - u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ - u32 notsent_lowat; /* TCP_NOTSENT_LOWAT */ - u32 pushed_seq; /* Last pushed seq, required to talk to windows */ - u32 lost_out; /* Lost packets */ - u32 sacked_out; /* SACK'd packets */ struct hrtimer pacing_timer; struct hrtimer compressed_ack_timer; - /* from STCP, retrans queue hinting */ - struct sk_buff* lost_skb_hint; - struct sk_buff *retransmit_skb_hint; - - /* OOO segments go in this rbtree. Socket lock must be held. */ - struct rb_root out_of_order_queue; struct sk_buff *ooo_last_skb; /* cache rb_last(out_of_order_queue) */ /* SACKs data, these 2 need to be together (see tcp_options_write) */ @@ -377,12 +418,6 @@ struct tcp_sock { struct tcp_sack_block recv_sack_cache[4]; - struct sk_buff *highest_sack; /* skb just after the highest - * skb with SACKed bit set - * (validity guaranteed only if - * sacked_out > 0) - */ - int lost_cnt_hint; u32 prior_ssthresh; /* ssthresh saved at recovery start */ @@ -433,21 +468,6 @@ struct tcp_sock { u32 rcv_ooopack; /* Received out-of-order packets, for tcpinfo */ -/* Receiver side RTT estimation */ - u32 rcv_rtt_last_tsecr; - struct { - u32 rtt_us; - u32 seq; - u64 time; - } rcv_rtt_est; - -/* Receiver queue space */ - struct { - u32 space; - u32 seq; - u64 time; - } rcvq_space; - /* TCP-specific MTU probe information. */ struct { u32 probe_seq_start; -- cgit v1.2.3 From facd15dfd69122042502d99ab8c9f888b48ee994 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 4 Dec 2023 21:47:07 +0100 Subject: net: core: synchronize link-watch when carrier is queried There are multiple ways to query for the carrier state: through rtnetlink, sysfs, and (possibly) ethtool. Synchronize linkwatch work before these operations so that we don't have a situation where userspace queries the carrier state between the driver's carrier off->on transition and linkwatch running and expects it to work, when really (at least) TX cannot work until linkwatch has run. I previously posted a longer explanation of how this applies to wireless [1] but with this wireless can simply query the state before sending data, to ensure the kernel is ready for it. [1] https://lore.kernel.org/all/346b21d87c69f817ea3c37caceb34f1f56255884.camel@sipsolutions.net/ Signed-off-by: Johannes Berg Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/20231204214706.303c62768415.I1caedccae72ee5a45c9085c5eb49c145ce1c0dd5@changeid Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cb96aad6a6ee..1b935ee341b4 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4229,6 +4229,15 @@ static inline void netdev_ref_replace(struct net_device *odev, */ void linkwatch_fire_event(struct net_device *dev); +/** + * linkwatch_sync_dev - sync linkwatch for the given device + * @dev: network device to sync linkwatch for + * + * Sync linkwatch for the given device, removing it from the + * pending work list (if queued). + */ +void linkwatch_sync_dev(struct net_device *dev); + /** * netif_carrier_ok - test if carrier present * @dev: network device -- cgit v1.2.3 From 2f57dd94bdef083855366138646b26b05f410d99 Mon Sep 17 00:00:00 2001 From: Yan Zhai Date: Mon, 4 Dec 2023 11:33:28 -0800 Subject: packet: add a generic drop reason for receive Commit da37845fdce2 ("packet: uses kfree_skb() for errors.") switches from consume_skb to kfree_skb to improve error handling. However, this could bring a lot of noises when we monitor real packet drops in kfree_skb[1], because in tpacket_rcv or packet_rcv only packet clones can be freed, not actual packets. Adding a generic drop reason to allow distinguish these "clone drops". [1]: https://lore.kernel.org/netdev/CABWYdi00L+O30Q=Zah28QwZ_5RU-xcxLFUK2Zj08A8MrLk9jzg@mail.gmail.com/ Fixes: da37845fdce2 ("packet: uses kfree_skb() for errors.") Suggested-by: Eric Dumazet Suggested-by: Willem de Bruijn Signed-off-by: Yan Zhai Reviewed-by: Eric Dumazet Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/r/ZW4piNbx3IenYnuw@debian.debian Signed-off-by: Jakub Kicinski --- include/net/dropreason-core.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index 3c70ad53a49c..278e4c7d465c 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -86,6 +86,7 @@ FN(IPV6_NDISC_NS_OTHERHOST) \ FN(QUEUE_PURGE) \ FN(TC_ERROR) \ + FN(PACKET_SOCK_ERROR) \ FNe(MAX) /** @@ -378,6 +379,11 @@ enum skb_drop_reason { SKB_DROP_REASON_QUEUE_PURGE, /** @SKB_DROP_REASON_TC_ERROR: generic internal tc error. */ SKB_DROP_REASON_TC_ERROR, + /** + * @SKB_DROP_REASON_PACKET_SOCK_ERROR: generic packet socket errors + * after its filter matches an incoming packet. + */ + SKB_DROP_REASON_PACKET_SOCK_ERROR, /** * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which * shouldn't be used as a real 'reason' - only for tracing code gen -- cgit v1.2.3 From 88f29324042752a28245ec0ab285d71c7f4d9c6a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 30 Nov 2023 22:50:58 +0100 Subject: wifi: cfg80211: make RX assoc data const This is just a collection of data and we only read it, so make it const. Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index d36ad4cedf3b..d59669d86718 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -7312,7 +7312,7 @@ struct cfg80211_rx_assoc_resp_data { * This function may sleep. The caller must hold the corresponding wdev's mutex. */ void cfg80211_rx_assoc_resp(struct net_device *dev, - struct cfg80211_rx_assoc_resp_data *data); + const struct cfg80211_rx_assoc_resp_data *data); /** * struct cfg80211_assoc_failure - association failure data -- cgit v1.2.3 From 40bba140c60fbb3ee8df6203c82fbd3de9f19d95 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:14 -0800 Subject: bpf: add BPF token delegation mount options to BPF FS Add few new mount options to BPF FS that allow to specify that a given BPF FS instance allows creation of BPF token (added in the next patch), and what sort of operations are allowed under BPF token. As such, we get 4 new mount options, each is a bit mask - `delegate_cmds` allow to specify which bpf() syscall commands are allowed with BPF token derived from this BPF FS instance; - if BPF_MAP_CREATE command is allowed, `delegate_maps` specifies a set of allowable BPF map types that could be created with BPF token; - if BPF_PROG_LOAD command is allowed, `delegate_progs` specifies a set of allowable BPF program types that could be loaded with BPF token; - if BPF_PROG_LOAD command is allowed, `delegate_attachs` specifies a set of allowable BPF program attach types that could be loaded with BPF token; delegate_progs and delegate_attachs are meant to be used together, as full BPF program type is, in general, determined through both program type and program attach type. Currently, these mount options accept the following forms of values: - a special value "any", that enables all possible values of a given bit set; - numeric value (decimal or hexadecimal, determined by kernel automatically) that specifies a bit mask value directly; - all the values for a given mount option are combined, if specified multiple times. E.g., `mount -t bpf nodev /path/to/mount -o delegate_maps=0x1 -o delegate_maps=0x2` will result in a combined 0x3 mask. Ideally, more convenient (for humans) symbolic form derived from corresponding UAPI enums would be accepted (e.g., `-o delegate_progs=kprobe|tracepoint`) and I intend to implement this, but it requires a bunch of UAPI header churn, so I postponed it until this feature lands upstream or at least there is a definite consensus that this feature is acceptable and is going to make it, just to minimize amount of wasted effort and not increase amount of non-essential code to be reviewed. Attentive reader will notice that BPF FS is now marked as FS_USERNS_MOUNT, which theoretically makes it mountable inside non-init user namespace as long as the process has sufficient *namespaced* capabilities within that user namespace. But in reality we still restrict BPF FS to be mountable only by processes with CAP_SYS_ADMIN *in init userns* (extra check in bpf_fill_super()). FS_USERNS_MOUNT is added to allow creating BPF FS context object (i.e., fsopen("bpf")) from inside unprivileged process inside non-init userns, to capture that userns as the owning userns. It will still be required to pass this context object back to privileged process to instantiate and mount it. This manipulation is important, because capturing non-init userns as the owning userns of BPF FS instance (super block) allows to use that userns to constraint BPF token to that userns later on (see next patch). So creating BPF FS with delegation inside unprivileged userns will restrict derived BPF token objects to only "work" inside that intended userns, making it scoped to a intended "container". Also, setting these delegation options requires capable(CAP_SYS_ADMIN), so unprivileged process cannot set this up without involvement of a privileged process. There is a set of selftests at the end of the patch set that simulates this sequence of steps and validates that everything works as intended. But careful review is requested to make sure there are no missed gaps in the implementation and testing. This somewhat subtle set of aspects is the result of previous discussions ([0]) about various user namespace implications and interactions with BPF token functionality and is necessary to contain BPF token inside intended user namespace. [0] https://lore.kernel.org/bpf/20230704-hochverdient-lehne-eeb9eeef785e@brauner/ Acked-by: Christian Brauner Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 10e5e4d8a00f..d3c9acc593ea 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1581,6 +1581,16 @@ struct bpf_link_primer { u32 id; }; +struct bpf_mount_opts { + umode_t mode; + + /* BPF token-related delegation options */ + u64 delegate_cmds; + u64 delegate_maps; + u64 delegate_progs; + u64 delegate_attachs; +}; + struct bpf_struct_ops_value; struct btf_member; -- cgit v1.2.3 From 4527358b76861dfd64ee34aba45d81648fbc8a61 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:15 -0800 Subject: bpf: introduce BPF token object Add new kind of BPF kernel object, BPF token. BPF token is meant to allow delegating privileged BPF functionality, like loading a BPF program or creating a BPF map, from privileged process to a *trusted* unprivileged process, all while having a good amount of control over which privileged operations could be performed using provided BPF token. This is achieved through mounting BPF FS instance with extra delegation mount options, which determine what operations are delegatable, and also constraining it to the owning user namespace (as mentioned in the previous patch). BPF token itself is just a derivative from BPF FS and can be created through a new bpf() syscall command, BPF_TOKEN_CREATE, which accepts BPF FS FD, which can be attained through open() API by opening BPF FS mount point. Currently, BPF token "inherits" delegated command, map types, prog type, and attach type bit sets from BPF FS as is. In the future, having an BPF token as a separate object with its own FD, we can allow to further restrict BPF token's allowable set of things either at the creation time or after the fact, allowing the process to guard itself further from unintentionally trying to load undesired kind of BPF programs. But for now we keep things simple and just copy bit sets as is. When BPF token is created from BPF FS mount, we take reference to the BPF super block's owning user namespace, and then use that namespace for checking all the {CAP_BPF, CAP_PERFMON, CAP_NET_ADMIN, CAP_SYS_ADMIN} capabilities that are normally only checked against init userns (using capable()), but now we check them using ns_capable() instead (if BPF token is provided). See bpf_token_capable() for details. Such setup means that BPF token in itself is not sufficient to grant BPF functionality. User namespaced process has to *also* have necessary combination of capabilities inside that user namespace. So while previously CAP_BPF was useless when granted within user namespace, now it gains a meaning and allows container managers and sys admins to have a flexible control over which processes can and need to use BPF functionality within the user namespace (i.e., container in practice). And BPF FS delegation mount options and derived BPF tokens serve as a per-container "flag" to grant overall ability to use bpf() (plus further restrict on which parts of bpf() syscalls are treated as namespaced). Note also, BPF_TOKEN_CREATE command itself requires ns_capable(CAP_BPF) within the BPF FS owning user namespace, rounding up the ns_capable() story of BPF token. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 41 +++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/bpf.h | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d3c9acc593ea..aa9cf8e5fab1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -51,6 +51,10 @@ struct module; struct bpf_func_state; struct ftrace_ops; struct cgroup; +struct bpf_token; +struct user_namespace; +struct super_block; +struct inode; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; @@ -1591,6 +1595,13 @@ struct bpf_mount_opts { u64 delegate_attachs; }; +struct bpf_token { + struct work_struct work; + atomic64_t refcnt; + struct user_namespace *userns; + u64 allowed_cmds; +}; + struct bpf_struct_ops_value; struct btf_member; @@ -2048,6 +2059,7 @@ static inline void bpf_enable_instrumentation(void) migrate_enable(); } +extern const struct super_operations bpf_super_ops; extern const struct file_operations bpf_map_fops; extern const struct file_operations bpf_prog_fops; extern const struct file_operations bpf_iter_fops; @@ -2182,6 +2194,8 @@ static inline void bpf_map_dec_elem_count(struct bpf_map *map) extern int sysctl_unprivileged_bpf_disabled; +bool bpf_token_capable(const struct bpf_token *token, int cap); + static inline bool bpf_allow_ptr_leaks(void) { return perfmon_capable(); @@ -2216,8 +2230,17 @@ int bpf_link_new_fd(struct bpf_link *link); struct bpf_link *bpf_link_get_from_fd(u32 ufd); struct bpf_link *bpf_link_get_curr_or_next(u32 *id); +void bpf_token_inc(struct bpf_token *token); +void bpf_token_put(struct bpf_token *token); +int bpf_token_create(union bpf_attr *attr); +struct bpf_token *bpf_token_get_from_fd(u32 ufd); + +bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd); + int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname); int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags); +struct inode *bpf_get_inode(struct super_block *sb, const struct inode *dir, + umode_t mode); #define BPF_ITER_FUNC_PREFIX "bpf_iter_" #define DEFINE_BPF_ITER_FUNC(target, args...) \ @@ -2580,6 +2603,24 @@ static inline int bpf_obj_get_user(const char __user *pathname, int flags) return -EOPNOTSUPP; } +static inline bool bpf_token_capable(const struct bpf_token *token, int cap) +{ + return capable(cap) || (cap != CAP_SYS_ADMIN && capable(CAP_SYS_ADMIN)); +} + +static inline void bpf_token_inc(struct bpf_token *token) +{ +} + +static inline void bpf_token_put(struct bpf_token *token) +{ +} + +static inline struct bpf_token *bpf_token_get_from_fd(u32 ufd) +{ + return ERR_PTR(-EOPNOTSUPP); +} + static inline void __dev_flush(void) { } diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e88746ba7d21..d4a567e5bc3c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -847,6 +847,36 @@ union bpf_iter_link_info { * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. * + * BPF_TOKEN_CREATE + * Description + * Create BPF token with embedded information about what + * BPF-related functionality it allows: + * - a set of allowed bpf() syscall commands; + * - a set of allowed BPF map types to be created with + * BPF_MAP_CREATE command, if BPF_MAP_CREATE itself is allowed; + * - a set of allowed BPF program types and BPF program attach + * types to be loaded with BPF_PROG_LOAD command, if + * BPF_PROG_LOAD itself is allowed. + * + * BPF token is created (derived) from an instance of BPF FS, + * assuming it has necessary delegation mount options specified. + * This BPF token can be passed as an extra parameter to various + * bpf() syscall commands to grant BPF subsystem functionality to + * unprivileged processes. + * + * When created, BPF token is "associated" with the owning + * user namespace of BPF FS instance (super block) that it was + * derived from, and subsequent BPF operations performed with + * BPF token would be performing capabilities checks (i.e., + * CAP_BPF, CAP_PERFMON, CAP_NET_ADMIN, CAP_SYS_ADMIN) within + * that user namespace. Without BPF token, such capabilities + * have to be granted in init user namespace, making bpf() + * syscall incompatible with user namespace, for the most part. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * * NOTES * eBPF objects (maps and programs) can be shared between processes. * @@ -901,6 +931,8 @@ enum bpf_cmd { BPF_ITER_CREATE, BPF_LINK_DETACH, BPF_PROG_BIND_MAP, + BPF_TOKEN_CREATE, + __MAX_BPF_CMD, }; enum bpf_map_type { @@ -1712,6 +1744,11 @@ union bpf_attr { __u32 flags; /* extra flags */ } prog_bind_map; + struct { /* struct used by BPF_TOKEN_CREATE command */ + __u32 flags; + __u32 bpffs_fd; + } token_create; + } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF -- cgit v1.2.3 From 688b7270b3cb75e8ac78123d719967db40336e5b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:16 -0800 Subject: bpf: add BPF token support to BPF_MAP_CREATE command Allow providing token_fd for BPF_MAP_CREATE command to allow controlled BPF map creation from unprivileged process through delegated BPF token. Wire through a set of allowed BPF map types to BPF token, derived from BPF FS at BPF token creation time. This, in combination with allowed_cmds allows to create a narrowly-focused BPF token (controlled by privileged agent) with a restrictive set of BPF maps that application can attempt to create. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ include/uapi/linux/bpf.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index aa9cf8e5fab1..e08e8436df38 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1600,6 +1600,7 @@ struct bpf_token { atomic64_t refcnt; struct user_namespace *userns; u64 allowed_cmds; + u64 allowed_maps; }; struct bpf_struct_ops_value; @@ -2236,6 +2237,7 @@ int bpf_token_create(union bpf_attr *attr); struct bpf_token *bpf_token_get_from_fd(u32 ufd); bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd); +bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type); int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname); int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d4a567e5bc3c..0bba3392b17a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -983,6 +983,7 @@ enum bpf_map_type { BPF_MAP_TYPE_BLOOM_FILTER, BPF_MAP_TYPE_USER_RINGBUF, BPF_MAP_TYPE_CGRP_STORAGE, + __MAX_BPF_MAP_TYPE }; /* Note that tracing related programs such as @@ -1433,6 +1434,7 @@ union bpf_attr { * to using 5 hash functions). */ __u64 map_extra; + __u32 map_token_fd; }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ -- cgit v1.2.3 From ee54b1a910e4d49c9a104f31ae3f5b979131adf8 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:17 -0800 Subject: bpf: add BPF token support to BPF_BTF_LOAD command Accept BPF token FD in BPF_BTF_LOAD command to allow BTF data loading through delegated BPF token. BTF loading is a pretty straightforward operation, so as long as BPF token is created with allow_cmds granting BPF_BTF_LOAD command, kernel proceeds to parsing BTF data and creating BTF object. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-6-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0bba3392b17a..9f9989e0d062 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1616,6 +1616,7 @@ union bpf_attr { * truncated), or smaller (if log buffer wasn't filled completely). */ __u32 btf_log_true_size; + __u32 btf_token_fd; }; struct { -- cgit v1.2.3 From e1cef620f598853a90f17701fcb1057a6768f7b8 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:18 -0800 Subject: bpf: add BPF token support to BPF_PROG_LOAD command Add basic support of BPF token to BPF_PROG_LOAD. Wire through a set of allowed BPF program types and attach types, derived from BPF FS at BPF token creation time. Then make sure we perform bpf_token_capable() checks everywhere where it's relevant. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-7-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 ++++++ include/uapi/linux/bpf.h | 2 ++ 2 files changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e08e8436df38..20af87b59d70 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1461,6 +1461,7 @@ struct bpf_prog_aux { #ifdef CONFIG_SECURITY void *security; #endif + struct bpf_token *token; struct bpf_prog_offload *offload; struct btf *btf; struct bpf_func_info *func_info; @@ -1601,6 +1602,8 @@ struct bpf_token { struct user_namespace *userns; u64 allowed_cmds; u64 allowed_maps; + u64 allowed_progs; + u64 allowed_attachs; }; struct bpf_struct_ops_value; @@ -2238,6 +2241,9 @@ struct bpf_token *bpf_token_get_from_fd(u32 ufd); bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd); bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type); +bool bpf_token_allow_prog_type(const struct bpf_token *token, + enum bpf_prog_type prog_type, + enum bpf_attach_type attach_type); int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname); int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 9f9989e0d062..4df2d025c784 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1028,6 +1028,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_SK_LOOKUP, BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */ BPF_PROG_TYPE_NETFILTER, + __MAX_BPF_PROG_TYPE }; enum bpf_attach_type { @@ -1504,6 +1505,7 @@ union bpf_attr { * truncated), or smaller (if log buffer wasn't filled completely). */ __u32 log_true_size; + __u32 prog_token_fd; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ -- cgit v1.2.3 From 4cbb270e115bc197ff2046aeb54cc951666b16ec Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:19 -0800 Subject: bpf: take into account BPF token when fetching helper protos Instead of performing unconditional system-wide bpf_capable() and perfmon_capable() calls inside bpf_base_func_proto() function (and other similar ones) to determine eligibility of a given BPF helper for a given program, use previously recorded BPF token during BPF_PROG_LOAD command handling to inform the decision. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-8-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 20af87b59d70..2a3ab4f3dd8c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2492,7 +2492,8 @@ const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type struct bpf_prog *bpf_prog_by_id(u32 id); struct bpf_link *bpf_link_by_id(u32 id); -const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id); +const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id, + const struct bpf_prog *prog); void bpf_task_storage_free(struct task_struct *task); void bpf_cgrp_storage_free(struct cgroup *cgroup); bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog); @@ -2752,7 +2753,7 @@ static inline int btf_struct_access(struct bpf_verifier_log *log, } static inline const struct bpf_func_proto * -bpf_base_func_proto(enum bpf_func_id func_id) +bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { return NULL; } -- cgit v1.2.3 From 8062fb12de99b2da33754c6a3be1bfc30d9a35f4 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:20 -0800 Subject: bpf: consistently use BPF token throughout BPF verifier logic Remove remaining direct queries to perfmon_capable() and bpf_capable() in BPF verifier logic and instead use BPF token (if available) to make decisions about privileges. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-9-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 16 ++++++++-------- include/linux/filter.h | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2a3ab4f3dd8c..435abad3cc61 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2200,24 +2200,24 @@ extern int sysctl_unprivileged_bpf_disabled; bool bpf_token_capable(const struct bpf_token *token, int cap); -static inline bool bpf_allow_ptr_leaks(void) +static inline bool bpf_allow_ptr_leaks(const struct bpf_token *token) { - return perfmon_capable(); + return bpf_token_capable(token, CAP_PERFMON); } -static inline bool bpf_allow_uninit_stack(void) +static inline bool bpf_allow_uninit_stack(const struct bpf_token *token) { - return perfmon_capable(); + return bpf_token_capable(token, CAP_PERFMON); } -static inline bool bpf_bypass_spec_v1(void) +static inline bool bpf_bypass_spec_v1(const struct bpf_token *token) { - return cpu_mitigations_off() || perfmon_capable(); + return cpu_mitigations_off() || bpf_token_capable(token, CAP_PERFMON); } -static inline bool bpf_bypass_spec_v4(void) +static inline bool bpf_bypass_spec_v4(const struct bpf_token *token) { - return cpu_mitigations_off() || perfmon_capable(); + return cpu_mitigations_off() || bpf_token_capable(token, CAP_PERFMON); } int bpf_map_new_fd(struct bpf_map *map, int flags); diff --git a/include/linux/filter.h b/include/linux/filter.h index a4953fafc8cb..14354605ad26 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1139,7 +1139,7 @@ static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog) return false; if (!bpf_jit_harden) return false; - if (bpf_jit_harden == 1 && bpf_capable()) + if (bpf_jit_harden == 1 && bpf_token_capable(prog->aux->token, CAP_BPF)) return false; return true; -- cgit v1.2.3 From c3dd6e94df7193f33f45d33303f5e85afb2a72dc Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:21 -0800 Subject: bpf,lsm: refactor bpf_prog_alloc/bpf_prog_free LSM hooks Based on upstream discussion ([0]), rework existing bpf_prog_alloc_security LSM hook. Rename it to bpf_prog_load and instead of passing bpf_prog_aux, pass proper bpf_prog pointer for a full BPF program struct. Also, we pass bpf_attr union with all the user-provided arguments for BPF_PROG_LOAD command. This will give LSMs as much information as we can basically provide. The hook is also BPF token-aware now, and optional bpf_token struct is passed as a third argument. bpf_prog_load LSM hook is called after a bunch of sanity checks were performed, bpf_prog and bpf_prog_aux were allocated and filled out, but right before performing full-fledged BPF verification step. bpf_prog_free LSM hook is now accepting struct bpf_prog argument, for consistency. SELinux code is adjusted to all new names, types, and signatures. Note, given that bpf_prog_load (previously bpf_prog_alloc) hook can be used by some LSMs to allocate extra security blob, but also by other LSMs to reject BPF program loading, we need to make sure that bpf_prog_free LSM hook is called after bpf_prog_load/bpf_prog_alloc one *even* if the hook itself returned error. If we don't do that, we run the risk of leaking memory. This seems to be possible today when combining SELinux and BPF LSM, as one example, depending on their relative ordering. Also, for BPF LSM setup, add bpf_prog_load and bpf_prog_free to sleepable LSM hooks list, as they are both executed in sleepable context. Also drop bpf_prog_load hook from untrusted, as there is no issue with refcount or anything else anymore, that originally forced us to add it to untrusted list in c0c852dd1876 ("bpf: Do not mark certain LSM hook arguments as trusted"). We now trigger this hook much later and it should not be an issue anymore. [0] https://lore.kernel.org/bpf/9fe88aef7deabbe87d3fc38c4aea3c69.paul@paul-moore.com/ Acked-by: Paul Moore Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-10-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/lsm_hook_defs.h | 5 +++-- include/linux/security.h | 12 +++++++----- 2 files changed, 10 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index ff217a5ce552..41ec4a7c070e 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -400,8 +400,9 @@ LSM_HOOK(int, 0, bpf_map, struct bpf_map *map, fmode_t fmode) LSM_HOOK(int, 0, bpf_prog, struct bpf_prog *prog) LSM_HOOK(int, 0, bpf_map_alloc_security, struct bpf_map *map) LSM_HOOK(void, LSM_RET_VOID, bpf_map_free_security, struct bpf_map *map) -LSM_HOOK(int, 0, bpf_prog_alloc_security, struct bpf_prog_aux *aux) -LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free_security, struct bpf_prog_aux *aux) +LSM_HOOK(int, 0, bpf_prog_load, struct bpf_prog *prog, union bpf_attr *attr, + struct bpf_token *token) +LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free, struct bpf_prog *prog) #endif /* CONFIG_BPF_SYSCALL */ LSM_HOOK(int, 0, locked_down, enum lockdown_reason what) diff --git a/include/linux/security.h b/include/linux/security.h index 1d1df326c881..65467eef6678 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -2020,15 +2020,16 @@ static inline void securityfs_remove(struct dentry *dentry) union bpf_attr; struct bpf_map; struct bpf_prog; -struct bpf_prog_aux; +struct bpf_token; #ifdef CONFIG_SECURITY extern int security_bpf(int cmd, union bpf_attr *attr, unsigned int size); extern int security_bpf_map(struct bpf_map *map, fmode_t fmode); extern int security_bpf_prog(struct bpf_prog *prog); extern int security_bpf_map_alloc(struct bpf_map *map); extern void security_bpf_map_free(struct bpf_map *map); -extern int security_bpf_prog_alloc(struct bpf_prog_aux *aux); -extern void security_bpf_prog_free(struct bpf_prog_aux *aux); +extern int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, + struct bpf_token *token); +extern void security_bpf_prog_free(struct bpf_prog *prog); #else static inline int security_bpf(int cmd, union bpf_attr *attr, unsigned int size) @@ -2054,12 +2055,13 @@ static inline int security_bpf_map_alloc(struct bpf_map *map) static inline void security_bpf_map_free(struct bpf_map *map) { } -static inline int security_bpf_prog_alloc(struct bpf_prog_aux *aux) +static inline int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, + struct bpf_token *token) { return 0; } -static inline void security_bpf_prog_free(struct bpf_prog_aux *aux) +static inline void security_bpf_prog_free(struct bpf_prog *prog) { } #endif /* CONFIG_SECURITY */ #endif /* CONFIG_BPF_SYSCALL */ -- cgit v1.2.3 From 66d636d70a79c1d37e3eea67ab50969e6aaef983 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:22 -0800 Subject: bpf,lsm: refactor bpf_map_alloc/bpf_map_free LSM hooks Similarly to bpf_prog_alloc LSM hook, rename and extend bpf_map_alloc hook into bpf_map_create, taking not just struct bpf_map, but also bpf_attr and bpf_token, to give a fuller context to LSMs. Unlike bpf_prog_alloc, there is no need to move the hook around, as it currently is firing right before allocating BPF map ID and FD, which seems to be a sweet spot. But like bpf_prog_alloc/bpf_prog_free combo, make sure that bpf_map_free LSM hook is called even if bpf_map_create hook returned error, as if few LSMs are combined together it could be that one LSM successfully allocated security blob for its needs, while subsequent LSM rejected BPF map creation. The former LSM would still need to free up LSM blob, so we need to ensure security_bpf_map_free() is called regardless of the outcome. Acked-by: Paul Moore Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-11-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/lsm_hook_defs.h | 5 +++-- include/linux/security.h | 6 ++++-- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 41ec4a7c070e..adb25cc63ce3 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -398,8 +398,9 @@ LSM_HOOK(void, LSM_RET_VOID, audit_rule_free, void *lsmrule) LSM_HOOK(int, 0, bpf, int cmd, union bpf_attr *attr, unsigned int size) LSM_HOOK(int, 0, bpf_map, struct bpf_map *map, fmode_t fmode) LSM_HOOK(int, 0, bpf_prog, struct bpf_prog *prog) -LSM_HOOK(int, 0, bpf_map_alloc_security, struct bpf_map *map) -LSM_HOOK(void, LSM_RET_VOID, bpf_map_free_security, struct bpf_map *map) +LSM_HOOK(int, 0, bpf_map_create, struct bpf_map *map, union bpf_attr *attr, + struct bpf_token *token) +LSM_HOOK(void, LSM_RET_VOID, bpf_map_free, struct bpf_map *map) LSM_HOOK(int, 0, bpf_prog_load, struct bpf_prog *prog, union bpf_attr *attr, struct bpf_token *token) LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free, struct bpf_prog *prog) diff --git a/include/linux/security.h b/include/linux/security.h index 65467eef6678..08fd777cbe94 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -2025,7 +2025,8 @@ struct bpf_token; extern int security_bpf(int cmd, union bpf_attr *attr, unsigned int size); extern int security_bpf_map(struct bpf_map *map, fmode_t fmode); extern int security_bpf_prog(struct bpf_prog *prog); -extern int security_bpf_map_alloc(struct bpf_map *map); +extern int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr, + struct bpf_token *token); extern void security_bpf_map_free(struct bpf_map *map); extern int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, struct bpf_token *token); @@ -2047,7 +2048,8 @@ static inline int security_bpf_prog(struct bpf_prog *prog) return 0; } -static inline int security_bpf_map_alloc(struct bpf_map *map) +static inline int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr, + struct bpf_token *token) { return 0; } -- cgit v1.2.3 From d734ca7b33dbf60eb15dcf7c44f3da7073356777 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Nov 2023 10:52:23 -0800 Subject: bpf,lsm: add BPF token LSM hooks Wire up bpf_token_create and bpf_token_free LSM hooks, which allow to allocate LSM security blob (we add `void *security` field to struct bpf_token for that), but also control who can instantiate BPF token. This follows existing pattern for BPF map and BPF prog. Also add security_bpf_token_allow_cmd() and security_bpf_token_capable() LSM hooks that allow LSM implementation to control and negate (if necessary) BPF token's delegation of a specific bpf_cmd and capability, respectively. Acked-by: Paul Moore Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231130185229.2688956-12-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 +++ include/linux/lsm_hook_defs.h | 5 +++++ include/linux/security.h | 25 +++++++++++++++++++++++++ 3 files changed, 33 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 435abad3cc61..7a483f6b6d5f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1604,6 +1604,9 @@ struct bpf_token { u64 allowed_maps; u64 allowed_progs; u64 allowed_attachs; +#ifdef CONFIG_SECURITY + void *security; +#endif }; struct bpf_struct_ops_value; diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index adb25cc63ce3..3fdd00b452ac 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -404,6 +404,11 @@ LSM_HOOK(void, LSM_RET_VOID, bpf_map_free, struct bpf_map *map) LSM_HOOK(int, 0, bpf_prog_load, struct bpf_prog *prog, union bpf_attr *attr, struct bpf_token *token) LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free, struct bpf_prog *prog) +LSM_HOOK(int, 0, bpf_token_create, struct bpf_token *token, union bpf_attr *attr, + struct path *path) +LSM_HOOK(void, LSM_RET_VOID, bpf_token_free, struct bpf_token *token) +LSM_HOOK(int, 0, bpf_token_cmd, const struct bpf_token *token, enum bpf_cmd cmd) +LSM_HOOK(int, 0, bpf_token_capable, const struct bpf_token *token, int cap) #endif /* CONFIG_BPF_SYSCALL */ LSM_HOOK(int, 0, locked_down, enum lockdown_reason what) diff --git a/include/linux/security.h b/include/linux/security.h index 08fd777cbe94..00809d2d5c38 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -32,6 +32,7 @@ #include #include #include +#include struct linux_binprm; struct cred; @@ -2031,6 +2032,11 @@ extern void security_bpf_map_free(struct bpf_map *map); extern int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, struct bpf_token *token); extern void security_bpf_prog_free(struct bpf_prog *prog); +extern int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr, + struct path *path); +extern void security_bpf_token_free(struct bpf_token *token); +extern int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cmd); +extern int security_bpf_token_capable(const struct bpf_token *token, int cap); #else static inline int security_bpf(int cmd, union bpf_attr *attr, unsigned int size) @@ -2065,6 +2071,25 @@ static inline int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr * static inline void security_bpf_prog_free(struct bpf_prog *prog) { } + +static inline int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr, + struct path *path) +{ + return 0; +} + +static inline void security_bpf_token_free(struct bpf_token *token) +{ } + +static inline int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cmd) +{ + return 0; +} + +static inline int security_bpf_token_capable(const struct bpf_token *token, int cap) +{ + return 0; +} #endif /* CONFIG_SECURITY */ #endif /* CONFIG_BPF_SYSCALL */ -- cgit v1.2.3 From 7065eefb38f16c91e9ace36fb7c873e4c9857c27 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 6 Dec 2023 11:09:20 -0800 Subject: bpf: rename MAX_BPF_LINK_TYPE into __MAX_BPF_LINK_TYPE for consistency To stay consistent with the naming pattern used for similar cases in BPF UAPI (__MAX_BPF_ATTACH_TYPE, etc), rename MAX_BPF_LINK_TYPE into __MAX_BPF_LINK_TYPE. Also similar to MAX_BPF_ATTACH_TYPE and MAX_BPF_REG, add: #define MAX_BPF_LINK_TYPE __MAX_BPF_LINK_TYPE Not all __MAX_xxx enums have such #define, so I'm not sure if we should add it or not, but I figured I'll start with a completely backwards compatible way, and we can drop that, if necessary. Also adjust a selftest that used MAX_BPF_LINK_TYPE enum. Suggested-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20231206190920.1651226-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4df2d025c784..e0545201b55f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1108,9 +1108,11 @@ enum bpf_link_type { BPF_LINK_TYPE_TCX = 11, BPF_LINK_TYPE_UPROBE_MULTI = 12, BPF_LINK_TYPE_NETKIT = 13, - MAX_BPF_LINK_TYPE, + __MAX_BPF_LINK_TYPE, }; +#define MAX_BPF_LINK_TYPE __MAX_BPF_LINK_TYPE + enum bpf_perf_event_type { BPF_PERF_EVENT_UNSPEC = 0, BPF_PERF_EVENT_UPROBE = 1, -- cgit v1.2.3 From f08a1c658257c73697a819c4ded3a84b6f0ead74 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 6 Dec 2023 14:40:48 -0800 Subject: bpf: Let bpf_prog_pack_free handle any pointer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, bpf_prog_pack_free only can only free pointer to struct bpf_binary_header, which is not flexible. Add a size argument to bpf_prog_pack_free so that it can handle any pointer. Signed-off-by: Song Liu Acked-by: Ilya Leoshkevich Tested-by: Ilya Leoshkevich # on s390x Reviewed-by: Björn Töpel Acked-by: Jiri Olsa Link: https://lore.kernel.org/r/20231206224054.492250-2-song@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 14354605ad26..12d907f17d36 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1067,7 +1067,7 @@ struct bpf_binary_header * bpf_jit_binary_pack_hdr(const struct bpf_prog *fp); void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns); -void bpf_prog_pack_free(struct bpf_binary_header *hdr); +void bpf_prog_pack_free(void *ptr, u32 size); static inline bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) { -- cgit v1.2.3 From 7a3d9a159b178e87306a6e989071ed9a114a1a31 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 6 Dec 2023 14:40:49 -0800 Subject: bpf: Adjust argument names of arch_prepare_bpf_trampoline() We are using "im" for "struct bpf_tramp_image" and "tr" for "struct bpf_trampoline" in most of the code base. The only exception is the prototype and fallback version of arch_prepare_bpf_trampoline(). Update them to match the rest of the code base. We mix "orig_call" and "func_addr" for the argument in different versions of arch_prepare_bpf_trampoline(). s/orig_call/func_addr/g so they match. Signed-off-by: Song Liu Acked-by: Ilya Leoshkevich Tested-by: Ilya Leoshkevich # on s390x Acked-by: Jiri Olsa Link: https://lore.kernel.org/r/20231206224054.492250-3-song@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7a483f6b6d5f..17eb6d905204 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1098,10 +1098,10 @@ struct bpf_tramp_run_ctx; * fexit = a set of program to run after original function */ struct bpf_tramp_image; -int arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end, +int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, const struct btf_func_model *m, u32 flags, struct bpf_tramp_links *tlinks, - void *orig_call); + void *func_addr); u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx); void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start, -- cgit v1.2.3 From 82583daa2efc2e336962b231a46bad03a280b3e0 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 6 Dec 2023 14:40:50 -0800 Subject: bpf: Add helpers for trampoline image management As BPF trampoline of different archs moves from bpf_jit_[alloc|free]_exec() to bpf_prog_pack_[alloc|free](), we need to use different _alloc, _free for different archs during the transition. Add the following helpers for this transition: void *arch_alloc_bpf_trampoline(unsigned int size); void arch_free_bpf_trampoline(void *image, unsigned int size); void arch_protect_bpf_trampoline(void *image, unsigned int size); void arch_unprotect_bpf_trampoline(void *image, unsigned int size); The fallback version of these helpers require size <= PAGE_SIZE, but they are only called with size == PAGE_SIZE. They will be called with size < PAGE_SIZE when arch_bpf_trampoline_size() helper is introduced later. Signed-off-by: Song Liu Acked-by: Ilya Leoshkevich Tested-by: Ilya Leoshkevich # on s390x Acked-by: Jiri Olsa Link: https://lore.kernel.org/r/20231206224054.492250-4-song@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 17eb6d905204..b7fca151cf1b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1102,6 +1102,11 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i const struct btf_func_model *m, u32 flags, struct bpf_tramp_links *tlinks, void *func_addr); +void *arch_alloc_bpf_trampoline(unsigned int size); +void arch_free_bpf_trampoline(void *image, unsigned int size); +void arch_protect_bpf_trampoline(void *image, unsigned int size); +void arch_unprotect_bpf_trampoline(void *image, unsigned int size); + u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx); void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start, -- cgit v1.2.3 From 96d1b7c081c0c96cbe8901045f4ff15a2e9974a2 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 6 Dec 2023 14:40:52 -0800 Subject: bpf: Add arch_bpf_trampoline_size() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This helper will be used to calculate the size of the trampoline before allocating the memory. arch_prepare_bpf_trampoline() for arm64 and riscv64 can use arch_bpf_trampoline_size() to check the trampoline fits in the image. OTOH, arch_prepare_bpf_trampoline() for s390 has to call the JIT process twice, so it cannot use arch_bpf_trampoline_size(). Signed-off-by: Song Liu Acked-by: Ilya Leoshkevich Tested-by: Ilya Leoshkevich # on s390x Acked-by: Jiri Olsa Acked-by: Björn Töpel Tested-by: Björn Töpel # on riscv Link: https://lore.kernel.org/r/20231206224054.492250-6-song@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b7fca151cf1b..2332ddeb396b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1106,6 +1106,8 @@ void *arch_alloc_bpf_trampoline(unsigned int size); void arch_free_bpf_trampoline(void *image, unsigned int size); void arch_protect_bpf_trampoline(void *image, unsigned int size); void arch_unprotect_bpf_trampoline(void *image, unsigned int size); +int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, + struct bpf_tramp_links *tlinks, void *func_addr); u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx); -- cgit v1.2.3 From 26ef208c209a0e6eed8942a5d191b39dccfa6e38 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 6 Dec 2023 14:40:53 -0800 Subject: bpf: Use arch_bpf_trampoline_size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of blindly allocating PAGE_SIZE for each trampoline, check the size of the trampoline with arch_bpf_trampoline_size(). This size is saved in bpf_tramp_image->size, and used for modmem charge/uncharge. The fallback arch_alloc_bpf_trampoline() still allocates a whole page because we need to use set_memory_* to protect the memory. struct_ops trampoline still uses a whole page for multiple trampolines. With this size check at caller (regular trampoline and struct_ops trampoline), remove arch_bpf_trampoline_size() from arch_prepare_bpf_trampoline() in archs. Also, update bpf_image_ksym_add() to handle symbol of different sizes. Signed-off-by: Song Liu Acked-by: Ilya Leoshkevich Tested-by: Ilya Leoshkevich # on s390x Acked-by: Jiri Olsa Acked-by: Björn Töpel Tested-by: Björn Töpel # on riscv Link: https://lore.kernel.org/r/20231206224054.492250-7-song@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2332ddeb396b..c1a06263a4f3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1141,6 +1141,7 @@ enum bpf_tramp_prog_type { struct bpf_tramp_image { void *image; + int size; struct bpf_ksym ksym; struct percpu_ref pcref; void *ip_after_call; @@ -1325,7 +1326,7 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, struct bpf_prog *to); /* Called only from JIT-enabled code, so there's no need for stubs. */ -void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym); +void bpf_image_ksym_add(void *data, unsigned int size, struct bpf_ksym *ksym); void bpf_image_ksym_del(struct bpf_ksym *ksym); void bpf_ksym_add(struct bpf_ksym *ksym); void bpf_ksym_del(struct bpf_ksym *ksym); -- cgit v1.2.3 From 5a08d0065a915ccf325563d7ca57fa8b4897881c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 Dec 2023 17:32:50 +0000 Subject: ipv6: add debug checks in fib6_info_release() Some elusive syzbot reports are hinting to fib6_info_release(), with a potential dangling f6i->gc_link anchor. Add debug checks so that syzbot can catch the issue earlier eventually. BUG: KASAN: slab-use-after-free in __hlist_del include/linux/list.h:990 [inline] BUG: KASAN: slab-use-after-free in hlist_del_init include/linux/list.h:1016 [inline] BUG: KASAN: slab-use-after-free in fib6_clean_expires_locked include/net/ip6_fib.h:533 [inline] BUG: KASAN: slab-use-after-free in fib6_purge_rt+0x986/0x9c0 net/ipv6/ip6_fib.c:1064 Write of size 8 at addr ffff88802805a840 by task syz-executor.1/10057 CPU: 1 PID: 10057 Comm: syz-executor.1 Not tainted 6.7.0-rc2-syzkaller-00029-g9b6de136b5f0 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 11/10/2023 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xd9/0x1b0 lib/dump_stack.c:106 print_address_description mm/kasan/report.c:364 [inline] print_report+0xc4/0x620 mm/kasan/report.c:475 kasan_report+0xda/0x110 mm/kasan/report.c:588 __hlist_del include/linux/list.h:990 [inline] hlist_del_init include/linux/list.h:1016 [inline] fib6_clean_expires_locked include/net/ip6_fib.h:533 [inline] fib6_purge_rt+0x986/0x9c0 net/ipv6/ip6_fib.c:1064 fib6_del_route net/ipv6/ip6_fib.c:1993 [inline] fib6_del+0xa7a/0x1750 net/ipv6/ip6_fib.c:2038 __ip6_del_rt net/ipv6/route.c:3866 [inline] ip6_del_rt+0xf7/0x200 net/ipv6/route.c:3881 ndisc_router_discovery+0x295b/0x3560 net/ipv6/ndisc.c:1372 ndisc_rcv+0x3de/0x5f0 net/ipv6/ndisc.c:1856 icmpv6_rcv+0x1470/0x19c0 net/ipv6/icmp.c:979 ip6_protocol_deliver_rcu+0x170/0x13e0 net/ipv6/ip6_input.c:438 ip6_input_finish+0x14f/0x2f0 net/ipv6/ip6_input.c:483 NF_HOOK include/linux/netfilter.h:314 [inline] NF_HOOK include/linux/netfilter.h:308 [inline] ip6_input+0xa1/0xc0 net/ipv6/ip6_input.c:492 ip6_mc_input+0x48b/0xf40 net/ipv6/ip6_input.c:586 dst_input include/net/dst.h:461 [inline] ip6_rcv_finish net/ipv6/ip6_input.c:79 [inline] NF_HOOK include/linux/netfilter.h:314 [inline] NF_HOOK include/linux/netfilter.h:308 [inline] ipv6_rcv+0x24e/0x380 net/ipv6/ip6_input.c:310 __netif_receive_skb_one_core+0x115/0x180 net/core/dev.c:5529 __netif_receive_skb+0x1f/0x1b0 net/core/dev.c:5643 netif_receive_skb_internal net/core/dev.c:5729 [inline] netif_receive_skb+0x133/0x700 net/core/dev.c:5788 tun_rx_batched+0x429/0x780 drivers/net/tun.c:1579 tun_get_user+0x29e3/0x3bc0 drivers/net/tun.c:2002 tun_chr_write_iter+0xe8/0x210 drivers/net/tun.c:2048 call_write_iter include/linux/fs.h:2020 [inline] new_sync_write fs/read_write.c:491 [inline] vfs_write+0x64f/0xdf0 fs/read_write.c:584 ksys_write+0x12f/0x250 fs/read_write.c:637 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x40/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b RIP: 0033:0x7f38e387b82f Code: 89 54 24 18 48 89 74 24 10 89 7c 24 08 e8 b9 80 02 00 48 8b 54 24 18 48 8b 74 24 10 41 89 c0 8b 7c 24 08 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 31 44 89 c7 48 89 44 24 08 e8 0c 81 02 00 48 RSP: 002b:00007f38e45c9090 EFLAGS: 00000293 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 00007f38e399bf80 RCX: 00007f38e387b82f RDX: 00000000000003b6 RSI: 0000000020000680 RDI: 00000000000000c8 RBP: 00007f38e38c847a R08: 0000000000000000 R09: 0000000000000000 R10: 00000000000003b6 R11: 0000000000000293 R12: 0000000000000000 R13: 000000000000000b R14: 00007f38e399bf80 R15: 00007f38e3abfa48 Allocated by task 10044: kasan_save_stack+0x33/0x50 mm/kasan/common.c:45 kasan_set_track+0x25/0x30 mm/kasan/common.c:52 ____kasan_kmalloc mm/kasan/common.c:374 [inline] __kasan_kmalloc+0xa2/0xb0 mm/kasan/common.c:383 kasan_kmalloc include/linux/kasan.h:198 [inline] __do_kmalloc_node mm/slab_common.c:1007 [inline] __kmalloc+0x59/0x90 mm/slab_common.c:1020 kmalloc include/linux/slab.h:604 [inline] kzalloc include/linux/slab.h:721 [inline] fib6_info_alloc+0x40/0x160 net/ipv6/ip6_fib.c:155 ip6_route_info_create+0x337/0x1e70 net/ipv6/route.c:3749 ip6_route_add+0x26/0x150 net/ipv6/route.c:3843 rt6_add_route_info+0x2e7/0x4b0 net/ipv6/route.c:4316 rt6_route_rcv+0x76c/0xbf0 net/ipv6/route.c:985 ndisc_router_discovery+0x138b/0x3560 net/ipv6/ndisc.c:1529 ndisc_rcv+0x3de/0x5f0 net/ipv6/ndisc.c:1856 icmpv6_rcv+0x1470/0x19c0 net/ipv6/icmp.c:979 ip6_protocol_deliver_rcu+0x170/0x13e0 net/ipv6/ip6_input.c:438 ip6_input_finish+0x14f/0x2f0 net/ipv6/ip6_input.c:483 NF_HOOK include/linux/netfilter.h:314 [inline] NF_HOOK include/linux/netfilter.h:308 [inline] ip6_input+0xa1/0xc0 net/ipv6/ip6_input.c:492 ip6_mc_input+0x48b/0xf40 net/ipv6/ip6_input.c:586 dst_input include/net/dst.h:461 [inline] ip6_rcv_finish net/ipv6/ip6_input.c:79 [inline] NF_HOOK include/linux/netfilter.h:314 [inline] NF_HOOK include/linux/netfilter.h:308 [inline] ipv6_rcv+0x24e/0x380 net/ipv6/ip6_input.c:310 __netif_receive_skb_one_core+0x115/0x180 net/core/dev.c:5529 __netif_receive_skb+0x1f/0x1b0 net/core/dev.c:5643 netif_receive_skb_internal net/core/dev.c:5729 [inline] netif_receive_skb+0x133/0x700 net/core/dev.c:5788 tun_rx_batched+0x429/0x780 drivers/net/tun.c:1579 tun_get_user+0x29e3/0x3bc0 drivers/net/tun.c:2002 tun_chr_write_iter+0xe8/0x210 drivers/net/tun.c:2048 call_write_iter include/linux/fs.h:2020 [inline] new_sync_write fs/read_write.c:491 [inline] vfs_write+0x64f/0xdf0 fs/read_write.c:584 ksys_write+0x12f/0x250 fs/read_write.c:637 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x40/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b Freed by task 5123: kasan_save_stack+0x33/0x50 mm/kasan/common.c:45 kasan_set_track+0x25/0x30 mm/kasan/common.c:52 kasan_save_free_info+0x2b/0x40 mm/kasan/generic.c:522 ____kasan_slab_free mm/kasan/common.c:236 [inline] ____kasan_slab_free+0x15b/0x1b0 mm/kasan/common.c:200 kasan_slab_free include/linux/kasan.h:164 [inline] slab_free_hook mm/slub.c:1800 [inline] slab_free_freelist_hook+0x114/0x1e0 mm/slub.c:1826 slab_free mm/slub.c:3809 [inline] __kmem_cache_free+0xc0/0x180 mm/slub.c:3822 rcu_do_batch kernel/rcu/tree.c:2158 [inline] rcu_core+0x819/0x1680 kernel/rcu/tree.c:2431 __do_softirq+0x21a/0x8de kernel/softirq.c:553 Last potentially related work creation: kasan_save_stack+0x33/0x50 mm/kasan/common.c:45 __kasan_record_aux_stack+0xbc/0xd0 mm/kasan/generic.c:492 __call_rcu_common.constprop.0+0x9a/0x7a0 kernel/rcu/tree.c:2681 fib6_info_release include/net/ip6_fib.h:332 [inline] fib6_info_release include/net/ip6_fib.h:329 [inline] rt6_route_rcv+0xa4e/0xbf0 net/ipv6/route.c:997 ndisc_router_discovery+0x138b/0x3560 net/ipv6/ndisc.c:1529 ndisc_rcv+0x3de/0x5f0 net/ipv6/ndisc.c:1856 icmpv6_rcv+0x1470/0x19c0 net/ipv6/icmp.c:979 ip6_protocol_deliver_rcu+0x170/0x13e0 net/ipv6/ip6_input.c:438 ip6_input_finish+0x14f/0x2f0 net/ipv6/ip6_input.c:483 NF_HOOK include/linux/netfilter.h:314 [inline] NF_HOOK include/linux/netfilter.h:308 [inline] ip6_input+0xa1/0xc0 net/ipv6/ip6_input.c:492 ip6_mc_input+0x48b/0xf40 net/ipv6/ip6_input.c:586 dst_input include/net/dst.h:461 [inline] ip6_rcv_finish net/ipv6/ip6_input.c:79 [inline] NF_HOOK include/linux/netfilter.h:314 [inline] NF_HOOK include/linux/netfilter.h:308 [inline] ipv6_rcv+0x24e/0x380 net/ipv6/ip6_input.c:310 __netif_receive_skb_one_core+0x115/0x180 net/core/dev.c:5529 __netif_receive_skb+0x1f/0x1b0 net/core/dev.c:5643 netif_receive_skb_internal net/core/dev.c:5729 [inline] netif_receive_skb+0x133/0x700 net/core/dev.c:5788 tun_rx_batched+0x429/0x780 drivers/net/tun.c:1579 tun_get_user+0x29e3/0x3bc0 drivers/net/tun.c:2002 tun_chr_write_iter+0xe8/0x210 drivers/net/tun.c:2048 call_write_iter include/linux/fs.h:2020 [inline] new_sync_write fs/read_write.c:491 [inline] vfs_write+0x64f/0xdf0 fs/read_write.c:584 ksys_write+0x12f/0x250 fs/read_write.c:637 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x40/0x110 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0x6b Second to last potentially related work creation: kasan_save_stack+0x33/0x50 mm/kasan/common.c:45 __kasan_record_aux_stack+0xbc/0xd0 mm/kasan/generic.c:492 insert_work+0x38/0x230 kernel/workqueue.c:1647 __queue_work+0xcdc/0x11f0 kernel/workqueue.c:1803 call_timer_fn+0x193/0x590 kernel/time/timer.c:1700 expire_timers kernel/time/timer.c:1746 [inline] __run_timers+0x585/0xb20 kernel/time/timer.c:2022 run_timer_softirq+0x58/0xd0 kernel/time/timer.c:2035 __do_softirq+0x21a/0x8de kernel/softirq.c:553 The buggy address belongs to the object at ffff88802805a800 which belongs to the cache kmalloc-512 of size 512 The buggy address is located 64 bytes inside of freed 512-byte region [ffff88802805a800, ffff88802805aa00) The buggy address belongs to the physical page: page:ffffea0000a01600 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x28058 head:ffffea0000a01600 order:2 entire_mapcount:0 nr_pages_mapped:0 pincount:0 flags: 0xfff00000000840(slab|head|node=0|zone=1|lastcpupid=0x7ff) page_type: 0xffffffff() raw: 00fff00000000840 ffff888013041c80 ffffea0001e02600 dead000000000002 raw: 0000000000000000 0000000000100010 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected page_owner tracks the page as allocated page last allocated via order 2, migratetype Unmovable, gfp_mask 0x1d20c0(__GFP_IO|__GFP_FS|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP|__GFP_NOMEMALLOC|__GFP_HARDWALL), pid 18706, tgid 18699 (syz-executor.2), ts 999991973280, free_ts 996884464281 set_page_owner include/linux/page_owner.h:31 [inline] post_alloc_hook+0x2d0/0x350 mm/page_alloc.c:1537 prep_new_page mm/page_alloc.c:1544 [inline] get_page_from_freelist+0xa25/0x36d0 mm/page_alloc.c:3312 __alloc_pages+0x22e/0x2420 mm/page_alloc.c:4568 alloc_pages_mpol+0x258/0x5f0 mm/mempolicy.c:2133 alloc_slab_page mm/slub.c:1870 [inline] allocate_slab mm/slub.c:2017 [inline] new_slab+0x283/0x3c0 mm/slub.c:2070 ___slab_alloc+0x979/0x1500 mm/slub.c:3223 __slab_alloc.constprop.0+0x56/0xa0 mm/slub.c:3322 __slab_alloc_node mm/slub.c:3375 [inline] slab_alloc_node mm/slub.c:3468 [inline] __kmem_cache_alloc_node+0x131/0x310 mm/slub.c:3517 __do_kmalloc_node mm/slab_common.c:1006 [inline] __kmalloc+0x49/0x90 mm/slab_common.c:1020 kmalloc include/linux/slab.h:604 [inline] kzalloc include/linux/slab.h:721 [inline] copy_splice_read+0x1ac/0x8f0 fs/splice.c:338 vfs_splice_read fs/splice.c:992 [inline] vfs_splice_read+0x2ea/0x3b0 fs/splice.c:962 splice_direct_to_actor+0x2a5/0xa30 fs/splice.c:1069 do_splice_direct+0x1af/0x280 fs/splice.c:1194 do_sendfile+0xb3e/0x1310 fs/read_write.c:1254 __do_sys_sendfile64 fs/read_write.c:1322 [inline] __se_sys_sendfile64 fs/read_write.c:1308 [inline] __x64_sys_sendfile64+0x1d6/0x220 fs/read_write.c:1308 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x40/0x110 arch/x86/entry/common.c:82 page last free stack trace: reset_page_owner include/linux/page_owner.h:24 [inline] free_pages_prepare mm/page_alloc.c:1137 [inline] free_unref_page_prepare+0x4fa/0xaa0 mm/page_alloc.c:2347 free_unref_page_list+0xe6/0xb40 mm/page_alloc.c:2533 release_pages+0x32a/0x14f0 mm/swap.c:1042 tlb_batch_pages_flush+0x9a/0x190 mm/mmu_gather.c:98 tlb_flush_mmu_free mm/mmu_gather.c:293 [inline] tlb_flush_mmu mm/mmu_gather.c:300 [inline] tlb_finish_mmu+0x14b/0x6f0 mm/mmu_gather.c:392 exit_mmap+0x38b/0xa70 mm/mmap.c:3321 __mmput+0x12a/0x4d0 kernel/fork.c:1349 mmput+0x62/0x70 kernel/fork.c:1371 exit_mm kernel/exit.c:567 [inline] do_exit+0x9ad/0x2ae0 kernel/exit.c:858 do_group_exit+0xd4/0x2a0 kernel/exit.c:1021 get_signal+0x23be/0x2790 kernel/signal.c:2904 arch_do_signal_or_restart+0x90/0x7f0 arch/x86/kernel/signal.c:309 exit_to_user_mode_loop kernel/entry/common.c:168 [inline] exit_to_user_mode_prepare+0x121/0x240 kernel/entry/common.c:204 irqentry_exit_to_user_mode+0xa/0x40 kernel/entry/common.c:309 asm_sysvec_apic_timer_interrupt+0x1a/0x20 arch/x86/include/asm/idtentry.h:645 Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20231205173250.2982846-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/ip6_fib.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 1ba9f4ddf2f6..e1e7a894863a 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -328,8 +328,11 @@ static inline bool fib6_info_hold_safe(struct fib6_info *f6i) static inline void fib6_info_release(struct fib6_info *f6i) { - if (f6i && refcount_dec_and_test(&f6i->fib6_ref)) + if (f6i && refcount_dec_and_test(&f6i->fib6_ref)) { + DEBUG_NET_WARN_ON_ONCE(fib6_has_expires(f6i)); + DEBUG_NET_WARN_ON_ONCE(!hlist_unhashed(&f6i->gc_link)); call_rcu(&f6i->rcu, fib6_info_destroy_rcu); + } } enum fib6_walk_state { -- cgit v1.2.3 From 3bc05faf37876f99e2a7baffa9c66fdcfb11d1f7 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 5 Dec 2023 17:42:30 +0100 Subject: net: dsa: microchip: properly support platform_data probing The ksz driver has bits and pieces of platform_data probing support, but it doesn't work. The conventional thing to do is to have an encapsulating structure for struct dsa_chip_data that gets put into dev->platform_data. This driver expects a struct ksz_platform_data, but that doesn't contain a struct dsa_chip_data as first element, which will obviously not work with dsa_switch_probe() -> dsa_switch_parse(). Pointing dev->platform_data to a struct dsa_chip_data directly is in principle possible, but that doesn't work either. The driver has ksz_switch_detect() to read the device ID from hardware, followed by ksz_check_device_id() to compare it against a predetermined expected value. This protects against early errors in the SPI/I2C communication. With platform_data, the mechanism in ksz_check_device_id() doesn't work and even leads to NULL pointer dereferences, since of_device_get_match_data() doesn't work in that probe path. So obviously, the platform_data support is actually missing, and the existing handling of struct ksz_platform_data is bogus. Complete the support by adding a struct dsa_chip_data as first element, and fixing up ksz_check_device_id() to pick up the platform_data instead of the unavailable of_device_get_match_data(). The early dev->chip_id assignment from ksz_switch_register() is also bogus, because ksz_switch_detect() sets it to an initial value. So remove it. Also, ksz_platform_data :: enabled_ports isn't used anywhere, delete it. Link: https://lore.kernel.org/netdev/20231204154315.3906267-1-dd@embedd.com/ Signed-off-by: Vladimir Oltean Signed-off-by: Daniel Danzberger Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/platform_data/microchip-ksz.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/platform_data/microchip-ksz.h b/include/linux/platform_data/microchip-ksz.h index ea1cc6d829e9..6480bf4af0fb 100644 --- a/include/linux/platform_data/microchip-ksz.h +++ b/include/linux/platform_data/microchip-ksz.h @@ -20,10 +20,12 @@ #define __MICROCHIP_KSZ_H #include +#include struct ksz_platform_data { + /* Must be first such that dsa_register_switch() can access it */ + struct dsa_chip_data cd; u32 chip_id; - u16 enabled_ports; }; #endif -- cgit v1.2.3 From d16f1096b320d42e41ad9dee4d4098afd140d3e1 Mon Sep 17 00:00:00 2001 From: Daniel Danzberger Date: Tue, 5 Dec 2023 17:42:31 +0100 Subject: net: dsa: microchip: move ksz_chip_id enum to platform include With the ksz_chip_id enums moved to the platform include file for ksz switches, platform code that instantiates a device can now use these to set ksz_platform_data::chip_id. Signed-off-by: Daniel Danzberger Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/platform_data/microchip-ksz.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include') diff --git a/include/linux/platform_data/microchip-ksz.h b/include/linux/platform_data/microchip-ksz.h index 6480bf4af0fb..f177416635a2 100644 --- a/include/linux/platform_data/microchip-ksz.h +++ b/include/linux/platform_data/microchip-ksz.h @@ -22,6 +22,25 @@ #include #include +enum ksz_chip_id { + KSZ8563_CHIP_ID = 0x8563, + KSZ8795_CHIP_ID = 0x8795, + KSZ8794_CHIP_ID = 0x8794, + KSZ8765_CHIP_ID = 0x8765, + KSZ8830_CHIP_ID = 0x8830, + KSZ9477_CHIP_ID = 0x00947700, + KSZ9896_CHIP_ID = 0x00989600, + KSZ9897_CHIP_ID = 0x00989700, + KSZ9893_CHIP_ID = 0x00989300, + KSZ9563_CHIP_ID = 0x00956300, + KSZ9567_CHIP_ID = 0x00956700, + LAN9370_CHIP_ID = 0x00937000, + LAN9371_CHIP_ID = 0x00937100, + LAN9372_CHIP_ID = 0x00937200, + LAN9373_CHIP_ID = 0x00937300, + LAN9374_CHIP_ID = 0x00937400, +}; + struct ksz_platform_data { /* Must be first such that dsa_register_switch() can access it */ struct dsa_chip_data cd; -- cgit v1.2.3 From 2a48c635fd9a48699805bbfeee1e4b94b8fe819d Mon Sep 17 00:00:00 2001 From: "justinstitt@google.com" Date: Wed, 6 Dec 2023 23:16:10 +0000 Subject: ethtool: Implement ethtool_puts() Use strscpy() to implement ethtool_puts(). Functionally the same as ethtool_sprintf() when it's used with two arguments or with just "%s" format specifier. Signed-off-by: Justin Stitt Reviewed-by: Przemek Kitszel Reviewed-by: Andrew Lunn Reviewed-by: Madhuri Sripada Signed-off-by: David S. Miller --- include/linux/ethtool.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index c2bb74143eda..deb683d3360f 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -1061,6 +1061,19 @@ int ethtool_get_ts_info_by_layer(struct net_device *dev, struct ethtool_ts_info */ extern __printf(2, 3) void ethtool_sprintf(u8 **data, const char *fmt, ...); +/** + * ethtool_puts - Write string to ethtool string data + * @data: Pointer to a pointer to the start of string to update + * @str: String to write + * + * Write string to *data without a trailing newline. Update *data + * to point at start of next string. + * + * Prefer this function to ethtool_sprintf() when given only + * two arguments or if @fmt is just "%s". + */ +extern void ethtool_puts(u8 **data, const char *str); + /* Link mode to forced speed capabilities maps */ struct ethtool_forced_speed_map { u32 speed; -- cgit v1.2.3 From d9f28735af8781d9c8c6c406c2a102090644133d Mon Sep 17 00:00:00 2001 From: David Laight Date: Wed, 6 Dec 2023 13:44:20 +0000 Subject: Use READ/WRITE_ONCE() for IP local_port_range. Commit 227b60f5102cd added a seqlock to ensure that the low and high port numbers were always updated together. This is overkill because the two 16bit port numbers can be held in a u32 and read/written in a single instruction. More recently 91d0b78c5177f added support for finer per-socket limits. The user-supplied value is 'high << 16 | low' but they are held separately and the socket options protected by the socket lock. Use a u32 containing 'high << 16 | low' for both the 'net' and 'sk' fields and use READ_ONCE()/WRITE_ONCE() to ensure both values are always updated together. Change (the now trival) inet_get_local_port_range() to a static inline to optimise the calling code. (In particular avoiding returning integers by reference.) Signed-off-by: David Laight Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Acked-by: Mat Martineau Reviewed-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/4e505d4198e946a8be03fb1b4c3072b0@AcuMS.aculab.com Signed-off-by: Jakub Kicinski --- include/net/inet_sock.h | 5 +---- include/net/ip.h | 8 +++++++- include/net/netns/ipv4.h | 3 +-- 3 files changed, 9 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 74db6d97cae1..aa86453f6b9b 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -234,10 +234,7 @@ struct inet_sock { int uc_index; int mc_index; __be32 mc_addr; - struct { - __u16 lo; - __u16 hi; - } local_port_range; + u32 local_port_range; /* high << 16 | low */ struct ip_mc_socklist __rcu *mc_list; struct inet_cork_full cork; diff --git a/include/net/ip.h b/include/net/ip.h index 1fc4c8d69e33..b31be912489a 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -349,7 +349,13 @@ static inline u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_o } \ } -void inet_get_local_port_range(const struct net *net, int *low, int *high); +static inline void inet_get_local_port_range(const struct net *net, int *low, int *high) +{ + u32 range = READ_ONCE(net->ipv4.ip_local_ports.range); + + *low = range & 0xffff; + *high = range >> 16; +} void inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high); #ifdef CONFIG_SYSCTL diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index ea882964c71e..c356c458b340 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -19,8 +19,7 @@ struct hlist_head; struct fib_table; struct sock; struct local_ports { - seqlock_t lock; - int range[2]; + u32 range; /* high << 16 | low */ bool warned; }; -- cgit v1.2.3 From 172db56d90d29e47e7d0d64885d5dbd92c87ec42 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 6 Dec 2023 12:59:07 -0800 Subject: netlink: Return unsigned value for nla_len() The return value from nla_len() is never expected to be negative, and can never be more than struct nlattr::nla_len (a u16). Adjust the prototype on the function. This will let GCC's value range optimization passes know that the return can never be negative, and can never be larger than u16. As recently discussed[1], this silences the following warning in GCC 12+: net/wireless/nl80211.c: In function 'nl80211_set_cqm_rssi.isra': net/wireless/nl80211.c:12892:17: warning: 'memcpy' specified bound 18446744073709551615 exceeds maximum object size 9223372036854775807 [-Wstringop-overflow=] 12892 | memcpy(cqm_config->rssi_thresholds, thresholds, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 12893 | flex_array_size(cqm_config, rssi_thresholds, | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 12894 | n_thresholds)); | ~~~~~~~~~~~~~~ A future change would be to clamp the subtraction to make sure it never wraps around if nla_len is somehow less than NLA_HDRLEN, which would have the additional benefit of being defensive in the face of nlattr corruption or logic errors. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202311090752.hWcJWAHL-lkp@intel.com/ [1] Cc: Johannes Berg Cc: Jeff Johnson Cc: Michael Walle Cc: Max Schulze Link: https://lore.kernel.org/r/20231202202539.it.704-kees@kernel.org Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20231206205904.make.018-kees@kernel.org Signed-off-by: Jakub Kicinski --- include/net/netlink.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netlink.h b/include/net/netlink.h index 167b91348e57..28039e57070a 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -1214,7 +1214,7 @@ static inline void *nla_data(const struct nlattr *nla) * nla_len - length of payload * @nla: netlink attribute */ -static inline int nla_len(const struct nlattr *nla) +static inline u16 nla_len(const struct nlattr *nla) { return nla->nla_len - NLA_HDRLEN; } -- cgit v1.2.3 From 92e1567ee3e3f6f160e320890ac77eec50bf8e7d Mon Sep 17 00:00:00 2001 From: Andrei Matei Date: Thu, 7 Dec 2023 22:25:17 -0500 Subject: bpf: Add some comments to stack representation Add comments to the datastructure tracking the stack state, as the mapping between each stack slot and where its state is stored is not entirely obvious. Signed-off-by: Andrei Matei Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/bpf/20231208032519.260451-2-andreimatei1@gmail.com --- include/linux/bpf_verifier.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index bada59812e00..314b679fb494 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -321,7 +321,17 @@ struct bpf_func_state { /* The following fields should be last. See copy_func_state() */ int acquired_refs; struct bpf_reference_state *refs; + /* The state of the stack. Each element of the array describes BPF_REG_SIZE + * (i.e. 8) bytes worth of stack memory. + * stack[0] represents bytes [*(r10-8)..*(r10-1)] + * stack[1] represents bytes [*(r10-16)..*(r10-9)] + * ... + * stack[allocated_stack/8 - 1] represents [*(r10-allocated_stack)..*(r10-allocated_stack+7)] + */ struct bpf_stack_state *stack; + /* Size of the current stack, in bytes. The stack state is tracked below, in + * `stack`. allocated_stack is always a multiple of BPF_REG_SIZE. + */ int allocated_stack; }; @@ -658,6 +668,10 @@ struct bpf_verifier_env { int exception_callback_subprog; bool explore_alu_limits; bool allow_ptr_leaks; + /* Allow access to uninitialized stack memory. Writes with fixed offset are + * always allowed, so this refers to reads (with fixed or variable offset), + * to writes with variable offset and to indirect (helper) accesses. + */ bool allow_uninit_stack; bool bpf_capable; bool bypass_spec_v1; -- cgit v1.2.3 From a3c205d0560f63ff02516b6d9fc3348dc34251c8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 Dec 2023 20:13:22 +0000 Subject: ipv6: do not check fib6_has_expires() in fib6_info_release() My prior patch went a bit too far, because apparently fib6_has_expires() could be true while f6i->gc_link is not hashed yet. fib6_set_expires_locked() can indeed set RTF_EXPIRES while f6i->fib6_table is NULL. Original syzbot reports were about corruptions caused by dangling f6i->gc_link. Fixes: 5a08d0065a91 ("ipv6: add debug checks in fib6_info_release()") Reported-by: syzbot+c15aa445274af8674f41@syzkaller.appspotmail.com Signed-off-by: Eric Dumazet Cc: Kui-Feng Lee Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20231207201322.549000-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/ip6_fib.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index e1e7a894863a..95ed495c3a40 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -329,7 +329,6 @@ static inline bool fib6_info_hold_safe(struct fib6_info *f6i) static inline void fib6_info_release(struct fib6_info *f6i) { if (f6i && refcount_dec_and_test(&f6i->fib6_ref)) { - DEBUG_NET_WARN_ON_ONCE(fib6_has_expires(f6i)); DEBUG_NET_WARN_ON_ONCE(!hlist_unhashed(&f6i->gc_link)); call_rcu(&f6i->rcu, fib6_info_destroy_rcu); } -- cgit v1.2.3 From 2ebe81c814355d000fe49d9c4213983844dcb32b Mon Sep 17 00:00:00 2001 From: Aleksander Lobakin Date: Wed, 6 Dec 2023 21:59:19 +0100 Subject: net, xdp: Allow metadata > 32 32 bytes may be not enough for some custom metadata. Relax the restriction, allow metadata larger than 32 bytes and make __skb_metadata_differs() work with bigger lengths. Now size of metadata is only limited by the fact it is stored as u8 in skb_shared_info, so maximum possible value is 255. Size still has to be aligned to 4, so the actual upper limit becomes 252. Most driver implementations will offer less, none can offer more. Other important conditions, such as having enough space for xdp_frame building, are already checked in bpf_xdp_adjust_meta(). Signed-off-by: Aleksander Lobakin Signed-off-by: Larysa Zaremba Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/eb87653c-8ff8-447d-a7a1-25961f60518a@kernel.org Link: https://lore.kernel.org/bpf/20231206205919.404415-3-larysa.zaremba@intel.com --- include/linux/skbuff.h | 13 ++++++++----- include/net/xdp.h | 7 ++++++- 2 files changed, 14 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b370eb8d70f7..df6ef42639d8 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4247,10 +4247,13 @@ static inline bool __skb_metadata_differs(const struct sk_buff *skb_a, { const void *a = skb_metadata_end(skb_a); const void *b = skb_metadata_end(skb_b); - /* Using more efficient varaiant than plain call to memcmp(). */ -#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 u64 diffs = 0; + if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || + BITS_PER_LONG != 64) + goto slow; + + /* Using more efficient variant than plain call to memcmp(). */ switch (meta_len) { #define __it(x, op) (x -= sizeof(u##op)) #define __it_diff(a, b, op) (*(u##op *)__it(a, op)) ^ (*(u##op *)__it(b, op)) @@ -4270,11 +4273,11 @@ static inline bool __skb_metadata_differs(const struct sk_buff *skb_a, fallthrough; case 4: diffs |= __it_diff(a, b, 32); break; + default: +slow: + return memcmp(a - meta_len, b - meta_len, meta_len); } return diffs; -#else - return memcmp(a - meta_len, b - meta_len, meta_len); -#endif } static inline bool skb_metadata_differs(const struct sk_buff *skb_a, diff --git a/include/net/xdp.h b/include/net/xdp.h index 349c36fb5fd8..5d3673afc037 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -369,7 +369,12 @@ xdp_data_meta_unsupported(const struct xdp_buff *xdp) static inline bool xdp_metalen_invalid(unsigned long metalen) { - return (metalen & (sizeof(__u32) - 1)) || (metalen > 32); + unsigned long meta_max; + + meta_max = type_max(typeof_member(struct skb_shared_info, meta_len)); + BUILD_BUG_ON(!__builtin_constant_p(meta_max)); + + return !IS_ALIGNED(metalen, sizeof(u32)) || metalen > meta_max; } struct xdp_attachment_info { -- cgit v1.2.3 From c5e2a973448d958feb7881e4d875eac59fdeff3d Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Fri, 8 Dec 2023 16:28:41 -0300 Subject: rtnl: add helper to check if rtnl group has listeners As of today, rtnl code creates a new skb and unconditionally fills and broadcasts it to the relevant group. For most operations this is okay and doesn't waste resources in general. When operations are done without the rtnl_lock, as in tc-flower, such skb allocation, message fill and no-op broadcasting can happen in all cores of the system, which contributes to system pressure and wastes precious cpu cycles when no one will receive the built message. Introduce this helper so rtnetlink operations can simply check if someone is listening and then proceed if necessary. Reviewed-by: Jiri Pirko Reviewed-by: Simon Horman Signed-off-by: Jamal Hadi Salim Signed-off-by: Victor Nogueira Signed-off-by: Pedro Tammela Link: https://lore.kernel.org/r/20231208192847.714940-2-pctammela@mojatatu.com Signed-off-by: Jakub Kicinski --- include/linux/rtnetlink.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 3d6cf306cd55..a7d757e96c55 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -130,4 +130,11 @@ extern int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, extern void rtnl_offload_xstats_notify(struct net_device *dev); +static inline int rtnl_has_listeners(const struct net *net, u32 group) +{ + struct sock *rtnl = net->rtnl; + + return netlink_has_listeners(rtnl, group); +} + #endif /* __LINUX_RTNETLINK_H */ -- cgit v1.2.3 From 8439109b76a3c405808383bf9dd532fc4b9c2dbd Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Fri, 8 Dec 2023 16:28:42 -0300 Subject: rtnl: add helper to check if a notification is needed Building on the rtnl_has_listeners helper, add the rtnl_notify_needed helper to check if we can bail out early in the notification routines. Reviewed-by: Jiri Pirko Reviewed-by: Simon Horman Signed-off-by: Victor Nogueira Signed-off-by: Pedro Tammela Link: https://lore.kernel.org/r/20231208192847.714940-3-pctammela@mojatatu.com Signed-off-by: Jakub Kicinski --- include/linux/rtnetlink.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index a7d757e96c55..0cbbbded0331 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -137,4 +137,19 @@ static inline int rtnl_has_listeners(const struct net *net, u32 group) return netlink_has_listeners(rtnl, group); } +/** + * rtnl_notify_needed - check if notification is needed + * @net: Pointer to the net namespace + * @nlflags: netlink ingress message flags + * @group: rtnl group + * + * Based on the ingress message flags and rtnl group, returns true + * if a notification is needed, false otherwise. + */ +static inline bool +rtnl_notify_needed(const struct net *net, u16 nlflags, u32 group) +{ + return (nlflags & NLM_F_ECHO) || rtnl_has_listeners(net, group); +} + #endif /* __LINUX_RTNETLINK_H */ -- cgit v1.2.3 From ddb6b284bdc32b6e218b3d90b5a745ea26620812 Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Fri, 8 Dec 2023 16:28:43 -0300 Subject: rtnl: add helper to send if skb is not null This is a convenience helper for routines handling conditional rtnl events, that is code that might send a notification depending on rtnl_has_listeners/rtnl_notify_needed. Instead of: if (skb) rtnetlink_send(...) Use: rtnetlink_maybe_send(...) Reviewed-by: Jiri Pirko Reviewed-by: Simon Horman Signed-off-by: Pedro Tammela Link: https://lore.kernel.org/r/20231208192847.714940-4-pctammela@mojatatu.com Signed-off-by: Jakub Kicinski --- include/linux/rtnetlink.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 0cbbbded0331..6a8543b34e2c 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -10,6 +10,13 @@ #include extern int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, u32 group, int echo); + +static inline int rtnetlink_maybe_send(struct sk_buff *skb, struct net *net, + u32 pid, u32 group, int echo) +{ + return !skb ? 0 : rtnetlink_send(skb, net, pid, group, echo); +} + extern int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid); extern void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, const struct nlmsghdr *nlh, gfp_t flags); -- cgit v1.2.3 From 1a1ad782dcbbacd9e8d4e2e7ff1bf14d1db80727 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 4 Dec 2023 15:39:21 -0800 Subject: bpf: tidy up exception callback management a bit Use the fact that we are passing subprog index around and have a corresponding struct bpf_subprog_info in bpf_verifier_env for each subprogram. We don't need to separately pass around a flag whether subprog is exception callback or not, each relevant verifier function can determine this using provided subprog index if we maintain bpf_subprog_info properly. Also move out exception callback-specific logic from btf_prepare_func_args(), keeping it generic. We can enforce all these restriction right before exception callback verification pass. We add out parameter, arg_cnt, for now, but this will be unnecessary with subsequent refactoring and will be removed. Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20231204233931.49758-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c1a06263a4f3..0bd4889e917a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2494,7 +2494,7 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs); int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, - struct bpf_reg_state *reg, bool is_ex_cb); + struct bpf_reg_state *reg, u32 *nargs); int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog, struct btf *btf, const struct btf_type *t); const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt, -- cgit v1.2.3 From 406a6fa44bfbc8563f0612b08d43df2fa65e8bc5 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 4 Dec 2023 15:39:22 -0800 Subject: bpf: use bitfields for simple per-subprog bool flags We have a bunch of bool flags for each subprog. Instead of wasting bytes for them, use bitfields instead. Signed-off-by: Andrii Nakryiko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20231204233931.49758-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 314b679fb494..c2819a6579a5 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -611,12 +611,12 @@ struct bpf_subprog_info { u32 start; /* insn idx of function entry point */ u32 linfo_idx; /* The idx to the main_prog->aux->linfo */ u16 stack_depth; /* max. stack depth used by this function */ - bool has_tail_call; - bool tail_call_reachable; - bool has_ld_abs; - bool is_cb; - bool is_async_cb; - bool is_exception_cb; + bool has_tail_call: 1; + bool tail_call_reachable: 1; + bool has_ld_abs: 1; + bool is_cb: 1; + bool is_async_cb: 1; + bool is_exception_cb: 1; }; struct bpf_verifier_env; -- cgit v1.2.3 From 10fa22b6fb68198ab6bf2ab468a11c54f083761d Mon Sep 17 00:00:00 2001 From: Evan Quan Date: Mon, 11 Dec 2023 18:06:24 +0800 Subject: wifi: cfg80211: expose nl80211_chan_width_to_mhz for wide sharing The newly added WBRF feature needs this interface for channel width calculation. Signed-off-by: Evan Quan Signed-off-by: Ma Jun Reviewed-by: Mario Limonciello Link: https://msgid.link/20231211100630.2170152-4-Jun.Ma2@amd.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index d59669d86718..324a5f710ad3 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -976,6 +976,15 @@ const struct cfg80211_chan_def * cfg80211_chandef_compatible(const struct cfg80211_chan_def *chandef1, const struct cfg80211_chan_def *chandef2); +/** + * nl80211_chan_width_to_mhz - get the channel width in MHz + * @chan_width: the channel width from &enum nl80211_chan_width + * + * Return: channel width in MHz if the chan_width from &enum nl80211_chan_width + * is valid. -1 otherwise. + */ +int nl80211_chan_width_to_mhz(enum nl80211_chan_width chan_width); + /** * cfg80211_chandef_valid - check if a channel definition is valid * @chandef: the channel definition to check -- cgit v1.2.3 From aa0887c4f18e280f8c2aa6964af602bd16c37f54 Mon Sep 17 00:00:00 2001 From: Vinayak Yadawad Date: Wed, 29 Nov 2023 18:20:43 +0530 Subject: wifi: nl80211: Extend del pmksa support for SAE and OWE security Current handling of del pmksa with SSID is limited to FILS security. In the current change the del pmksa support is extended to SAE/OWE security offloads as well. For OWE/SAE offloads, the PMK is generated and cached at driver/FW, so user app needs the capability to request cache deletion based on SSID for drivers supporting SAE/OWE offload. Signed-off-by: Vinayak Yadawad Link: https://msgid.link/ecdae726459e0944c377a6a6f6cb2c34d2e057d0.1701262123.git.vinayak.yadawad@broadcom.com [drop whitespace-damaged rdev_ops pointer completely, enabling tracing] Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 0cd1da2c2902..8f42d598e285 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -568,7 +568,8 @@ * @NL80211_CMD_DEL_PMKSA: Delete a PMKSA cache entry, using %NL80211_ATTR_MAC * (for the BSSID) and %NL80211_ATTR_PMKID or using %NL80211_ATTR_SSID, * %NL80211_ATTR_FILS_CACHE_ID, and %NL80211_ATTR_PMKID in case of FILS - * authentication. + * authentication. Additionally in case of SAE offload and OWE offloads + * PMKSA entry can be deleted using %NL80211_ATTR_SSID. * @NL80211_CMD_FLUSH_PMKSA: Flush all PMKSA cache entries. * * @NL80211_CMD_REG_CHANGE: indicates to userspace the regulatory domain -- cgit v1.2.3 From d02a12b8e4bbd188f38321849791af02d494c7fd Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 11 Dec 2023 09:05:20 +0200 Subject: wifi: cfg80211: add BSS usage reporting Sometimes there may be reasons for which a BSS that's actually found in scan cannot be used to connect to, for example a nonprimary link of an NSTR mobile AP MLD cannot be used for normal direct connections to it. Not indicating these to userspace as we do now of course avoids being able to connect to them, but it's better if they're shown to userspace and it can make an appropriate decision, without e.g. doing an additional ML probe. Thus add an indication of what a BSS can be used for, currently "normal" and "MLD link", including a reason bitmap for it being not usable. The latter can be extended later for certain BSSes if there are other reasons they cannot be used. Signed-off-by: Johannes Berg Reviewed-by: Ilan Peer Reviewed-by: Gregory Greenman Signed-off-by: Miri Korenblit Link: https://msgid.link/20231211085121.0464f25e0b1d.I9f70ca9f1440565ad9a5207d0f4d00a20cca67e7@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 60 ++++++++++++++++++++++++++++++++++++++------ include/uapi/linux/nl80211.h | 40 +++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 324a5f710ad3..cabe57a00eaf 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2828,6 +2828,13 @@ enum cfg80211_signal_type { * the BSS that requested the scan in which the beacon/probe was received. * @chains: bitmask for filled values in @chain_signal. * @chain_signal: per-chain signal strength of last received BSS in dBm. + * @restrict_use: restrict usage, if not set, assume @use_for is + * %NL80211_BSS_USE_FOR_NORMAL. + * @use_for: bitmap of possible usage for this BSS, see + * &enum nl80211_bss_use_for + * @cannot_use_reasons: the reasons (bitmap) for not being able to connect, + * if @restrict_use is set and @use_for is zero (empty); may be 0 for + * unspecified reasons; see &enum nl80211_bss_cannot_use_reasons * @drv_data: Data to be passed through to @inform_bss */ struct cfg80211_inform_bss { @@ -2839,6 +2846,9 @@ struct cfg80211_inform_bss { u8 chains; s8 chain_signal[IEEE80211_MAX_CHAINS]; + u8 restrict_use:1, use_for:7; + u8 cannot_use_reasons; + void *drv_data; }; @@ -2890,6 +2900,11 @@ struct cfg80211_bss_ies { * @chain_signal: per-chain signal strength of last received BSS in dBm. * @bssid_index: index in the multiple BSS set * @max_bssid_indicator: max number of members in the BSS set + * @use_for: bitmap of possible usage for this BSS, see + * &enum nl80211_bss_use_for + * @cannot_use_reasons: the reasons (bitmap) for not being able to connect, + * if @restrict_use is set and @use_for is zero (empty); may be 0 for + * unspecified reasons; see &enum nl80211_bss_cannot_use_reasons * @priv: private area for driver use, has at least wiphy->bss_priv_size bytes */ struct cfg80211_bss { @@ -2915,6 +2930,9 @@ struct cfg80211_bss { u8 bssid_index; u8 max_bssid_indicator; + u8 use_for; + u8 cannot_use_reasons; + u8 priv[] __aligned(sizeof(void *)); }; @@ -4922,6 +4940,8 @@ struct cfg80211_ops { * NL80211_REGDOM_SET_BY_DRIVER. * @WIPHY_FLAG_CHANNEL_CHANGE_ON_BEACON: reg_call_notifier() is called if driver * set this flag to update channels on beacon hints. + * @WIPHY_FLAG_SUPPORTS_NSTR_NONPRIMARY: support connection to non-primary link + * of an NSTR mobile AP MLD. */ enum wiphy_flags { WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK = BIT(0), @@ -4935,7 +4955,7 @@ enum wiphy_flags { WIPHY_FLAG_IBSS_RSN = BIT(8), WIPHY_FLAG_MESH_AUTH = BIT(10), WIPHY_FLAG_SUPPORTS_EXT_KCK_32 = BIT(11), - /* use hole at 12 */ + WIPHY_FLAG_SUPPORTS_NSTR_NONPRIMARY = BIT(12), WIPHY_FLAG_SUPPORTS_FW_ROAM = BIT(13), WIPHY_FLAG_AP_UAPSD = BIT(14), WIPHY_FLAG_SUPPORTS_TDLS = BIT(15), @@ -7173,6 +7193,25 @@ cfg80211_inform_bss(struct wiphy *wiphy, gfp); } +/** + * __cfg80211_get_bss - get a BSS reference + * @wiphy: the wiphy this BSS struct belongs to + * @channel: the channel to search on (or %NULL) + * @bssid: the desired BSSID (or %NULL) + * @ssid: the desired SSID (or %NULL) + * @ssid_len: length of the SSID (or 0) + * @bss_type: type of BSS, see &enum ieee80211_bss_type + * @privacy: privacy filter, see &enum ieee80211_privacy + * @use_for: indicates which use is intended + */ +struct cfg80211_bss *__cfg80211_get_bss(struct wiphy *wiphy, + struct ieee80211_channel *channel, + const u8 *bssid, + const u8 *ssid, size_t ssid_len, + enum ieee80211_bss_type bss_type, + enum ieee80211_privacy privacy, + u32 use_for); + /** * cfg80211_get_bss - get a BSS reference * @wiphy: the wiphy this BSS struct belongs to @@ -7182,13 +7221,20 @@ cfg80211_inform_bss(struct wiphy *wiphy, * @ssid_len: length of the SSID (or 0) * @bss_type: type of BSS, see &enum ieee80211_bss_type * @privacy: privacy filter, see &enum ieee80211_privacy + * + * This version implies regular usage, %NL80211_BSS_USE_FOR_NORMAL. */ -struct cfg80211_bss *cfg80211_get_bss(struct wiphy *wiphy, - struct ieee80211_channel *channel, - const u8 *bssid, - const u8 *ssid, size_t ssid_len, - enum ieee80211_bss_type bss_type, - enum ieee80211_privacy privacy); +static inline struct cfg80211_bss * +cfg80211_get_bss(struct wiphy *wiphy, struct ieee80211_channel *channel, + const u8 *bssid, const u8 *ssid, size_t ssid_len, + enum ieee80211_bss_type bss_type, + enum ieee80211_privacy privacy) +{ + return __cfg80211_get_bss(wiphy, channel, bssid, ssid, ssid_len, + bss_type, privacy, + NL80211_BSS_USE_FOR_NORMAL); +} + static inline struct cfg80211_bss * cfg80211_get_ibss(struct wiphy *wiphy, struct ieee80211_channel *channel, diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 8f42d598e285..07fc1fec4b12 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2831,6 +2831,10 @@ enum nl80211_commands { * @NL80211_ATTR_MLO_LINK_DISABLED: Flag attribute indicating that the link is * disabled. * + * @NL80211_ATTR_BSS_DUMP_INCLUDE_USE_DATA: Include BSS usage data, i.e. + * include BSSes that can only be used in restricted scenarios and/or + * cannot be used at all. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3369,6 +3373,8 @@ enum nl80211_attrs { NL80211_ATTR_MLO_LINK_DISABLED, + NL80211_ATTR_BSS_DUMP_INCLUDE_USE_DATA, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -5032,6 +5038,30 @@ enum nl80211_bss_scan_width { NL80211_BSS_CHAN_WIDTH_2, }; +/** + * enum nl80211_bss_use_for - bitmap indicating possible BSS use + * @NL80211_BSS_USE_FOR_NORMAL: Use this BSS for normal "connection", + * including IBSS/MBSS depending on the type. + * @NL80211_BSS_USE_FOR_MLD_LINK: This BSS can be used as a link in an + * MLO connection. Note that for an MLO connection, all links including + * the assoc link must have this flag set, and the assoc link must + * additionally have %NL80211_BSS_USE_FOR_NORMAL set. + */ +enum nl80211_bss_use_for { + NL80211_BSS_USE_FOR_NORMAL = 1 << 0, + NL80211_BSS_USE_FOR_MLD_LINK = 1 << 1, +}; + +/** + * enum nl80211_bss_cannot_use_reasons - reason(s) connection to a + * BSS isn't possible + * @NL80211_BSS_CANNOT_USE_NSTR_NONPRIMARY: NSTR nonprimary links aren't + * supported by the device, and this BSS entry represents one. + */ +enum nl80211_bss_cannot_use_reasons { + NL80211_BSS_CANNOT_USE_NSTR_NONPRIMARY = 1 << 0, +}; + /** * enum nl80211_bss - netlink attributes for a BSS * @@ -5084,6 +5114,14 @@ enum nl80211_bss_scan_width { * @NL80211_BSS_FREQUENCY_OFFSET: frequency offset in KHz * @NL80211_BSS_MLO_LINK_ID: MLO link ID of the BSS (u8). * @NL80211_BSS_MLD_ADDR: MLD address of this BSS if connected to it. + * @NL80211_BSS_USE_FOR: u32 bitmap attribute indicating what the BSS can be + * used for, see &enum nl80211_bss_use_for. + * @NL80211_BSS_CANNOT_USE_REASONS: Indicates the reason that this BSS cannot + * be used for all or some of the possible uses by the device reporting it, + * even though its presence was detected. + * This is a u64 attribute containing a bitmap of values from + * &enum nl80211_cannot_use_reasons, note that the attribute may be missing + * if no reasons are specified. * @__NL80211_BSS_AFTER_LAST: internal * @NL80211_BSS_MAX: highest BSS attribute */ @@ -5111,6 +5149,8 @@ enum nl80211_bss { NL80211_BSS_FREQUENCY_OFFSET, NL80211_BSS_MLO_LINK_ID, NL80211_BSS_MLD_ADDR, + NL80211_BSS_USE_FOR, + NL80211_BSS_CANNOT_USE_REASONS, /* keep last */ __NL80211_BSS_AFTER_LAST, -- cgit v1.2.3 From 9adc8b65218f70cbf50151fc6c2c40949e29eb4f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 11 Dec 2023 09:05:21 +0200 Subject: wifi: mac80211: update some locking documentation With the locking rework, more functions need to be called with the wiphy mutex held. Document that, and for that use the "Context" description that shows up more nicely in the generated documentation. Signed-off-by: Johannes Berg Reviewed-by: Gregory Greenman Signed-off-by: Miri Korenblit Link: https://msgid.link/20231211085121.24fa44c7eeb4.I8c9e030ddd78e07c99dd21fe1d5156555390f92e@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 580781ff9dcf..aa8e1055fc3a 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -5809,12 +5809,11 @@ void ieee80211_set_key_rx_seq(struct ieee80211_key_conf *keyconf, * ieee80211_remove_key - remove the given key * @keyconf: the parameter passed with the set key * + * Context: Must be called with the wiphy mutex held. + * * Remove the given key. If the key was uploaded to the hardware at the * time this function is called, it is not deleted in the hardware but * instead assumed to have been removed already. - * - * Note that due to locking considerations this function can (currently) - * only be called during key iteration (ieee80211_iter_keys().) */ void ieee80211_remove_key(struct ieee80211_key_conf *keyconf); @@ -6368,12 +6367,12 @@ ieee80211_txq_airtime_check(struct ieee80211_hw *hw, struct ieee80211_txq *txq); * @iter: iterator function that will be called for each key * @iter_data: custom data to pass to the iterator function * + * Context: Must be called with wiphy mutex held; can sleep. + * * This function can be used to iterate all the keys known to * mac80211, even those that weren't previously programmed into * the device. This is intended for use in WoWLAN if the device - * needs reprogramming of the keys during suspend. Note that due - * to locking reasons, it is also only safe to call this at few - * spots since it must hold the RTNL and be able to sleep. + * needs reprogramming of the keys during suspend. * * The order in which the keys are iterated matches the order * in which they were originally installed and handed to the @@ -7435,6 +7434,9 @@ static inline bool ieee80211_is_tx_data(struct sk_buff *skb) * @vif: interface to set active links on * @active_links: the new active links bitmap * + * Context: Must be called with wiphy mutex held; may sleep; calls + * back into the driver. + * * This changes the active links on an interface. The interface * must be in client mode (in AP mode, all links are always active), * and @active_links must be a subset of the vif's valid_links. @@ -7442,6 +7444,7 @@ static inline bool ieee80211_is_tx_data(struct sk_buff *skb) * If a link is switched off and another is switched on at the same * time (e.g. active_links going from 0x1 to 0x10) then you will get * a sequence of calls like + * * - change_vif_links(0x11) * - unassign_vif_chanctx(link_id=0) * - change_sta_links(0x11) for each affected STA (the AP) @@ -7451,10 +7454,6 @@ static inline bool ieee80211_is_tx_data(struct sk_buff *skb) * - change_sta_links(0x10) for each affected STA (the AP) * - assign_vif_chanctx(link_id=4) * - change_vif_links(0x10) - * - * Note: This function acquires some mac80211 locks and must not - * be called with any driver locks held that could cause a - * lock dependency inversion. Best call it without locks. */ int ieee80211_set_active_links(struct ieee80211_vif *vif, u16 active_links); -- cgit v1.2.3 From b61e6b41a2f6818ee7b8f92f670a8a6ebcd25a71 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Mon, 11 Dec 2023 09:05:22 +0200 Subject: wifi: cfg80211: Add support for setting TID to link mapping Add support for setting the TID to link mapping for a non-AP MLD station. This is useful in cases user space needs to restrict the possible set of active links, e.g., since it got a BSS Transition Management request forcing to use only a subset of the valid links etc. Signed-off-by: Ilan Peer Reviewed-by: Gregory Greenman Signed-off-by: Miri Korenblit Link: https://msgid.link/20231211085121.da4d56a5f3ff.Iacf88e943326bf9c169c49b728c4a3445fdedc97@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 18 ++++++++++++++++++ include/uapi/linux/nl80211.h | 19 +++++++++++++++++++ 2 files changed, 37 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index cabe57a00eaf..4d6b9d801c2f 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1673,6 +1673,21 @@ struct link_station_del_parameters { u32 link_id; }; +/** + * struct cfg80211_ttlm_params: TID to link mapping parameters + * + * Used for setting a TID to link mapping. + * + * @dlink: Downlink TID to link mapping, as defined in section 9.4.2.314 + * (TID-To-Link Mapping element) in Draft P802.11be_D4.0. + * @ulink: Uplink TID to link mapping, as defined in section 9.4.2.314 + * (TID-To-Link Mapping element) in Draft P802.11be_D4.0. + */ +struct cfg80211_ttlm_params { + u16 dlink[8]; + u16 ulink[8]; +}; + /** * struct station_parameters - station parameters * @@ -4523,6 +4538,7 @@ struct mgmt_frame_regs { * @del_link_station: Remove a link of a station. * * @set_hw_timestamp: Enable/disable HW timestamping of TM/FTM frames. + * @set_ttlm: set the TID to link mapping. */ struct cfg80211_ops { int (*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow); @@ -4882,6 +4898,8 @@ struct cfg80211_ops { struct link_station_del_parameters *params); int (*set_hw_timestamp)(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_set_hw_timestamp *hwts); + int (*set_ttlm)(struct wiphy *wiphy, struct net_device *dev, + struct cfg80211_ttlm_params *params); }; /* diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 07fc1fec4b12..2d8468cbc457 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1328,6 +1328,11 @@ * Multi-Link reconfiguration. %NL80211_ATTR_MLO_LINKS is used to provide * information about the removed STA MLD setup links. * + * @NL80211_CMD_SET_TID_TO_LINK_MAPPING: Set the TID to Link Mapping for a + * non-AP MLD station. The %NL80211_ATTR_MLO_TTLM_DLINK and + * %NL80211_ATTR_MLO_TTLM_ULINK attributes are used to specify the + * TID to Link mapping for downlink/uplink traffic. + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -1583,6 +1588,8 @@ enum nl80211_commands { NL80211_CMD_LINKS_REMOVED, + NL80211_CMD_SET_TID_TO_LINK_MAPPING, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -2835,6 +2842,15 @@ enum nl80211_commands { * include BSSes that can only be used in restricted scenarios and/or * cannot be used at all. * + * @NL80211_ATTR_MLO_TTLM_DLINK: Binary attribute specifying the downlink TID to + * link mapping. The length is 8 * sizeof(u16). For each TID the link + * mapping is as defined in section 9.4.2.314 (TID-To-Link Mapping element) + * in Draft P802.11be_D4.0. + * @NL80211_ATTR_MLO_TTLM_ULINK: Binary attribute specifying the uplink TID to + * link mapping. The length is 8 * sizeof(u16). For each TID the link + * mapping is as defined in section 9.4.2.314 (TID-To-Link Mapping element) + * in Draft P802.11be_D4.0. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3375,6 +3391,9 @@ enum nl80211_attrs { NL80211_ATTR_BSS_DUMP_INCLUDE_USE_DATA, + NL80211_ATTR_MLO_TTLM_DLINK, + NL80211_ATTR_MLO_TTLM_ULINK, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, -- cgit v1.2.3 From 42b941cd6738d26abdae2f3fee520b79ea77e2fe Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 11 Dec 2023 09:05:23 +0200 Subject: wifi: mac80211: add a flag to disallow puncturing There may be cases where puncturing isn't possible, and a connection needs to be downgraded. Add a hardware flag to support this. This is likely temporary: it seems we will need to move puncturing to the chandef/channel context. Signed-off-by: Johannes Berg Reviewed-by: Gregory Greenman Signed-off-by: Miri Korenblit Link: https://msgid.link/20231211085121.c1e89ea55e93.I37b8ca0ee64d5d7699e351785a9010afc106da3c@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index aa8e1055fc3a..77a71b1396b1 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2686,6 +2686,9 @@ struct ieee80211_txq { * @IEEE80211_HW_MLO_MCAST_MULTI_LINK_TX: Hardware/driver handles transmitting * multicast frames on all links, mac80211 should not do that. * + * @IEEE80211_HW_DISALLOW_PUNCTURING: HW requires disabling puncturing in EHT + * and connecting with a lower bandwidth instead + * * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays */ enum ieee80211_hw_flags { @@ -2743,6 +2746,7 @@ enum ieee80211_hw_flags { IEEE80211_HW_SUPPORTS_CONC_MON_RX_DECAP, IEEE80211_HW_DETECTS_COLOR_COLLISION, IEEE80211_HW_MLO_MCAST_MULTI_LINK_TX, + IEEE80211_HW_DISALLOW_PUNCTURING, /* keep last, obviously */ NUM_IEEE80211_HW_FLAGS -- cgit v1.2.3 From 82c944d05b1a24c76948ee9d6bb1d7de1ebb8b3a Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Tue, 28 Nov 2023 14:25:30 +0100 Subject: net: wan: Add framer framework support A framer is a component in charge of an E1/T1 line interface. Connected usually to a TDM bus, it converts TDM frames to/from E1/T1 frames. It also provides information related to the E1/T1 line. The framer framework provides a set of APIs for the framer drivers (framer provider) to create/destroy a framer and APIs for the framer users (framer consumer) to obtain a reference to the framer, and use the framer. This basic implementation provides a framer abstraction for: - power on/off the framer - get the framer status (line state) - be notified on framer status changes - get/set the framer configuration Signed-off-by: Herve Codina Reviewed-by: Christophe Leroy Acked-by: Jakub Kicinski Link: https://lore.kernel.org/r/20231128132534.258459-2-herve.codina@bootlin.com Signed-off-by: Linus Walleij --- include/linux/framer/framer-provider.h | 194 +++++++++++++++++++++++++++++++ include/linux/framer/framer.h | 205 +++++++++++++++++++++++++++++++++ 2 files changed, 399 insertions(+) create mode 100644 include/linux/framer/framer-provider.h create mode 100644 include/linux/framer/framer.h (limited to 'include') diff --git a/include/linux/framer/framer-provider.h b/include/linux/framer/framer-provider.h new file mode 100644 index 000000000000..782cd5fc83d5 --- /dev/null +++ b/include/linux/framer/framer-provider.h @@ -0,0 +1,194 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Generic framer profider header file + * + * Copyright 2023 CS GROUP France + * + * Author: Herve Codina + */ + +#ifndef __DRIVERS_PROVIDER_FRAMER_H +#define __DRIVERS_PROVIDER_FRAMER_H + +#include +#include +#include + +#define FRAMER_FLAG_POLL_STATUS BIT(0) + +/** + * struct framer_ops - set of function pointers for performing framer operations + * @init: operation to be performed for initializing the framer + * @exit: operation to be performed while exiting + * @power_on: powering on the framer + * @power_off: powering off the framer + * @flags: OR-ed flags (FRAMER_FLAG_*) to ask for core functionality + * - @FRAMER_FLAG_POLL_STATUS: + * Ask the core to perform a polling to get the framer status and + * notify consumers on change. + * The framer should call @framer_notify_status_change() when it + * detects a status change. This is usually done using interrupts. + * If the framer cannot detect this change, it can ask the core for + * a status polling. The core will call @get_status() periodically + * and, on change detected, it will notify the consumer. + * the @get_status() + * @owner: the module owner containing the ops + */ +struct framer_ops { + int (*init)(struct framer *framer); + void (*exit)(struct framer *framer); + int (*power_on)(struct framer *framer); + int (*power_off)(struct framer *framer); + + /** + * @get_status: + * + * Optional. + * + * Used to get the framer status. framer_init() must have + * been called on the framer. + * + * Returns: 0 if successful, an negative error code otherwise + */ + int (*get_status)(struct framer *framer, struct framer_status *status); + + /** + * @set_config: + * + * Optional. + * + * Used to set the framer configuration. framer_init() must have + * been called on the framer. + * + * Returns: 0 if successful, an negative error code otherwise + */ + int (*set_config)(struct framer *framer, const struct framer_config *config); + + /** + * @get_config: + * + * Optional. + * + * Used to get the framer configuration. framer_init() must have + * been called on the framer. + * + * Returns: 0 if successful, an negative error code otherwise + */ + int (*get_config)(struct framer *framer, struct framer_config *config); + + u32 flags; + struct module *owner; +}; + +/** + * struct framer_provider - represents the framer provider + * @dev: framer provider device + * @children: can be used to override the default (dev->of_node) child node + * @owner: the module owner having of_xlate + * @list: to maintain a linked list of framer providers + * @of_xlate: function pointer to obtain framer instance from framer pointer + */ +struct framer_provider { + struct device *dev; + struct module *owner; + struct list_head list; + struct framer * (*of_xlate)(struct device *dev, + struct of_phandle_args *args); +}; + +static inline void framer_set_drvdata(struct framer *framer, void *data) +{ + dev_set_drvdata(&framer->dev, data); +} + +static inline void *framer_get_drvdata(struct framer *framer) +{ + return dev_get_drvdata(&framer->dev); +} + +#if IS_ENABLED(CONFIG_GENERIC_FRAMER) + +/* Create and destroy a framer */ +struct framer *framer_create(struct device *dev, struct device_node *node, + const struct framer_ops *ops); +void framer_destroy(struct framer *framer); + +/* devm version */ +struct framer *devm_framer_create(struct device *dev, struct device_node *node, + const struct framer_ops *ops); + +struct framer *framer_provider_simple_of_xlate(struct device *dev, + struct of_phandle_args *args); + +struct framer_provider * +__framer_provider_of_register(struct device *dev, struct module *owner, + struct framer *(*of_xlate)(struct device *dev, + struct of_phandle_args *args)); + +void framer_provider_of_unregister(struct framer_provider *framer_provider); + +struct framer_provider * +__devm_framer_provider_of_register(struct device *dev, struct module *owner, + struct framer *(*of_xlate)(struct device *dev, + struct of_phandle_args *args)); + +void framer_notify_status_change(struct framer *framer); + +#else /* IS_ENABLED(CONFIG_GENERIC_FRAMER) */ + +static inline struct framer *framer_create(struct device *dev, struct device_node *node, + const struct framer_ops *ops) +{ + return ERR_PTR(-ENOSYS); +} + +static inline void framer_destroy(struct framer *framer) +{ +} + +/* devm version */ +static inline struct framer *devm_framer_create(struct device *dev, struct device_node *node, + const struct framer_ops *ops) +{ + return ERR_PTR(-ENOSYS); +} + +static inline struct framer *framer_provider_simple_of_xlate(struct device *dev, + struct of_phandle_args *args) +{ + return ERR_PTR(-ENOSYS); +} + +static inline struct framer_provider * +__framer_provider_of_register(struct device *dev, struct module *owner, + struct framer *(*of_xlate)(struct device *dev, + struct of_phandle_args *args)) +{ + return ERR_PTR(-ENOSYS); +} + +void framer_provider_of_unregister(struct framer_provider *framer_provider) +{ +} + +static inline struct framer_provider * +__devm_framer_provider_of_register(struct device *dev, struct module *owner, + struct framer *(*of_xlate)(struct device *dev, + struct of_phandle_args *args)) +{ + return ERR_PTR(-ENOSYS); +} + +void framer_notify_status_change(struct framer *framer) +{ +} + +#endif /* IS_ENABLED(CONFIG_GENERIC_FRAMER) */ + +#define framer_provider_of_register(dev, xlate) \ + __framer_provider_of_register((dev), THIS_MODULE, (xlate)) + +#define devm_framer_provider_of_register(dev, xlate) \ + __devm_framer_provider_of_register((dev), THIS_MODULE, (xlate)) + +#endif /* __DRIVERS_PROVIDER_FRAMER_H */ diff --git a/include/linux/framer/framer.h b/include/linux/framer/framer.h new file mode 100644 index 000000000000..9a9b88962c29 --- /dev/null +++ b/include/linux/framer/framer.h @@ -0,0 +1,205 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Generic framer header file + * + * Copyright 2023 CS GROUP France + * + * Author: Herve Codina + */ + +#ifndef __DRIVERS_FRAMER_H +#define __DRIVERS_FRAMER_H + +#include +#include +#include +#include +#include +#include + +/** + * enum framer_iface - Framer interface + * @FRAMER_IFACE_E1: E1 interface + * @FRAMER_IFACE_T1: T1 interface + */ +enum framer_iface { + FRAMER_IFACE_E1, + FRAMER_IFACE_T1, +}; + +/** + * enum framer_clock_type - Framer clock type + * @FRAMER_CLOCK_EXT: External clock + * @FRAMER_CLOCK_INT: Internal clock + */ +enum framer_clock_type { + FRAMER_CLOCK_EXT, + FRAMER_CLOCK_INT, +}; + +/** + * struct framer_config - Framer configuration + * @iface: Framer line interface + * @clock_type: Framer clock type + * @line_clock_rate: Framer line clock rate + */ +struct framer_config { + enum framer_iface iface; + enum framer_clock_type clock_type; + unsigned long line_clock_rate; +}; + +/** + * struct framer_status - Framer status + * @link_is_on: Framer link state. true, the link is on, false, the link is off. + */ +struct framer_status { + bool link_is_on; +}; + +/** + * enum framer_event - Event available for notification + * @FRAMER_EVENT_STATUS: Event notified on framer_status changes + */ +enum framer_event { + FRAMER_EVENT_STATUS, +}; + +/** + * struct framer - represents the framer device + * @dev: framer device + * @id: id of the framer device + * @ops: function pointers for performing framer operations + * @mutex: mutex to protect framer_ops + * @init_count: used to protect when the framer is used by multiple consumers + * @power_count: used to protect when the framer is used by multiple consumers + * @pwr: power regulator associated with the framer + * @notify_status_work: work structure used for status notifications + * @notifier_list: notifier list used for notifications + * @polling_work: delayed work structure used for the polling task + * @prev_status: previous read status used by the polling task to detect changes + */ +struct framer { + struct device dev; + int id; + const struct framer_ops *ops; + struct mutex mutex; /* Protect framer */ + int init_count; + int power_count; + struct regulator *pwr; + struct work_struct notify_status_work; + struct blocking_notifier_head notifier_list; + struct delayed_work polling_work; + struct framer_status prev_status; +}; + +#if IS_ENABLED(CONFIG_GENERIC_FRAMER) +int framer_pm_runtime_get(struct framer *framer); +int framer_pm_runtime_get_sync(struct framer *framer); +int framer_pm_runtime_put(struct framer *framer); +int framer_pm_runtime_put_sync(struct framer *framer); +int framer_init(struct framer *framer); +int framer_exit(struct framer *framer); +int framer_power_on(struct framer *framer); +int framer_power_off(struct framer *framer); +int framer_get_status(struct framer *framer, struct framer_status *status); +int framer_get_config(struct framer *framer, struct framer_config *config); +int framer_set_config(struct framer *framer, const struct framer_config *config); +int framer_notifier_register(struct framer *framer, struct notifier_block *nb); +int framer_notifier_unregister(struct framer *framer, struct notifier_block *nb); + +struct framer *framer_get(struct device *dev, const char *con_id); +void framer_put(struct device *dev, struct framer *framer); + +struct framer *devm_framer_get(struct device *dev, const char *con_id); +struct framer *devm_framer_optional_get(struct device *dev, const char *con_id); +#else +static inline int framer_pm_runtime_get(struct framer *framer) +{ + return -ENOSYS; +} + +static inline int framer_pm_runtime_get_sync(struct framer *framer) +{ + return -ENOSYS; +} + +static inline int framer_pm_runtime_put(struct framer *framer) +{ + return -ENOSYS; +} + +static inline int framer_pm_runtime_put_sync(struct framer *framer) +{ + return -ENOSYS; +} + +static inline int framer_init(struct framer *framer) +{ + return -ENOSYS; +} + +static inline int framer_exit(struct framer *framer) +{ + return -ENOSYS; +} + +static inline int framer_power_on(struct framer *framer) +{ + return -ENOSYS; +} + +static inline int framer_power_off(struct framer *framer) +{ + return -ENOSYS; +} + +static inline int framer_get_status(struct framer *framer, struct framer_status *status) +{ + return -ENOSYS; +} + +static inline int framer_get_config(struct framer *framer, struct framer_config *config) +{ + return -ENOSYS; +} + +static inline int framer_set_config(struct framer *framer, const struct framer_config *config) +{ + return -ENOSYS; +} + +static inline int framer_notifier_register(struct framer *framer, + struct notifier_block *nb) +{ + return -ENOSYS; +} + +static inline int framer_notifier_unregister(struct framer *framer, + struct notifier_block *nb) +{ + return -ENOSYS; +} + +struct framer *framer_get(struct device *dev, const char *con_id) +{ + return ERR_PTR(-ENOSYS); +} + +void framer_put(struct device *dev, struct framer *framer) +{ +} + +static inline struct framer *devm_framer_get(struct device *dev, const char *con_id) +{ + return ERR_PTR(-ENOSYS); +} + +static inline struct framer *devm_framer_optional_get(struct device *dev, const char *con_id) +{ + return NULL; +} + +#endif + +#endif /* __DRIVERS_FRAMER_H */ -- cgit v1.2.3 From c96e976d9a05d559f4ac4f617ea0f798c75a1799 Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Tue, 28 Nov 2023 14:25:32 +0100 Subject: net: wan: framer: Add support for the Lantiq PEF2256 framer The Lantiq PEF2256 is a framer and line interface component designed to fulfill all required interfacing between an analog E1/T1/J1 line and the digital PCM system highway/H.100 bus. Signed-off-by: Herve Codina Reviewed-by: Christophe Leroy Reviewed-by: Linus Walleij Acked-by: Jakub Kicinski Link: https://lore.kernel.org/r/20231128132534.258459-4-herve.codina@bootlin.com Signed-off-by: Linus Walleij --- include/linux/framer/pef2256.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 include/linux/framer/pef2256.h (limited to 'include') diff --git a/include/linux/framer/pef2256.h b/include/linux/framer/pef2256.h new file mode 100644 index 000000000000..71d80af58c40 --- /dev/null +++ b/include/linux/framer/pef2256.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * PEF2256 consumer API + * + * Copyright 2023 CS GROUP France + * + * Author: Herve Codina + */ +#ifndef __PEF2256_H__ +#define __PEF2256_H__ + +#include + +struct pef2256; +struct regmap; + +/* Retrieve the PEF2256 regmap */ +struct regmap *pef2256_get_regmap(struct pef2256 *pef2256); + +/* PEF2256 hardware versions */ +enum pef2256_version { + PEF2256_VERSION_UNKNOWN, + PEF2256_VERSION_1_2, + PEF2256_VERSION_2_1, + PEF2256_VERSION_2_2, +}; + +/* Get the PEF2256 hardware version */ +enum pef2256_version pef2256_get_version(struct pef2256 *pef2256); + +#endif /* __PEF2256_H__ */ -- cgit v1.2.3 From 4f7aa122bc9219baca0bfface5917062d6c45ee8 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 7 Dec 2023 16:12:04 +0100 Subject: dpll: remove leftover mode_supported() op and use mode_get() instead Mode supported is currently reported to the user exactly the same, as the current mode. That's because mode changing is not implemented. Remove the leftover mode_supported() op and use mode_get() to fill up the supported mode exposed to user. One, if even, mode changing is going to be introduced, this could be very easily taken back. In the meantime, prevent drivers form implementing this in wrong way (as for example recent netdevsim implementation attempt intended to do). Signed-off-by: Jiri Pirko Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/linux/dpll.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/dpll.h b/include/linux/dpll.h index 578fc5fa3750..b1a5f9ca8ee5 100644 --- a/include/linux/dpll.h +++ b/include/linux/dpll.h @@ -17,9 +17,6 @@ struct dpll_pin; struct dpll_device_ops { int (*mode_get)(const struct dpll_device *dpll, void *dpll_priv, enum dpll_mode *mode, struct netlink_ext_ack *extack); - bool (*mode_supported)(const struct dpll_device *dpll, void *dpll_priv, - const enum dpll_mode mode, - struct netlink_ext_ack *extack); int (*lock_status_get)(const struct dpll_device *dpll, void *dpll_priv, enum dpll_lock_status *status, struct netlink_ext_ack *extack); -- cgit v1.2.3 From 750e785796bb72423b97cac21ecd0fa3b3b65610 Mon Sep 17 00:00:00 2001 From: Jie Jiang Date: Tue, 12 Dec 2023 09:39:23 +0000 Subject: bpf: Support uid and gid when mounting bpffs Parse uid and gid in bpf_parse_param() so that they can be passed in as the `data` parameter when mount() bpffs. This will be useful when we want to control which user/group has the control to the mounted bpffs, otherwise a separate chown() call will be needed. Signed-off-by: Jie Jiang Signed-off-by: Andrii Nakryiko Acked-by: Mike Frysinger Acked-by: Christian Brauner Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20231212093923.497838-1-jiejiang@chromium.org --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0bd4889e917a..c87c608a3689 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1595,6 +1595,8 @@ struct bpf_link_primer { }; struct bpf_mount_opts { + kuid_t uid; + kgid_t gid; umode_t mode; /* BPF token-related delegation options */ -- cgit v1.2.3 From 0e6a7b09597011985d7aad3b747c43e9b2a43555 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Tue, 5 Dec 2023 22:08:35 +0100 Subject: ice: Support RX hash XDP hint RX hash XDP hint requests both hash value and type. Type is XDP-specific, so we need a separate way to map these values to the hardware ptypes, so create a lookup table. Instead of creating a new long list, reuse contents of ice_decode_rx_desc_ptype[] through preprocessor. Current hash type enum does not contain ICMP packet type, but ice devices support it, so also add a new type into core code. Then use previously refactored code and create a function that allows XDP code to read RX hash. Signed-off-by: Larysa Zaremba Link: https://lore.kernel.org/r/20231205210847.28460-7-larysa.zaremba@intel.com Signed-off-by: Alexei Starovoitov --- include/net/xdp.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/xdp.h b/include/net/xdp.h index 5d3673afc037..b7d6fe61381f 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -432,6 +432,7 @@ enum xdp_rss_hash_type { XDP_RSS_L4_UDP = BIT(5), XDP_RSS_L4_SCTP = BIT(6), XDP_RSS_L4_IPSEC = BIT(7), /* L4 based hash include IPSEC SPI */ + XDP_RSS_L4_ICMP = BIT(8), /* Second part: RSS hash type combinations used for driver HW mapping */ XDP_RSS_TYPE_NONE = 0, @@ -447,11 +448,13 @@ enum xdp_rss_hash_type { XDP_RSS_TYPE_L4_IPV4_UDP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_UDP, XDP_RSS_TYPE_L4_IPV4_SCTP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_SCTP, XDP_RSS_TYPE_L4_IPV4_IPSEC = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_IPSEC, + XDP_RSS_TYPE_L4_IPV4_ICMP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_ICMP, XDP_RSS_TYPE_L4_IPV6_TCP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_TCP, XDP_RSS_TYPE_L4_IPV6_UDP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_UDP, XDP_RSS_TYPE_L4_IPV6_SCTP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_SCTP, XDP_RSS_TYPE_L4_IPV6_IPSEC = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_IPSEC, + XDP_RSS_TYPE_L4_IPV6_ICMP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_ICMP, XDP_RSS_TYPE_L4_IPV6_TCP_EX = XDP_RSS_TYPE_L4_IPV6_TCP | XDP_RSS_L3_DYNHDR, XDP_RSS_TYPE_L4_IPV6_UDP_EX = XDP_RSS_TYPE_L4_IPV6_UDP | XDP_RSS_L3_DYNHDR, -- cgit v1.2.3 From b4e352ff1169ebce930c734630f9587b1677d163 Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Tue, 5 Dec 2023 22:08:36 +0100 Subject: xsk: add functions to fill control buffer Commit 94ecc5ca4dbf ("xsk: Add cb area to struct xdp_buff_xsk") has added a buffer for custom data to xdp_buff_xsk. Particularly, this memory is used for data, consumed by XDP hints kfuncs. It does not always change on a per-packet basis and some parts can be set for example, at the same time as RX queue info. Add functions to fill all cbs in xsk_buff_pool with the same metadata. Signed-off-by: Maciej Fijalkowski Signed-off-by: Larysa Zaremba Acked-by: Magnus Karlsson Link: https://lore.kernel.org/r/20231205210847.28460-8-larysa.zaremba@intel.com Signed-off-by: Alexei Starovoitov --- include/net/xdp_sock_drv.h | 17 +++++++++++++++++ include/net/xsk_buff_pool.h | 2 ++ 2 files changed, 19 insertions(+) (limited to 'include') diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 81e02de3f453..b62bb8525a5f 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -14,6 +14,12 @@ #ifdef CONFIG_XDP_SOCKETS +struct xsk_cb_desc { + void *src; + u8 off; + u8 bytes; +}; + void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries); bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc); u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max); @@ -47,6 +53,12 @@ static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool, xp_set_rxq_info(pool, rxq); } +static inline void xsk_pool_fill_cb(struct xsk_buff_pool *pool, + struct xsk_cb_desc *desc) +{ + xp_fill_cb(pool, desc); +} + static inline unsigned int xsk_pool_get_napi_id(struct xsk_buff_pool *pool) { #ifdef CONFIG_NET_RX_BUSY_POLL @@ -274,6 +286,11 @@ static inline void xsk_pool_set_rxq_info(struct xsk_buff_pool *pool, { } +static inline void xsk_pool_fill_cb(struct xsk_buff_pool *pool, + struct xsk_cb_desc *desc) +{ +} + static inline unsigned int xsk_pool_get_napi_id(struct xsk_buff_pool *pool) { return 0; diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index 8d48d37ab7c0..99dd7376df6a 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -12,6 +12,7 @@ struct xsk_buff_pool; struct xdp_rxq_info; +struct xsk_cb_desc; struct xsk_queue; struct xdp_desc; struct xdp_umem; @@ -135,6 +136,7 @@ static inline void xp_init_xskb_dma(struct xdp_buff_xsk *xskb, struct xsk_buff_p /* AF_XDP ZC drivers, via xdp_sock_buff.h */ void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq); +void xp_fill_cb(struct xsk_buff_pool *pool, struct xsk_cb_desc *desc); int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev, unsigned long attrs, struct page **pages, u32 nr_pages); void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs); -- cgit v1.2.3 From e6795330f88b4f643c649a02662d47b779340535 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Tue, 5 Dec 2023 22:08:38 +0100 Subject: xdp: Add VLAN tag hint Implement functionality that enables drivers to expose VLAN tag to XDP code. VLAN tag is represented by 2 variables: - protocol ID, which is passed to bpf code in BE - VLAN TCI, in host byte order Acked-by: Stanislav Fomichev Signed-off-by: Larysa Zaremba Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/r/20231205210847.28460-10-larysa.zaremba@intel.com Signed-off-by: Alexei Starovoitov --- include/net/xdp.h | 6 ++++++ include/uapi/linux/netdev.h | 3 +++ 2 files changed, 9 insertions(+) (limited to 'include') diff --git a/include/net/xdp.h b/include/net/xdp.h index b7d6fe61381f..8cd04a74dba5 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -404,6 +404,10 @@ void xdp_attachment_setup(struct xdp_attachment_info *info, NETDEV_XDP_RX_METADATA_HASH, \ bpf_xdp_metadata_rx_hash, \ xmo_rx_hash) \ + XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_VLAN_TAG, \ + NETDEV_XDP_RX_METADATA_VLAN_TAG, \ + bpf_xdp_metadata_rx_vlan_tag, \ + xmo_rx_vlan_tag) \ enum xdp_rx_metadata { #define XDP_METADATA_KFUNC(name, _, __, ___) name, @@ -465,6 +469,8 @@ struct xdp_metadata_ops { int (*xmo_rx_timestamp)(const struct xdp_md *ctx, u64 *timestamp); int (*xmo_rx_hash)(const struct xdp_md *ctx, u32 *hash, enum xdp_rss_hash_type *rss_type); + int (*xmo_rx_vlan_tag)(const struct xdp_md *ctx, __be16 *vlan_proto, + u16 *vlan_tci); }; #ifdef CONFIG_NET diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 6244c0164976..966638b08ccf 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -44,10 +44,13 @@ enum netdev_xdp_act { * timestamp via bpf_xdp_metadata_rx_timestamp(). * @NETDEV_XDP_RX_METADATA_HASH: Device is capable of exposing receive packet * hash via bpf_xdp_metadata_rx_hash(). + * @NETDEV_XDP_RX_METADATA_VLAN_TAG: Device is capable of exposing receive + * packet VLAN tag via bpf_xdp_metadata_rx_vlan_tag(). */ enum netdev_xdp_rx_metadata { NETDEV_XDP_RX_METADATA_TIMESTAMP = 1, NETDEV_XDP_RX_METADATA_HASH = 2, + NETDEV_XDP_RX_METADATA_VLAN_TAG = 4, }; /** -- cgit v1.2.3 From 537fec0733c4a72e2a2b69fee365459c5b75d92e Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Tue, 5 Dec 2023 22:08:42 +0100 Subject: net: make vlan_get_tag() return -ENODATA instead of -EINVAL __vlan_hwaccel_get_tag() is used in veth XDP hints implementation, its return value (-EINVAL if skb is not VLAN tagged) is passed to bpf code, but XDP hints specification requires drivers to return -ENODATA, if a hint cannot be provided for a particular packet. Solve this inconsistency by changing error return value of __vlan_hwaccel_get_tag() from -EINVAL to -ENODATA, do the same thing to __vlan_get_tag(), because this function is supposed to follow the same convention. This, in turn, makes -ENODATA the only non-zero value vlan_get_tag() can return. We can do this with no side effects, because none of the users of the 3 above-mentioned functions rely on the exact value. Suggested-by: Jesper Dangaard Brouer Acked-by: Stanislav Fomichev Signed-off-by: Larysa Zaremba Link: https://lore.kernel.org/r/20231205210847.28460-14-larysa.zaremba@intel.com Signed-off-by: Alexei Starovoitov --- include/linux/if_vlan.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 3028af87716e..c1645c86eed9 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -540,7 +540,7 @@ static inline int __vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) struct vlan_ethhdr *veth = skb_vlan_eth_hdr(skb); if (!eth_type_vlan(veth->h_vlan_proto)) - return -EINVAL; + return -ENODATA; *vlan_tci = ntohs(veth->h_vlan_TCI); return 0; @@ -561,7 +561,7 @@ static inline int __vlan_hwaccel_get_tag(const struct sk_buff *skb, return 0; } else { *vlan_tci = 0; - return -EINVAL; + return -ENODATA; } } -- cgit v1.2.3 From 7978bad4b6b9265a1e808a5f679ee428d1dd6523 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Tue, 5 Dec 2023 22:08:43 +0100 Subject: mlx5: implement VLAN tag XDP hint Implement the newly added .xmo_rx_vlan_tag() hint function. Reviewed-by: Tariq Toukan Signed-off-by: Larysa Zaremba Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/r/20231205210847.28460-15-larysa.zaremba@intel.com Signed-off-by: Alexei Starovoitov --- include/linux/mlx5/device.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 820bca965fb6..01275c6e8468 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -918,7 +918,7 @@ static inline u8 get_cqe_tls_offload(struct mlx5_cqe64 *cqe) return (cqe->tls_outer_l3_tunneled >> 3) & 0x3; } -static inline bool cqe_has_vlan(struct mlx5_cqe64 *cqe) +static inline bool cqe_has_vlan(const struct mlx5_cqe64 *cqe) { return cqe->l4_l3_hdr_type & 0x1; } -- cgit v1.2.3 From 1dd7f18fc0ed75dad4d5f2ecc84f69c6b62b6a81 Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Mon, 11 Dec 2023 15:18:07 -0300 Subject: net/sched: act_api: skip idr replace on bound actions tcf_idr_insert_many will replace the allocated -EBUSY pointer in tcf_idr_check_alloc with the real action pointer, exposing it to all operations. This operation is only needed when the action pointer is created (ACT_P_CREATED). For actions which are bound to (returned 0), the pointer already resides in the idr making such operation a nop. Even though it's a nop, it's still not a cheap operation as internally the idr code walks the idr and then does a replace on the appropriate slot. So if the action was bound, better skip the idr replace entirely. Signed-off-by: Pedro Tammela Acked-by: Jamal Hadi Salim Reviewed-by: Vlad Buslov Link: https://lore.kernel.org/r/20231211181807.96028-3-pctammela@mojatatu.com Signed-off-by: Jakub Kicinski --- include/net/act_api.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/act_api.h b/include/net/act_api.h index 4ae0580b63ca..ea13e1e4a7c2 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -191,7 +191,7 @@ int tcf_idr_create_from_flags(struct tc_action_net *tn, u32 index, struct nlattr *est, struct tc_action **a, const struct tc_action_ops *ops, int bind, u32 flags); -void tcf_idr_insert_many(struct tc_action *actions[]); +void tcf_idr_insert_many(struct tc_action *actions[], int init_res[]); void tcf_idr_cleanup(struct tc_action_net *tn, u32 index); int tcf_idr_check_alloc(struct tc_action_net *tn, u32 *index, struct tc_action **a, int bind); -- cgit v1.2.3 From 0c476157085fe2ad13b9bec70ea672e86647fa1a Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Tue, 12 Dec 2023 06:41:43 +0100 Subject: net: phy: c45: add genphy_c45_pma_read_ext_abilities() function Move part of the genphy_c45_pma_read_abilities() code to a separate function. Some PHYs do not implement PMA/PMD status 2 register (Register 1.8) but do implement PMA/PMD extended ability register (Register 1.11). To make use of it, we need to be able to access this part of code separately. Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Reviewed-by: Russell King (Oracle) Link: https://lore.kernel.org/r/20231212054144.87527-2-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 6e7ebcc50b85..dbb5e13e3e1b 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1866,6 +1866,7 @@ int genphy_c45_an_config_aneg(struct phy_device *phydev); int genphy_c45_an_disable_aneg(struct phy_device *phydev); int genphy_c45_read_mdix(struct phy_device *phydev); int genphy_c45_pma_read_abilities(struct phy_device *phydev); +int genphy_c45_pma_read_ext_abilities(struct phy_device *phydev); int genphy_c45_pma_baset1_read_abilities(struct phy_device *phydev); int genphy_c45_read_eee_abilities(struct phy_device *phydev); int genphy_c45_pma_baset1_read_master_slave(struct phy_device *phydev); -- cgit v1.2.3 From 13049408a4bd29c92227ca2d6befab80dbb96663 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Sun, 7 May 2023 16:47:42 +0300 Subject: net/mlx5: Add mlx5_ifc bits used for supporting single netdev Socket-Direct Multiple device caps and features are required to support single netdev Socket-Direct. Add them here in preparation for the feature implementation. Signed-off-by: Tariq Toukan Reviewed-by: Gal Pressman Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index ce2e71cd6d2a..405d141b4a08 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -435,7 +435,7 @@ struct mlx5_ifc_flow_table_prop_layout_bits { u8 flow_table_modify[0x1]; u8 reformat[0x1]; u8 decap[0x1]; - u8 reserved_at_9[0x1]; + u8 reset_root_to_default[0x1]; u8 pop_vlan[0x1]; u8 push_vlan[0x1]; u8 reserved_at_c[0x1]; @@ -1801,7 +1801,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 disable_local_lb_uc[0x1]; u8 disable_local_lb_mc[0x1]; u8 log_min_hairpin_wq_data_sz[0x5]; - u8 reserved_at_3e8[0x2]; + u8 reserved_at_3e8[0x1]; + u8 silent_mode[0x1]; u8 vhca_state[0x1]; u8 log_max_vlan_list[0x5]; u8 reserved_at_3f0[0x3]; @@ -1818,7 +1819,7 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_460[0x1]; u8 ats[0x1]; - u8 reserved_at_462[0x1]; + u8 cross_vhca_rqt[0x1]; u8 log_max_uctx[0x5]; u8 reserved_at_468[0x1]; u8 crypto[0x1]; @@ -1943,6 +1944,7 @@ struct mlx5_ifc_cmd_hca_cap_bits { enum { MLX5_CROSS_VHCA_OBJ_TO_OBJ_SUPPORTED_LOCAL_FLOW_TABLE_TO_REMOTE_FLOW_TABLE_MISS = 0x80000, + MLX5_CROSS_VHCA_OBJ_TO_OBJ_SUPPORTED_LOCAL_FLOW_TABLE_ROOT_TO_REMOTE_FLOW_TABLE = (1ULL << 20), }; enum { @@ -1992,7 +1994,11 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 reserved_at_260[0x120]; u8 reserved_at_380[0x10]; u8 ec_vf_vport_base[0x10]; - u8 reserved_at_3a0[0x460]; + + u8 reserved_at_3a0[0x10]; + u8 max_rqt_vhca_id[0x10]; + + u8 reserved_at_3c0[0x440]; }; enum mlx5_ifc_flow_destination_type { @@ -2151,6 +2157,13 @@ struct mlx5_ifc_rq_num_bits { u8 rq_num[0x18]; }; +struct mlx5_ifc_rq_vhca_bits { + u8 reserved_at_0[0x8]; + u8 rq_num[0x18]; + u8 reserved_at_20[0x10]; + u8 rq_vhca_id[0x10]; +}; + struct mlx5_ifc_mac_address_layout_bits { u8 reserved_at_0[0x10]; u8 mac_addr_47_32[0x10]; @@ -3901,7 +3914,10 @@ struct mlx5_ifc_rqtc_bits { u8 reserved_at_e0[0x6a0]; - struct mlx5_ifc_rq_num_bits rq_num[]; + union { + DECLARE_FLEX_ARRAY(struct mlx5_ifc_rq_num_bits, rq_num); + DECLARE_FLEX_ARRAY(struct mlx5_ifc_rq_vhca_bits, rq_vhca); + }; }; enum { @@ -4744,7 +4760,10 @@ struct mlx5_ifc_set_l2_table_entry_in_bits { u8 reserved_at_c0[0x20]; - u8 reserved_at_e0[0x13]; + u8 reserved_at_e0[0x10]; + u8 silent_mode_valid[0x1]; + u8 silent_mode[0x1]; + u8 reserved_at_f2[0x1]; u8 vlan_valid[0x1]; u8 vlan[0xc]; -- cgit v1.2.3 From f5e956329960903d908668d7a20bbc08e0a8b92b Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Mon, 7 Aug 2023 09:05:34 +0300 Subject: net/mlx5: Expose Management PCIe Index Register (MPIR) MPIR register allows to query the PCIe indexes and Socket-Direct related parameters. Signed-off-by: Tariq Toukan Reviewed-by: Gal Pressman Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 1 + include/linux/mlx5/mlx5_ifc.h | 14 ++++++++++++++ 2 files changed, 15 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index d2b8d4a74a30..2f67cec1a898 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -150,6 +150,7 @@ enum { MLX5_REG_MTPPSE = 0x9054, MLX5_REG_MTUTC = 0x9055, MLX5_REG_MPEGC = 0x9056, + MLX5_REG_MPIR = 0x9059, MLX5_REG_MCQS = 0x9060, MLX5_REG_MCQI = 0x9061, MLX5_REG_MCC = 0x9062, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 405d141b4a08..828938368fb7 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -10108,6 +10108,20 @@ struct mlx5_ifc_mpegc_reg_bits { u8 reserved_at_60[0x100]; }; +struct mlx5_ifc_mpir_reg_bits { + u8 sdm[0x1]; + u8 reserved_at_1[0x1b]; + u8 host_buses[0x4]; + + u8 reserved_at_20[0x20]; + + u8 local_port[0x8]; + u8 reserved_at_28[0x15]; + u8 sd_group[0x3]; + + u8 reserved_at_60[0x20]; +}; + enum { MLX5_MTUTC_FREQ_ADJ_UNITS_PPB = 0x0, MLX5_MTUTC_FREQ_ADJ_UNITS_SCALED_PPM = 0x1, -- cgit v1.2.3 From b25bd37c859f32e50a436ab9d2078b76e433008e Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Sun, 6 Aug 2023 14:01:10 +0300 Subject: net/mlx5: Move TISes from priv to mdev HW resources The transport interface send (TIS) object is responsible for performing all transport related operations of the transmit side. Messages from Send Queues get segmented and transmitted by the TIS including all transport required implications, e.g. in the case of large send offload, the TIS is responsible for the segmentation. These are stateless objects and can be used by multiple netdevs (e.g. representors) who share the same core device. Providing the TISes as a service from the core layer to the netdev layer reduces the number of replecated TIS objects (in case of multiple netdevs), and will ease the transition to netdev with multiple mdevs. Signed-off-by: Tariq Toukan Reviewed-by: Gal Pressman Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 2f67cec1a898..7ee5b79ff3d6 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -679,6 +679,8 @@ struct mlx5e_resources { struct mlx5_td td; u32 mkey; struct mlx5_sq_bfreg bfreg; +#define MLX5_MAX_NUM_TC 8 + u32 tisn[MLX5_MAX_PORTS][MLX5_MAX_NUM_TC]; } hw_objs; struct net_device *uplink_netdev; struct mutex uplink_netdev_lock; -- cgit v1.2.3 From 50d73710715de7d1a2c88194562f520816af9c2a Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 12 Dec 2023 15:27:51 +0100 Subject: ethtool: add SET for TCP_DATA_SPLIT ringparam Follow up commit 9690ae604290 ("ethtool: add header/data split indication") and add the set part of Ethtool's header split, i.e. ability to enable/disable header split via the Ethtool Netlink interface. This might be helpful to optimize the setup for particular workloads, for example, to avoid XDP frags, and so on. A driver should advertise ``ETHTOOL_RING_USE_TCP_DATA_SPLIT`` in its ops->supported_ring_params to allow doing that. "Unknown" passed from the userspace when the header split is supported means the driver is free to choose the preferred state. Reviewed-by: Przemek Kitszel Signed-off-by: Alexander Lobakin Link: https://lore.kernel.org/r/20231212142752.935000-2-aleksander.lobakin@intel.com Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index deb683d3360f..67b30940234b 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -95,6 +95,7 @@ struct kernel_ethtool_ringparam { * @ETHTOOL_RING_USE_TX_PUSH: capture for setting tx_push * @ETHTOOL_RING_USE_RX_PUSH: capture for setting rx_push * @ETHTOOL_RING_USE_TX_PUSH_BUF_LEN: capture for setting tx_push_buf_len + * @ETHTOOL_RING_USE_TCP_DATA_SPLIT: capture for setting tcp_data_split */ enum ethtool_supported_ring_param { ETHTOOL_RING_USE_RX_BUF_LEN = BIT(0), @@ -102,6 +103,7 @@ enum ethtool_supported_ring_param { ETHTOOL_RING_USE_TX_PUSH = BIT(2), ETHTOOL_RING_USE_RX_PUSH = BIT(3), ETHTOOL_RING_USE_TX_PUSH_BUF_LEN = BIT(4), + ETHTOOL_RING_USE_TCP_DATA_SPLIT = BIT(5), }; #define __ETH_RSS_HASH_BIT(bit) ((u32)1 << (bit)) -- cgit v1.2.3 From 0a149ab78ee220c75eef797abea7a29f4490e226 Mon Sep 17 00:00:00 2001 From: Liang Chen Date: Tue, 12 Dec 2023 12:46:11 +0800 Subject: page_pool: transition to reference count management after page draining To support multiple users referencing the same fragment, 'pp_frag_count' is renamed to 'pp_ref_count', transitioning pp pages from fragment management to reference count management after draining based on the suggestion from [1]. The idea is that the concept of fragmenting exists before the page is drained, and all related functions retain their current names. However, once the page is drained, its management shifts to being governed by 'pp_ref_count'. Therefore, all functions associated with that lifecycle stage of a pp page are renamed. [1] http://lore.kernel.org/netdev/f71d9448-70c8-8793-dc9a-0eb48a570300@huawei.com Signed-off-by: Liang Chen Reviewed-by: Yunsheng Lin Reviewed-by: Ilias Apalodimas Reviewed-by: Mina Almasry Link: https://lore.kernel.org/r/20231212044614.42733-2-liangchen.linux@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/mm_types.h | 2 +- include/net/page_pool/helpers.h | 60 +++++++++++++++++++++++------------------ include/net/page_pool/types.h | 6 ++--- 3 files changed, 38 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 957ce38768b2..64e4572ef06d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -125,7 +125,7 @@ struct page { struct page_pool *pp; unsigned long _pp_mapping_pad; unsigned long dma_addr; - atomic_long_t pp_frag_count; + atomic_long_t pp_ref_count; }; struct { /* Tail pages of compound page */ unsigned long compound_head; /* Bit zero is set */ diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h index 7dc65774cde5..841e0a930bd7 100644 --- a/include/net/page_pool/helpers.h +++ b/include/net/page_pool/helpers.h @@ -29,7 +29,7 @@ * page allocated from page pool. Page splitting enables memory saving and thus * avoids TLB/cache miss for data access, but there also is some cost to * implement page splitting, mainly some cache line dirtying/bouncing for - * 'struct page' and atomic operation for page->pp_frag_count. + * 'struct page' and atomic operation for page->pp_ref_count. * * The API keeps track of in-flight pages, in order to let API users know when * it is safe to free a page_pool object, the API users must call @@ -210,69 +210,77 @@ inline enum dma_data_direction page_pool_get_dma_dir(struct page_pool *pool) return pool->p.dma_dir; } -/* pp_frag_count represents the number of writers who can update the page - * either by updating skb->data or via DMA mappings for the device. - * We can't rely on the page refcnt for that as we don't know who might be - * holding page references and we can't reliably destroy or sync DMA mappings - * of the fragments. +/** + * page_pool_fragment_page() - split a fresh page into fragments + * @page: page to split + * @nr: references to set + * + * pp_ref_count represents the number of outstanding references to the page, + * which will be freed using page_pool APIs (rather than page allocator APIs + * like put_page()). Such references are usually held by page_pool-aware + * objects like skbs marked for page pool recycling. * - * When pp_frag_count reaches 0 we can either recycle the page if the page - * refcnt is 1 or return it back to the memory allocator and destroy any - * mappings we have. + * This helper allows the caller to take (set) multiple references to a + * freshly allocated page. The page must be freshly allocated (have a + * pp_ref_count of 1). This is commonly done by drivers and + * "fragment allocators" to save atomic operations - either when they know + * upfront how many references they will need; or to take MAX references and + * return the unused ones with a single atomic dec(), instead of performing + * multiple atomic inc() operations. */ static inline void page_pool_fragment_page(struct page *page, long nr) { - atomic_long_set(&page->pp_frag_count, nr); + atomic_long_set(&page->pp_ref_count, nr); } -static inline long page_pool_defrag_page(struct page *page, long nr) +static inline long page_pool_unref_page(struct page *page, long nr) { long ret; - /* If nr == pp_frag_count then we have cleared all remaining + /* If nr == pp_ref_count then we have cleared all remaining * references to the page: * 1. 'n == 1': no need to actually overwrite it. * 2. 'n != 1': overwrite it with one, which is the rare case - * for pp_frag_count draining. + * for pp_ref_count draining. * * The main advantage to doing this is that not only we avoid a atomic * update, as an atomic_read is generally a much cheaper operation than * an atomic update, especially when dealing with a page that may be - * partitioned into only 2 or 3 pieces; but also unify the pp_frag_count + * referenced by only 2 or 3 users; but also unify the pp_ref_count * handling by ensuring all pages have partitioned into only 1 piece * initially, and only overwrite it when the page is partitioned into * more than one piece. */ - if (atomic_long_read(&page->pp_frag_count) == nr) { + if (atomic_long_read(&page->pp_ref_count) == nr) { /* As we have ensured nr is always one for constant case using * the BUILD_BUG_ON(), only need to handle the non-constant case - * here for pp_frag_count draining, which is a rare case. + * here for pp_ref_count draining, which is a rare case. */ BUILD_BUG_ON(__builtin_constant_p(nr) && nr != 1); if (!__builtin_constant_p(nr)) - atomic_long_set(&page->pp_frag_count, 1); + atomic_long_set(&page->pp_ref_count, 1); return 0; } - ret = atomic_long_sub_return(nr, &page->pp_frag_count); + ret = atomic_long_sub_return(nr, &page->pp_ref_count); WARN_ON(ret < 0); - /* We are the last user here too, reset pp_frag_count back to 1 to + /* We are the last user here too, reset pp_ref_count back to 1 to * ensure all pages have been partitioned into 1 piece initially, * this should be the rare case when the last two fragment users call - * page_pool_defrag_page() currently. + * page_pool_unref_page() currently. */ if (unlikely(!ret)) - atomic_long_set(&page->pp_frag_count, 1); + atomic_long_set(&page->pp_ref_count, 1); return ret; } -static inline bool page_pool_is_last_frag(struct page *page) +static inline bool page_pool_is_last_ref(struct page *page) { - /* If page_pool_defrag_page() returns 0, we were the last user */ - return page_pool_defrag_page(page, 1) == 0; + /* If page_pool_unref_page() returns 0, we were the last user */ + return page_pool_unref_page(page, 1) == 0; } /** @@ -297,10 +305,10 @@ static inline void page_pool_put_page(struct page_pool *pool, * allow registering MEM_TYPE_PAGE_POOL, but shield linker. */ #ifdef CONFIG_PAGE_POOL - if (!page_pool_is_last_frag(page)) + if (!page_pool_is_last_ref(page)) return; - page_pool_put_defragged_page(pool, page, dma_sync_size, allow_direct); + page_pool_put_unrefed_page(pool, page, dma_sync_size, allow_direct); #endif } diff --git a/include/net/page_pool/types.h b/include/net/page_pool/types.h index ac286ea8ce2d..76481c465375 100644 --- a/include/net/page_pool/types.h +++ b/include/net/page_pool/types.h @@ -234,9 +234,9 @@ static inline void page_pool_put_page_bulk(struct page_pool *pool, void **data, } #endif -void page_pool_put_defragged_page(struct page_pool *pool, struct page *page, - unsigned int dma_sync_size, - bool allow_direct); +void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, + bool allow_direct); static inline bool is_page_pool_compiled_in(void) { -- cgit v1.2.3 From fb6e30a72539ce28c1323aef4190d35aac106f6f Mon Sep 17 00:00:00 2001 From: Ahmed Zaki Date: Tue, 12 Dec 2023 17:33:14 -0700 Subject: net: ethtool: pass a pointer to parameters to get/set_rxfh ethtool ops The get/set_rxfh ethtool ops currently takes the rxfh (RSS) parameters as direct function arguments. This will force us to change the API (and all drivers' functions) every time some new parameters are added. This is part 1/2 of the fix, as suggested in [1]: - First simplify the code by always providing a pointer to all params (indir, key and func); the fact that some of them may be NULL seems like a weird historic thing or a premature optimization. It will simplify the drivers if all pointers are always present. - Then make the functions take a dev pointer, and a pointer to a single struct wrapping all arguments. The set_* should also take an extack. Link: https://lore.kernel.org/netdev/20231121152906.2dd5f487@kernel.org/ [1] Suggested-by: Jakub Kicinski Suggested-by: Jacob Keller Signed-off-by: Ahmed Zaki Link: https://lore.kernel.org/r/20231213003321.605376-2-ahmed.zaki@intel.com Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 38 ++++++++++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 67b30940234b..3ab2b6a90419 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -596,6 +596,28 @@ struct ethtool_mm_stats { u64 MACMergeHoldCount; }; +/** + * struct ethtool_rxfh_param - RXFH (RSS) parameters + * @hfunc: Defines the current RSS hash function used by HW (or to be set to). + * Valid values are one of the %ETH_RSS_HASH_*. + * @indir_size: On SET, the array size of the user buffer for the + * indirection table, which may be zero, or + * %ETH_RXFH_INDIR_NO_CHANGE. On GET (read from the driver), + * the array size of the hardware indirection table. + * @indir: The indirection table of size @indir_size entries. + * @key_size: On SET, the array size of the user buffer for the hash key, + * which may be zero. On GET (read from the driver), the size of the + * hardware hash key. + * @key: The hash key of size @key_size bytes. + */ +struct ethtool_rxfh_param { + u8 hfunc; + u32 indir_size; + u32 *indir; + u32 key_size; + u8 *key; +}; + /** * struct ethtool_ops - optional netdev operations * @cap_link_lanes_supported: indicates if the driver supports lanes @@ -846,14 +868,14 @@ struct ethtool_ops { int (*reset)(struct net_device *, u32 *); u32 (*get_rxfh_key_size)(struct net_device *); u32 (*get_rxfh_indir_size)(struct net_device *); - int (*get_rxfh)(struct net_device *, u32 *indir, u8 *key, - u8 *hfunc); - int (*set_rxfh)(struct net_device *, const u32 *indir, - const u8 *key, const u8 hfunc); - int (*get_rxfh_context)(struct net_device *, u32 *indir, u8 *key, - u8 *hfunc, u32 rss_context); - int (*set_rxfh_context)(struct net_device *, const u32 *indir, - const u8 *key, const u8 hfunc, + int (*get_rxfh)(struct net_device *, struct ethtool_rxfh_param *); + int (*set_rxfh)(struct net_device *, struct ethtool_rxfh_param *, + struct netlink_ext_ack *extack); + int (*get_rxfh_context)(struct net_device *, + struct ethtool_rxfh_param *, + u32 rss_context); + int (*set_rxfh_context)(struct net_device *, + struct ethtool_rxfh_param *, u32 *rss_context, bool delete); void (*get_channels)(struct net_device *, struct ethtool_channels *); int (*set_channels)(struct net_device *, struct ethtool_channels *); -- cgit v1.2.3 From dcd8dbf9e734eb334113ea43186c1c26e9f497bb Mon Sep 17 00:00:00 2001 From: Ahmed Zaki Date: Tue, 12 Dec 2023 17:33:15 -0700 Subject: net: ethtool: get rid of get/set_rxfh_context functions Add the RSS context parameters to struct ethtool_rxfh_param and use the get/set_rxfh to handle the RSS contexts as well. This is part 2/2 of the fix suggested in [1]: - Add a rss_context member to the argument struct and a capability like cap_link_lanes_supported to indicate whether driver supports rss contexts, then you can remove *et_rxfh_context functions, and instead call *et_rxfh() with a non-zero rss_context. Link: https://lore.kernel.org/netdev/20231121152906.2dd5f487@kernel.org/ [1] CC: Jesse Brandeburg CC: Tony Nguyen CC: Marcin Wojtas CC: Russell King CC: Sunil Goutham CC: Geetha sowjanya CC: Subbaraya Sundeep CC: hariprasad CC: Saeed Mahameed CC: Leon Romanovsky CC: Edward Cree CC: Martin Habets Suggested-by: Jakub Kicinski Signed-off-by: Ahmed Zaki Link: https://lore.kernel.org/r/20231213003321.605376-3-ahmed.zaki@intel.com Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 3ab2b6a90419..66fe254c3e51 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -609,6 +609,12 @@ struct ethtool_mm_stats { * which may be zero. On GET (read from the driver), the size of the * hardware hash key. * @key: The hash key of size @key_size bytes. + * @rss_context: RSS context identifier. Context 0 is the default for normal + * traffic; other contexts can be referenced as the destination for RX flow + * classification rules. On SET, %ETH_RXFH_CONTEXT_ALLOC is used + * to allocate a new RSS context; on return this field will + * contain the ID of the newly allocated context. + * @rss_delete: Set to non-ZERO to remove the @rss_context context. */ struct ethtool_rxfh_param { u8 hfunc; @@ -616,12 +622,16 @@ struct ethtool_rxfh_param { u32 *indir; u32 key_size; u8 *key; + u32 rss_context; + u8 rss_delete; }; /** * struct ethtool_ops - optional netdev operations * @cap_link_lanes_supported: indicates if the driver supports lanes * parameter. + * @cap_rss_ctx_supported: indicates if the driver supports RSS + * contexts. * @supported_coalesce_params: supported types of interrupt coalescing. * @supported_ring_params: supported ring params. * @get_drvinfo: Report driver/device information. Modern drivers no @@ -718,15 +728,6 @@ struct ethtool_rxfh_param { * will remain unchanged. * Returns a negative error code or zero. An error code must be returned * if at least one unsupported change was requested. - * @get_rxfh_context: Get the contents of the RX flow hash indirection table, - * hash key, and/or hash function assiciated to the given rss context. - * Returns a negative error code or zero. - * @set_rxfh_context: Create, remove and configure RSS contexts. Allows setting - * the contents of the RX flow hash indirection table, hash key, and/or - * hash function associated to the given context. Arguments which are set - * to %NULL or zero will remain unchanged. - * Returns a negative error code or zero. An error code must be returned - * if at least one unsupported change was requested. * @get_channels: Get number of channels. * @set_channels: Set number of channels. Returns a negative error code or * zero. @@ -809,6 +810,7 @@ struct ethtool_rxfh_param { */ struct ethtool_ops { u32 cap_link_lanes_supported:1; + u32 cap_rss_ctx_supported:1; u32 supported_coalesce_params; u32 supported_ring_params; void (*get_drvinfo)(struct net_device *, struct ethtool_drvinfo *); @@ -871,12 +873,6 @@ struct ethtool_ops { int (*get_rxfh)(struct net_device *, struct ethtool_rxfh_param *); int (*set_rxfh)(struct net_device *, struct ethtool_rxfh_param *, struct netlink_ext_ack *extack); - int (*get_rxfh_context)(struct net_device *, - struct ethtool_rxfh_param *, - u32 rss_context); - int (*set_rxfh_context)(struct net_device *, - struct ethtool_rxfh_param *, - u32 *rss_context, bool delete); void (*get_channels)(struct net_device *, struct ethtool_channels *); int (*set_channels)(struct net_device *, struct ethtool_channels *); int (*get_dump_flag)(struct net_device *, struct ethtool_dump *); -- cgit v1.2.3 From 13e59344fb9d3c9d3acd138ae320b5b67b658694 Mon Sep 17 00:00:00 2001 From: Ahmed Zaki Date: Tue, 12 Dec 2023 17:33:16 -0700 Subject: net: ethtool: add support for symmetric-xor RSS hash Symmetric RSS hash functions are beneficial in applications that monitor both Tx and Rx packets of the same flow (IDS, software firewalls, ..etc). Getting all traffic of the same flow on the same RX queue results in higher CPU cache efficiency. A NIC that supports "symmetric-xor" can achieve this RSS hash symmetry by XORing the source and destination fields and pass the values to the RSS hash algorithm. The user may request RSS hash symmetry for a specific algorithm, via: # ethtool -X eth0 hfunc symmetric-xor or turn symmetry off (asymmetric) by: # ethtool -X eth0 hfunc The specific fields for each flow type should then be specified as usual via: # ethtool -N|-U eth0 rx-flow-hash s|d|f|n Reviewed-by: Wojciech Drewek Signed-off-by: Ahmed Zaki Link: https://lore.kernel.org/r/20231213003321.605376-4-ahmed.zaki@intel.com Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 6 ++++++ include/uapi/linux/ethtool.h | 13 ++++++++++++- include/uapi/linux/ethtool_netlink.h | 1 + 3 files changed, 19 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 66fe254c3e51..cfcd952a1d4f 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -615,6 +615,8 @@ struct ethtool_mm_stats { * to allocate a new RSS context; on return this field will * contain the ID of the newly allocated context. * @rss_delete: Set to non-ZERO to remove the @rss_context context. + * @input_xfrm: Defines how the input data is transformed. Valid values are one + * of %RXH_XFRM_*. */ struct ethtool_rxfh_param { u8 hfunc; @@ -624,6 +626,7 @@ struct ethtool_rxfh_param { u8 *key; u32 rss_context; u8 rss_delete; + u8 input_xfrm; }; /** @@ -632,6 +635,8 @@ struct ethtool_rxfh_param { * parameter. * @cap_rss_ctx_supported: indicates if the driver supports RSS * contexts. + * @cap_rss_sym_xor_supported: indicates if the driver supports symmetric-xor + * RSS. * @supported_coalesce_params: supported types of interrupt coalescing. * @supported_ring_params: supported ring params. * @get_drvinfo: Report driver/device information. Modern drivers no @@ -811,6 +816,7 @@ struct ethtool_rxfh_param { struct ethtool_ops { u32 cap_link_lanes_supported:1; u32 cap_rss_ctx_supported:1; + u32 cap_rss_sym_xor_supported:1; u32 supported_coalesce_params; u32 supported_ring_params; void (*get_drvinfo)(struct net_device *, struct ethtool_drvinfo *); diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index f7fba0dc87e5..0787d561ace0 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1266,6 +1266,8 @@ struct ethtool_rxfh_indir { * hardware hash key. * @hfunc: Defines the current RSS hash function used by HW (or to be set to). * Valid values are one of the %ETH_RSS_HASH_*. + * @input_xfrm: Defines how the input data is transformed. Valid values are one + * of %RXH_XFRM_*. * @rsvd8: Reserved for future use; see the note on reserved space. * @rsvd32: Reserved for future use; see the note on reserved space. * @rss_config: RX ring/queue index for each hash value i.e., indirection table @@ -1285,7 +1287,8 @@ struct ethtool_rxfh { __u32 indir_size; __u32 key_size; __u8 hfunc; - __u8 rsvd8[3]; + __u8 input_xfrm; + __u8 rsvd8[2]; __u32 rsvd32; __u32 rss_config[]; }; @@ -1992,6 +1995,14 @@ static inline int ethtool_validate_duplex(__u8 duplex) #define WOL_MODE_COUNT 8 +/* RSS hash function data + * XOR the corresponding source and destination fields of each specified + * protocol. Both copies of the XOR'ed fields are fed into the RSS and RXHASH + * calculation. Note that this XORing reduces the input set entropy and could + * be exploited to reduce the RSS queue spread. + */ +#define RXH_XFRM_SYM_XOR (1 << 0) + /* L2-L4 network traffic flow types */ #define TCP_V4_FLOW 0x01 /* hash or spec (tcp_ip4_spec) */ #define UDP_V4_FLOW 0x02 /* hash or spec (udp_ip4_spec) */ diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 73e2c10dc2cc..3f89074aa06c 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -908,6 +908,7 @@ enum { ETHTOOL_A_RSS_HFUNC, /* u32 */ ETHTOOL_A_RSS_INDIR, /* binary */ ETHTOOL_A_RSS_HKEY, /* binary */ + ETHTOOL_A_RSS_INPUT_XFRM, /* u32 */ __ETHTOOL_A_RSS_CNT, ETHTOOL_A_RSS_MAX = (__ETHTOOL_A_RSS_CNT - 1), -- cgit v1.2.3 From dc6e44c9d6d68e8aa5de78d15f43f93145719b72 Mon Sep 17 00:00:00 2001 From: Qi Zhang Date: Tue, 12 Dec 2023 17:33:18 -0700 Subject: ice: refactor RSS configuration Refactor the driver to use a communication data structure for RSS config. To do so we introduce the new ice_rss_hash_cfg struct, and then pass it as an argument to several functions. Also introduce enum ice_rss_cfg_hdr_type to specify a more granular and flexible RSS configuration: ICE_RSS_OUTER_HEADERS - take outer layer as RSS input set ICE_RSS_INNER_HEADERS - take inner layer as RSS input set ICE_RSS_INNER_HEADERS_W_OUTER_IPV4 - take inner layer as RSS input set for packet with outer IPV4 ICE_RSS_INNER_HEADERS_W_OUTER_IPV6 - take inner layer as RSS input set for packet with outer IPV6 ICE_RSS_ANY_HEADERS - try with outer first then inner (same as the behaviour without this change) Finally, move the virtchnl_rss_algorithm enum to be with the other RSS related structures in the virtchnl.h file. There should be no functional change due to this patch. Reviewed-by: Wojciech Drewek Signed-off-by: Qi Zhang Co-developed-by: Jesse Brandeburg Signed-off-by: Jesse Brandeburg Co-developed-by: Ahmed Zaki Signed-off-by: Ahmed Zaki Link: https://lore.kernel.org/r/20231213003321.605376-6-ahmed.zaki@intel.com Signed-off-by: Jakub Kicinski --- include/linux/avf/virtchnl.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h index 6b3acf15be5c..b0e060cc79ac 100644 --- a/include/linux/avf/virtchnl.h +++ b/include/linux/avf/virtchnl.h @@ -911,6 +911,14 @@ struct virtchnl_rss_hena { VIRTCHNL_CHECK_STRUCT_LEN(8, virtchnl_rss_hena); +/* Type of RSS algorithm */ +enum virtchnl_rss_algorithm { + VIRTCHNL_RSS_ALG_TOEPLITZ_ASYMMETRIC = 0, + VIRTCHNL_RSS_ALG_R_ASYMMETRIC = 1, + VIRTCHNL_RSS_ALG_TOEPLITZ_SYMMETRIC = 2, + VIRTCHNL_RSS_ALG_XOR_SYMMETRIC = 3, +}; + /* VIRTCHNL_OP_ENABLE_CHANNELS * VIRTCHNL_OP_DISABLE_CHANNELS * VF sends these messages to enable or disable channels based on @@ -1095,14 +1103,6 @@ enum virtchnl_vfr_states { VIRTCHNL_VFR_VFACTIVE, }; -/* Type of RSS algorithm */ -enum virtchnl_rss_algorithm { - VIRTCHNL_RSS_ALG_TOEPLITZ_ASYMMETRIC = 0, - VIRTCHNL_RSS_ALG_R_ASYMMETRIC = 1, - VIRTCHNL_RSS_ALG_TOEPLITZ_SYMMETRIC = 2, - VIRTCHNL_RSS_ALG_XOR_SYMMETRIC = 3, -}; - #define VIRTCHNL_MAX_NUM_PROTO_HDRS 32 #define PROTO_HDR_SHIFT 5 #define PROTO_HDR_FIELD_START(proto_hdr_type) ((proto_hdr_type) << PROTO_HDR_SHIFT) -- cgit v1.2.3 From 4a3de3fb0eb6897488dd510006abd9673f1fb34c Mon Sep 17 00:00:00 2001 From: Ahmed Zaki Date: Tue, 12 Dec 2023 17:33:21 -0700 Subject: iavf: enable symmetric-xor RSS for Toeplitz hash function Allow the user to set the symmetric Toeplitz hash function via: # ethtool -X eth0 hfunc toeplitz symmetric-xor The driver will reject any new RSS configuration if a field other than (IP src/dst and L4 src/dst ports) is requested for hashing. The symmetric RSS will not be supported on PFs not advertising the ADV RSS Offload flag (ADV_RSS_SUPPORT()), for example the E700 series (i40e). Reviewed-by: Madhu Chittim Signed-off-by: Ahmed Zaki Link: https://lore.kernel.org/r/20231213003321.605376-9-ahmed.zaki@intel.com Signed-off-by: Jakub Kicinski --- include/linux/avf/virtchnl.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include') diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h index b0e060cc79ac..a44d9dc7e3eb 100644 --- a/include/linux/avf/virtchnl.h +++ b/include/linux/avf/virtchnl.h @@ -118,6 +118,7 @@ enum virtchnl_ops { VIRTCHNL_OP_GET_STATS = 15, VIRTCHNL_OP_RSVD = 16, VIRTCHNL_OP_EVENT = 17, /* must ALWAYS be 17 */ + VIRTCHNL_OP_CONFIG_RSS_HFUNC = 18, /* opcode 19 is reserved */ VIRTCHNL_OP_IWARP = 20, /* advanced opcode */ VIRTCHNL_OP_RDMA = VIRTCHNL_OP_IWARP, @@ -919,6 +920,21 @@ enum virtchnl_rss_algorithm { VIRTCHNL_RSS_ALG_XOR_SYMMETRIC = 3, }; +/* VIRTCHNL_OP_CONFIG_RSS_HFUNC + * VF sends this message to configure the RSS hash function. Only supported + * if both PF and VF drivers set the VIRTCHNL_VF_OFFLOAD_RSS_PF bit during + * configuration negotiation. + * The hash function is initialized to VIRTCHNL_RSS_ALG_TOEPLITZ_ASYMMETRIC + * by the PF. + */ +struct virtchnl_rss_hfunc { + u16 vsi_id; + u16 rss_algorithm; /* enum virtchnl_rss_algorithm */ + u32 reserved; +}; + +VIRTCHNL_CHECK_STRUCT_LEN(8, virtchnl_rss_hfunc); + /* VIRTCHNL_OP_ENABLE_CHANNELS * VIRTCHNL_OP_DISABLE_CHANNELS * VF sends these messages to enable or disable channels based on @@ -1542,6 +1558,9 @@ virtchnl_vc_validate_vf_msg(struct virtchnl_version_info *ver, u32 v_opcode, vrl->lut_entries); } break; + case VIRTCHNL_OP_CONFIG_RSS_HFUNC: + valid_len = sizeof(struct virtchnl_rss_hfunc); + break; case VIRTCHNL_OP_GET_RSS_HENA_CAPS: break; case VIRTCHNL_OP_SET_RSS_HENA: -- cgit v1.2.3 From 05b234565e02ec51df75ec48eb7370e64ed05e04 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 12 Dec 2023 20:35:58 -0800 Subject: wifi: cfg80211: fix spelling & punctutation Correct spelling and run-on sentences. Signed-off-by: Randy Dunlap Cc: Johannes Berg Cc: linux-wireless@vger.kernel.org Cc: Kalle Valo Cc: "David S. Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Link: https://msgid.link/20231213043558.10409-1-rdunlap@infradead.org Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 4d6b9d801c2f..602960dafe0f 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -52,7 +52,7 @@ * such wiphy can have zero, one, or many virtual interfaces associated with * it, which need to be identified as such by pointing the network interface's * @ieee80211_ptr pointer to a &struct wireless_dev which further describes - * the wireless part of the interface, normally this struct is embedded in the + * the wireless part of the interface. Normally this struct is embedded in the * network interface's private data area. Drivers can optionally allow creating * or destroying virtual interfaces on the fly, but without at least one or the * ability to create some the wireless device isn't useful. @@ -2584,7 +2584,7 @@ struct cfg80211_scan_info { * @short_ssid: short ssid to scan for * @bssid: bssid to scan for * @channel_idx: idx of the channel in the channel array in the scan request - * which the above info relvant to + * which the above info is relevant to * @unsolicited_probe: the AP transmits unsolicited probe response every 20 TU * @short_ssid_valid: @short_ssid is valid and can be used * @psc_no_listen: when set, and the channel is a PSC channel, no need to wait @@ -7504,7 +7504,7 @@ void cfg80211_notify_new_peer_candidate(struct net_device *dev, * RFkill integration in cfg80211 is almost invisible to drivers, * as cfg80211 automatically registers an rfkill instance for each * wireless device it knows about. Soft kill is also translated - * into disconnecting and turning all interfaces off, drivers are + * into disconnecting and turning all interfaces off. Drivers are * expected to turn off the device when all interfaces are down. * * However, devices may have a hard RFkill line, in which case they @@ -7552,7 +7552,7 @@ static inline void wiphy_rfkill_stop_polling(struct wiphy *wiphy) * the configuration mechanism. * * A driver supporting vendor commands must register them as an array - * in struct wiphy, with handlers for each one, each command has an + * in struct wiphy, with handlers for each one. Each command has an * OUI and sub command ID to identify it. * * Note that this feature should not be (ab)used to implement protocol @@ -7716,7 +7716,7 @@ static inline void cfg80211_vendor_event(struct sk_buff *skb, gfp_t gfp) * interact with driver-specific tools to aid, for instance, * factory programming. * - * This chapter describes how drivers interact with it, for more + * This chapter describes how drivers interact with it. For more * information see the nl80211 book's chapter on it. */ -- cgit v1.2.3 From 074b3cf442c518631f4b6d11d7fdfe143e17e955 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 12 Dec 2023 20:43:15 -0800 Subject: wifi: nl80211: fix grammar & spellos Correct spelling as reported by codespell. Correct run-on sentences and other grammar issues. Add hyphenation of adjectives. Correct some punctuation. Signed-off-by: Randy Dunlap Cc: Johannes Berg Cc: linux-wireless@vger.kernel.org Cc: Kalle Valo Cc: "David S. Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Link: https://msgid.link/20231213044315.19459-1-rdunlap@infradead.org Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 74 ++++++++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 2d8468cbc457..a682b54bd3ba 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -72,7 +72,7 @@ * For drivers supporting TDLS with external setup (WIPHY_FLAG_SUPPORTS_TDLS * and WIPHY_FLAG_TDLS_EXTERNAL_SETUP), the station lifetime is as follows: * - a setup station entry is added, not yet authorized, without any rate - * or capability information, this just exists to avoid race conditions + * or capability information; this just exists to avoid race conditions * - when the TDLS setup is done, a single NL80211_CMD_SET_STATION is valid * to add rate and capability information to the station and at the same * time mark it authorized. @@ -87,7 +87,7 @@ * DOC: Frame transmission/registration support * * Frame transmission and registration support exists to allow userspace - * management entities such as wpa_supplicant react to management frames + * management entities such as wpa_supplicant to react to management frames * that are not being handled by the kernel. This includes, for example, * certain classes of action frames that cannot be handled in the kernel * for various reasons. @@ -113,7 +113,7 @@ * * Frame transmission allows userspace to send for example the required * responses to action frames. It is subject to some sanity checking, - * but many frames can be transmitted. When a frame was transmitted, its + * but many frames can be transmitted. When a frame is transmitted, its * status is indicated to the sending socket. * * For more technical details, see the corresponding command descriptions @@ -123,7 +123,7 @@ /** * DOC: Virtual interface / concurrency capabilities * - * Some devices are able to operate with virtual MACs, they can have + * Some devices are able to operate with virtual MACs; they can have * more than one virtual interface. The capability handling for this * is a bit complex though, as there may be a number of restrictions * on the types of concurrency that are supported. @@ -135,7 +135,7 @@ * Once concurrency is desired, more attributes must be observed: * To start with, since some interface types are purely managed in * software, like the AP-VLAN type in mac80211 for example, there's - * an additional list of these, they can be added at any time and + * an additional list of these; they can be added at any time and * are only restricted by some semantic restrictions (e.g. AP-VLAN * cannot be added without a corresponding AP interface). This list * is exported in the %NL80211_ATTR_SOFTWARE_IFTYPES attribute. @@ -164,7 +164,7 @@ * Packet coalesce feature helps to reduce number of received interrupts * to host by buffering these packets in firmware/hardware for some * predefined time. Received interrupt will be generated when one of the - * following events occur. + * following events occurs. * a) Expiration of hardware timer whose expiration time is set to maximum * coalescing delay of matching coalesce rule. * b) Coalescing buffer in hardware reaches its limit. @@ -174,7 +174,7 @@ * rule. * a) Maximum coalescing delay * b) List of packet patterns which needs to be matched - * c) Condition for coalescence. pattern 'match' or 'no match' + * c) Condition for coalescence: pattern 'match' or 'no match' * Multiple such rules can be created. */ @@ -213,7 +213,7 @@ /** * DOC: FILS shared key authentication offload * - * FILS shared key authentication offload can be advertized by drivers by + * FILS shared key authentication offload can be advertised by drivers by * setting @NL80211_EXT_FEATURE_FILS_SK_OFFLOAD flag. The drivers that support * FILS shared key authentication offload should be able to construct the * authentication and association frames for FILS shared key authentication and @@ -239,7 +239,7 @@ * The PMKSA can be maintained in userspace persistently so that it can be used * later after reboots or wifi turn off/on also. * - * %NL80211_ATTR_FILS_CACHE_ID is the cache identifier advertized by a FILS + * %NL80211_ATTR_FILS_CACHE_ID is the cache identifier advertised by a FILS * capable AP supporting PMK caching. It specifies the scope within which the * PMKSAs are cached in an ESS. %NL80211_CMD_SET_PMKSA and * %NL80211_CMD_DEL_PMKSA are enhanced to allow support for PMKSA caching based @@ -290,12 +290,12 @@ * If the configuration needs to be applied for specific peer then the MAC * address of the peer needs to be passed in %NL80211_ATTR_MAC, otherwise the * configuration will be applied for all the connected peers in the vif except - * any peers that have peer specific configuration for the TID by default; if - * the %NL80211_TID_CONFIG_ATTR_OVERRIDE flag is set, peer specific values + * any peers that have peer-specific configuration for the TID by default; if + * the %NL80211_TID_CONFIG_ATTR_OVERRIDE flag is set, peer-specific values * will be overwritten. * - * All this configuration is valid only for STA's current connection - * i.e. the configuration will be reset to default when the STA connects back + * All this configuration is valid only for STA's current connection, + * i.e., the configuration will be reset to default when the STA connects back * after disconnection/roaming, and this configuration will be cleared when * the interface goes down. */ @@ -521,7 +521,7 @@ * %NL80211_ATTR_SCHED_SCAN_PLANS. If %NL80211_ATTR_SCHED_SCAN_PLANS is * not specified and only %NL80211_ATTR_SCHED_SCAN_INTERVAL is specified, * scheduled scan will run in an infinite loop with the specified interval. - * These attributes are mutually exculsive, + * These attributes are mutually exclusive, * i.e. NL80211_ATTR_SCHED_SCAN_INTERVAL must not be passed if * NL80211_ATTR_SCHED_SCAN_PLANS is defined. * If for some reason scheduled scan is aborted by the driver, all scan @@ -552,7 +552,7 @@ * %NL80211_CMD_STOP_SCHED_SCAN command is received or when the interface * is brought down while a scheduled scan was running. * - * @NL80211_CMD_GET_SURVEY: get survey resuls, e.g. channel occupation + * @NL80211_CMD_GET_SURVEY: get survey results, e.g. channel occupation * or noise level * @NL80211_CMD_NEW_SURVEY_RESULTS: survey data notification (as a reply to * NL80211_CMD_GET_SURVEY and on the "scan" multicast group) @@ -563,7 +563,7 @@ * using %NL80211_ATTR_SSID, %NL80211_ATTR_FILS_CACHE_ID, * %NL80211_ATTR_PMKID, and %NL80211_ATTR_PMK in case of FILS * authentication where %NL80211_ATTR_FILS_CACHE_ID is the identifier - * advertized by a FILS capable AP identifying the scope of PMKSA in an + * advertised by a FILS capable AP identifying the scope of PMKSA in an * ESS. * @NL80211_CMD_DEL_PMKSA: Delete a PMKSA cache entry, using %NL80211_ATTR_MAC * (for the BSSID) and %NL80211_ATTR_PMKID or using %NL80211_ATTR_SSID, @@ -608,7 +608,7 @@ * BSSID in case of station mode). %NL80211_ATTR_SSID is used to specify * the SSID (mainly for association, but is included in authentication * request, too, to help BSS selection. %NL80211_ATTR_WIPHY_FREQ + - * %NL80211_ATTR_WIPHY_FREQ_OFFSET is used to specify the frequence of the + * %NL80211_ATTR_WIPHY_FREQ_OFFSET is used to specify the frequency of the * channel in MHz. %NL80211_ATTR_AUTH_TYPE is used to specify the * authentication type. %NL80211_ATTR_IE is used to define IEs * (VendorSpecificInfo, but also including RSN IE and FT IEs) to be added @@ -817,7 +817,7 @@ * reached. * @NL80211_CMD_SET_CHANNEL: Set the channel (using %NL80211_ATTR_WIPHY_FREQ * and the attributes determining channel width) the given interface - * (identifed by %NL80211_ATTR_IFINDEX) shall operate on. + * (identified by %NL80211_ATTR_IFINDEX) shall operate on. * In case multiple channels are supported by the device, the mechanism * with which it switches channels is implementation-defined. * When a monitor interface is given, it can only switch channel while @@ -889,7 +889,7 @@ * inform userspace of the new replay counter. * * @NL80211_CMD_PMKSA_CANDIDATE: This is used as an event to inform userspace - * of PMKSA caching dandidates. + * of PMKSA caching candidates. * * @NL80211_CMD_TDLS_OPER: Perform a high-level TDLS command (e.g. link setup). * In addition, this can be used as an event to request userspace to take @@ -925,7 +925,7 @@ * * @NL80211_CMD_PROBE_CLIENT: Probe an associated station on an AP interface * by sending a null data frame to it and reporting when the frame is - * acknowleged. This is used to allow timing out inactive clients. Uses + * acknowledged. This is used to allow timing out inactive clients. Uses * %NL80211_ATTR_IFINDEX and %NL80211_ATTR_MAC. The command returns a * direct reply with an %NL80211_ATTR_COOKIE that is later used to match * up the event with the request. The event includes the same data and @@ -1847,7 +1847,7 @@ enum nl80211_commands { * using %CMD_CONTROL_PORT_FRAME. If control port routing over NL80211 is * to be used then userspace must also use the %NL80211_ATTR_SOCKET_OWNER * flag. When used with %NL80211_ATTR_CONTROL_PORT_NO_PREAUTH, pre-auth - * frames are not forwared over the control port. + * frames are not forwarded over the control port. * * @NL80211_ATTR_TESTDATA: Testmode data blob, passed through to the driver. * We recommend using nested, driver-specific attributes within this. @@ -1984,10 +1984,10 @@ enum nl80211_commands { * bit. Depending on which antennas are selected in the bitmap, 802.11n * drivers can derive which chainmasks to use (if all antennas belonging to * a particular chain are disabled this chain should be disabled) and if - * a chain has diversity antennas wether diversity should be used or not. + * a chain has diversity antennas whether diversity should be used or not. * HT capabilities (STBC, TX Beamforming, Antenna selection) can be * derived from the available chains after applying the antenna mask. - * Non-802.11n drivers can derive wether to use diversity or not. + * Non-802.11n drivers can derive whether to use diversity or not. * Drivers may reject configurations or RX/TX mask combinations they cannot * support by returning -EINVAL. * @@ -2557,7 +2557,7 @@ enum nl80211_commands { * from successful FILS authentication and is used with * %NL80211_CMD_CONNECT. * - * @NL80211_ATTR_FILS_CACHE_ID: A 2-octet identifier advertized by a FILS AP + * @NL80211_ATTR_FILS_CACHE_ID: A 2-octet identifier advertised by a FILS AP * identifying the scope of PMKSAs. This is used with * @NL80211_CMD_SET_PMKSA and @NL80211_CMD_DEL_PMKSA. * @@ -4200,7 +4200,7 @@ enum nl80211_wmm_rule { * (100 * dBm). * @NL80211_FREQUENCY_ATTR_DFS_STATE: current state for DFS * (enum nl80211_dfs_state) - * @NL80211_FREQUENCY_ATTR_DFS_TIME: time in miliseconds for how long + * @NL80211_FREQUENCY_ATTR_DFS_TIME: time in milliseconds for how long * this channel is in this DFS state. * @NL80211_FREQUENCY_ATTR_NO_HT40_MINUS: HT40- isn't possible with this * channel as the control channel @@ -5518,7 +5518,7 @@ enum nl80211_tx_rate_setting { * (%NL80211_TID_CONFIG_ATTR_TIDS, %NL80211_TID_CONFIG_ATTR_OVERRIDE). * @NL80211_TID_CONFIG_ATTR_PEER_SUPP: same as the previous per-vif one, but * per peer instead. - * @NL80211_TID_CONFIG_ATTR_OVERRIDE: flag attribue, if set indicates + * @NL80211_TID_CONFIG_ATTR_OVERRIDE: flag attribute, if set indicates * that the new configuration overrides all previous peer * configurations, otherwise previous peer specific configurations * should be left untouched. @@ -5901,7 +5901,7 @@ enum nl80211_attr_coalesce_rule { /** * enum nl80211_coalesce_condition - coalesce rule conditions - * @NL80211_COALESCE_CONDITION_MATCH: coalaesce Rx packets when patterns + * @NL80211_COALESCE_CONDITION_MATCH: coalesce Rx packets when patterns * in a rule are matched. * @NL80211_COALESCE_CONDITION_NO_MATCH: coalesce Rx packets when patterns * in a rule are not matched. @@ -6000,7 +6000,7 @@ enum nl80211_if_combination_attrs { * enum nl80211_plink_state - state of a mesh peer link finite state machine * * @NL80211_PLINK_LISTEN: initial state, considered the implicit - * state of non existent mesh peer links + * state of non-existent mesh peer links * @NL80211_PLINK_OPN_SNT: mesh plink open frame has been sent to * this mesh peer * @NL80211_PLINK_OPN_RCVD: mesh plink open frame has been received @@ -6293,7 +6293,7 @@ enum nl80211_feature_flags { * request to use RRM (see %NL80211_ATTR_USE_RRM) with * %NL80211_CMD_ASSOCIATE and %NL80211_CMD_CONNECT requests, which will set * the ASSOC_REQ_USE_RRM flag in the association request even if - * NL80211_FEATURE_QUIET is not advertized. + * NL80211_FEATURE_QUIET is not advertised. * @NL80211_EXT_FEATURE_MU_MIMO_AIR_SNIFFER: This device supports MU-MIMO air * sniffer which means that it can be configured to hear packets from * certain groups which can be configured by the @@ -6305,7 +6305,7 @@ enum nl80211_feature_flags { * the BSS that the interface that requested the scan is connected to * (if available). * @NL80211_EXT_FEATURE_BSS_PARENT_TSF: Per BSS, this driver reports the - * time the last beacon/probe was received. For a non MLO connection, the + * time the last beacon/probe was received. For a non-MLO connection, the * time is the TSF of the BSS that the interface that requested the scan is * connected to (if available). For an MLO connection, the time is the TSF * of the BSS corresponding with link ID specified in the scan request (if @@ -6313,7 +6313,7 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_SET_SCAN_DWELL: This driver supports configuration of * channel dwell time. * @NL80211_EXT_FEATURE_BEACON_RATE_LEGACY: Driver supports beacon rate - * configuration (AP/mesh), supporting a legacy (non HT/VHT) rate. + * configuration (AP/mesh), supporting a legacy (non-HT/VHT) rate. * @NL80211_EXT_FEATURE_BEACON_RATE_HT: Driver supports beacon rate * configuration (AP/mesh) with HT rates. * @NL80211_EXT_FEATURE_BEACON_RATE_VHT: Driver supports beacon rate @@ -6649,7 +6649,7 @@ enum nl80211_timeout_reason { * request parameters IE in the probe request * @NL80211_SCAN_FLAG_ACCEPT_BCAST_PROBE_RESP: accept broadcast probe responses * @NL80211_SCAN_FLAG_OCE_PROBE_REQ_HIGH_TX_RATE: send probe request frames at - * rate of at least 5.5M. In case non OCE AP is discovered in the channel, + * rate of at least 5.5M. In case non-OCE AP is discovered in the channel, * only the first probe req in the channel will be sent in high rate. * @NL80211_SCAN_FLAG_OCE_PROBE_REQ_DEFERRAL_SUPPRESSION: allow probe request * tx deferral (dot11FILSProbeDelay shall be set to 15ms) @@ -6685,7 +6685,7 @@ enum nl80211_timeout_reason { * received on the 2.4/5 GHz channels to actively scan only the 6GHz * channels on which APs are expected to be found. Note that when not set, * the scan logic would scan all 6GHz channels, but since transmission of - * probe requests on non PSC channels is limited, it is highly likely that + * probe requests on non-PSC channels is limited, it is highly likely that * these channels would passively be scanned. Also note that when the flag * is set, in addition to the colocated APs, PSC channels would also be * scanned if the user space has asked for it. @@ -7017,7 +7017,7 @@ enum nl80211_nan_func_term_reason { * The instance ID for the follow up Service Discovery Frame. This is u8. * @NL80211_NAN_FUNC_FOLLOW_UP_REQ_ID: relevant if the function's type * is follow up. This is a u8. - * The requestor instance ID for the follow up Service Discovery Frame. + * The requester instance ID for the follow up Service Discovery Frame. * @NL80211_NAN_FUNC_FOLLOW_UP_DEST: the MAC address of the recipient of the * follow up Service Discovery Frame. This is a binary attribute. * @NL80211_NAN_FUNC_CLOSE_RANGE: is this function limited for devices in a @@ -7407,7 +7407,7 @@ enum nl80211_peer_measurement_attrs { * @NL80211_PMSR_FTM_CAPA_ATTR_TRIGGER_BASED: flag attribute indicating if * trigger based ranging measurement is supported * @NL80211_PMSR_FTM_CAPA_ATTR_NON_TRIGGER_BASED: flag attribute indicating - * if non trigger based ranging measurement is supported + * if non-trigger-based ranging measurement is supported * * @NUM_NL80211_PMSR_FTM_CAPA_ATTR: internal * @NL80211_PMSR_FTM_CAPA_ATTR_MAX: highest attribute number @@ -7461,7 +7461,7 @@ enum nl80211_peer_measurement_ftm_capa { * if neither %NL80211_PMSR_FTM_REQ_ATTR_TRIGGER_BASED nor * %NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED is set, EDCA based * ranging will be used. - * @NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED: request non trigger based + * @NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED: request non-trigger-based * ranging measurement (flag) * This attribute and %NL80211_PMSR_FTM_REQ_ATTR_TRIGGER_BASED are * mutually exclusive. @@ -7539,7 +7539,7 @@ enum nl80211_peer_measurement_ftm_failure_reasons { * @NL80211_PMSR_FTM_RESP_ATTR_NUM_FTMR_ATTEMPTS: number of FTM Request frames * transmitted (u32, optional) * @NL80211_PMSR_FTM_RESP_ATTR_NUM_FTMR_SUCCESSES: number of FTM Request frames - * that were acknowleged (u32, optional) + * that were acknowledged (u32, optional) * @NL80211_PMSR_FTM_RESP_ATTR_BUSY_RETRY_TIME: retry time received from the * busy peer (u32, seconds) * @NL80211_PMSR_FTM_RESP_ATTR_NUM_BURSTS_EXP: actual number of bursts exponent -- cgit v1.2.3 From 04d25ccea2b3199269b7e500da33023b51418fde Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 12 Dec 2023 20:37:35 -0800 Subject: net, xdp: Correct grammar Use the correct verb form in 2 places in the XDP rx-queue comment. Signed-off-by: Randy Dunlap Signed-off-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/20231213043735.30208-1-rdunlap@infradead.org --- include/net/xdp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/xdp.h b/include/net/xdp.h index 8cd04a74dba5..e6770dd40c91 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -16,7 +16,7 @@ * * The XDP RX-queue info (xdp_rxq_info) is associated with the driver * level RX-ring queues. It is information that is specific to how - * the driver have configured a given RX-ring queue. + * the driver has configured a given RX-ring queue. * * Each xdp_buff frame received in the driver carries a (pointer) * reference to this xdp_rxq_info structure. This provides the XDP @@ -32,7 +32,7 @@ * The struct is not directly tied to the XDP prog. A new XDP prog * can be attached as long as it doesn't change the underlying * RX-ring. If the RX-ring does change significantly, the NIC driver - * naturally need to stop the RX-ring before purging and reallocating + * naturally needs to stop the RX-ring before purging and reallocating * memory. In that process the driver MUST call unregister (which * also applies for driver shutdown and unload). The register API is * also mandatory during RX-ring setup. -- cgit v1.2.3 From 8f0ec8c681755f523cf842bfe350ea40609b83a9 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Thu, 14 Dec 2023 15:49:02 -0700 Subject: bpf: xfrm: Add bpf_xdp_get_xfrm_state() kfunc This commit adds an unstable kfunc helper to access internal xfrm_state associated with an SA. This is intended to be used for the upcoming IPsec pcpu work to assign special pcpu SAs to a particular CPU. In other words: for custom software RSS. That being said, the function that this kfunc wraps is fairly generic and used for a lot of xfrm tasks. I'm sure people will find uses elsewhere over time. This commit also adds a corresponding bpf_xdp_xfrm_state_release() kfunc to release the refcnt acquired by bpf_xdp_get_xfrm_state(). The verifier will require that all acquired xfrm_state's are released. Co-developed-by: Antony Antony Signed-off-by: Antony Antony Acked-by: Steffen Klassert Signed-off-by: Daniel Xu Link: https://lore.kernel.org/r/a29699c42f5fad456b875c98dd11c6afc3ffb707.1702593901.git.dxu@dxuuu.xyz Signed-off-by: Alexei Starovoitov --- include/net/xfrm.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index c9bb0f892f55..1d107241b901 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -2190,4 +2190,13 @@ static inline int register_xfrm_interface_bpf(void) #endif +#if IS_ENABLED(CONFIG_DEBUG_INFO_BTF) +int register_xfrm_state_bpf(void); +#else +static inline int register_xfrm_state_bpf(void) +{ + return 0; +} +#endif + #endif /* _NET_XFRM_H */ -- cgit v1.2.3 From bf873a800ac3234eba991603a450eaa517d27022 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 12 Dec 2023 20:35:11 -0800 Subject: net: skbuff: fix spelling errors Correct spelling as reported by codespell. Signed-off-by: Randy Dunlap Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20231213043511.10357-1-rdunlap@infradead.org Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b370eb8d70f7..7ce38874dbd1 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1069,7 +1069,7 @@ struct sk_buff { refcount_t users; #ifdef CONFIG_SKB_EXTENSIONS - /* only useable after checking ->active_extensions != 0 */ + /* only usable after checking ->active_extensions != 0 */ struct skb_ext *extensions; #endif }; @@ -3311,7 +3311,7 @@ static inline struct page *__dev_alloc_pages(gfp_t gfp_mask, unsigned int order) { /* This piece of code contains several assumptions. - * 1. This is for device Rx, therefor a cold page is preferred. + * 1. This is for device Rx, therefore a cold page is preferred. * 2. The expectation is the user wants a compound page. * 3. If requesting a order 0 page it will not be compound * due to the check to see if order has a value in prep_new_page @@ -4247,7 +4247,7 @@ static inline bool __skb_metadata_differs(const struct sk_buff *skb_a, { const void *a = skb_metadata_end(skb_a); const void *b = skb_metadata_end(skb_b); - /* Using more efficient varaiant than plain call to memcmp(). */ + /* Using more efficient variant than plain call to memcmp(). */ #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 u64 diffs = 0; -- cgit v1.2.3 From fcb29877f7e18a1f27d7d6871f5f7bb6aaade575 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 12 Dec 2023 20:36:50 -0800 Subject: page_pool: fix typos and punctuation Correct spelling (s/and/any) and a run-on sentence. Spell out "multi". Signed-off-by: Randy Dunlap Acked-by: Jesper Dangaard Brouer Acked-by: Ilias Apalodimas Link: https://lore.kernel.org/r/20231213043650.12672-1-rdunlap@infradead.org Signed-off-by: Jakub Kicinski --- include/net/page_pool/helpers.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h index 841e0a930bd7..ead2c0d24b2c 100644 --- a/include/net/page_pool/helpers.h +++ b/include/net/page_pool/helpers.h @@ -11,7 +11,7 @@ * The page_pool allocator is optimized for recycling page or page fragment used * by skb packet and xdp frame. * - * Basic use involves replacing and alloc_pages() calls with page_pool_alloc(), + * Basic use involves replacing any alloc_pages() calls with page_pool_alloc(), * which allocate memory with or without page splitting depending on the * requested memory size. * @@ -37,15 +37,15 @@ * attach the page_pool object to a page_pool-aware object like skbs marked with * skb_mark_for_recycle(). * - * page_pool_put_page() may be called multi times on the same page if a page is - * split into multi fragments. For the last fragment, it will either recycle the - * page, or in case of page->_refcount > 1, it will release the DMA mapping and - * in-flight state accounting. + * page_pool_put_page() may be called multiple times on the same page if a page + * is split into multiple fragments. For the last fragment, it will either + * recycle the page, or in case of page->_refcount > 1, it will release the DMA + * mapping and in-flight state accounting. * * dma_sync_single_range_for_device() is only called for the last fragment when * page_pool is created with PP_FLAG_DMA_SYNC_DEV flag, so it depends on the * last freed fragment to do the sync_for_device operation for all fragments in - * the same page when a page is split, the API user must setup pool->p.max_len + * the same page when a page is split. The API user must setup pool->p.max_len * and pool->p.offset correctly and ensure that page_pool_put_page() is called * with dma_sync_size being -1 for fragment API. */ -- cgit v1.2.3 From 2373699560a754079579b7722b50d1d38de1960e Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Tue, 28 Nov 2023 12:16:55 +0100 Subject: mac802154: Avoid new associations while disassociating While disassociating from a PAN ourselves, let's set the maximum number of associations temporarily to zero to be sure no new device tries to associate with us. Signed-off-by: Miquel Raynal Acked-by: Stefan Schmidt Acked-by: Alexander Aring Link: https://lore.kernel.org/linux-wpan/20231128111655.507479-6-miquel.raynal@bootlin.com --- include/net/cfg802154.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h index a64bbcd71f10..cd95711b12b8 100644 --- a/include/net/cfg802154.h +++ b/include/net/cfg802154.h @@ -589,8 +589,10 @@ cfg802154_device_is_child(struct wpan_dev *wpan_dev, * cfg802154_set_max_associations - Limit the number of future associations * @wpan_dev: the wpan device * @max: the maximum number of devices we accept to associate + * @return: the old maximum value */ -void cfg802154_set_max_associations(struct wpan_dev *wpan_dev, unsigned int max); +unsigned int cfg802154_set_max_associations(struct wpan_dev *wpan_dev, + unsigned int max); /** * cfg802154_get_free_short_addr - Get a free address among the known devices -- cgit v1.2.3 From 02fed6d92badf08e2ac2cd1755d14c45c8f3d0ca Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Wed, 13 Dec 2023 02:01:47 -0800 Subject: net: mana: add msix index sharing between EQs This patch allows to assign and poll more than one EQ on the same msix index. It is achieved by introducing a list of attached EQs in each IRQ context. It also removes the existing msix_index map that tried to ensure that there is only one EQ at each msix_index. This patch exports symbols for creating EQs from other MANA kernel modules. Signed-off-by: Konstantin Taranov Signed-off-by: David S. Miller --- include/net/mana/gdma.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index 88b6ef7ce1a6..76f2fd2645b7 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -293,6 +293,7 @@ struct gdma_queue { u32 head; u32 tail; + struct list_head entry; /* Extra fields specific to EQ/CQ. */ union { @@ -328,6 +329,7 @@ struct gdma_queue_spec { void *context; unsigned long log2_throttle_limit; + unsigned int msix_index; } eq; struct { @@ -344,7 +346,9 @@ struct gdma_queue_spec { struct gdma_irq_context { void (*handler)(void *arg); - void *arg; + /* Protect the eq_list */ + spinlock_t lock; + struct list_head eq_list; char name[MANA_IRQ_NAME_SZ]; }; @@ -355,7 +359,6 @@ struct gdma_context { unsigned int max_num_queues; unsigned int max_num_msix; unsigned int num_msix_usable; - struct gdma_resource msix_resource; struct gdma_irq_context *irq_contexts; /* L2 MTU */ -- cgit v1.2.3 From 0fe1798968115488c0c02f4633032a015b1faf97 Mon Sep 17 00:00:00 2001 From: Arseniy Krasnov Date: Thu, 14 Dec 2023 15:52:29 +0300 Subject: virtio/vsock: send credit update during setting SO_RCVLOWAT Send credit update message when SO_RCVLOWAT is updated and it is bigger than number of bytes in rx queue. It is needed, because 'poll()' will wait until number of bytes in rx queue will be not smaller than O_RCVLOWAT, so kick sender to send more data. Otherwise mutual hungup for tx/rx is possible: sender waits for free space and receiver is waiting data in 'poll()'. Rename 'set_rcvlowat' callback to 'notify_set_rcvlowat' and set 'sk->sk_rcvlowat' only in one place (i.e. 'vsock_set_rcvlowat'), so the transport doesn't need to do it. Fixes: b89d882dc9fc ("vsock/virtio: reduce credit update messages") Signed-off-by: Arseniy Krasnov Reviewed-by: Stefano Garzarella Acked-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- include/linux/virtio_vsock.h | 1 + include/net/af_vsock.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h index ebb3ce63d64d..c82089dee0c8 100644 --- a/include/linux/virtio_vsock.h +++ b/include/linux/virtio_vsock.h @@ -256,4 +256,5 @@ void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit); void virtio_transport_deliver_tap_pkt(struct sk_buff *skb); int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list); int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t read_actor); +int virtio_transport_notify_set_rcvlowat(struct vsock_sock *vsk, int val); #endif /* _LINUX_VIRTIO_VSOCK_H */ diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index e302c0e804d0..535701efc1e5 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -137,7 +137,6 @@ struct vsock_transport { u64 (*stream_rcvhiwat)(struct vsock_sock *); bool (*stream_is_active)(struct vsock_sock *); bool (*stream_allow)(u32 cid, u32 port); - int (*set_rcvlowat)(struct vsock_sock *vsk, int val); /* SEQ_PACKET. */ ssize_t (*seqpacket_dequeue)(struct vsock_sock *vsk, struct msghdr *msg, @@ -168,6 +167,7 @@ struct vsock_transport { struct vsock_transport_send_notify_data *); /* sk_lock held by the caller */ void (*notify_buffer_size)(struct vsock_sock *, u64 *); + int (*notify_set_rcvlowat)(struct vsock_sock *vsk, int val); /* Shutdown. */ int (*shutdown)(struct vsock_sock *, int); -- cgit v1.2.3 From f5769faeec36b9d5b9df2c3e4f05a76d04ffd9c9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 14 Dec 2023 10:49:00 +0000 Subject: net: Namespace-ify sysctl_optmem_max optmem_max being used in tx zerocopy, we want to be able to control it on a netns basis. Following patch changes two tests. Tested: oqq130:~# cat /proc/sys/net/core/optmem_max 131072 oqq130:~# echo 1000000 >/proc/sys/net/core/optmem_max oqq130:~# cat /proc/sys/net/core/optmem_max 1000000 oqq130:~# unshare -n oqq130:~# cat /proc/sys/net/core/optmem_max 131072 oqq130:~# exit logout oqq130:~# cat /proc/sys/net/core/optmem_max 1000000 Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/net/netns/core.h | 1 + include/net/sock.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netns/core.h b/include/net/netns/core.h index a91ef9f8de60..78214f1b43a2 100644 --- a/include/net/netns/core.h +++ b/include/net/netns/core.h @@ -13,6 +13,7 @@ struct netns_core { struct ctl_table_header *sysctl_hdr; int sysctl_somaxconn; + int sysctl_optmem_max; u8 sysctl_txrehash; #ifdef CONFIG_PROC_FS diff --git a/include/net/sock.h b/include/net/sock.h index 1d6931caf0c3..8b6fe164b218 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2920,7 +2920,6 @@ extern __u32 sysctl_wmem_max; extern __u32 sysctl_rmem_max; extern int sysctl_tstamp_allow_data; -extern int sysctl_optmem_max; extern __u32 sysctl_wmem_default; extern __u32 sysctl_rmem_default; -- cgit v1.2.3 From b059aef76c519226730dd18777c0e15dad4fae21 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 14 Dec 2023 17:57:35 -0800 Subject: netlink: specs: mptcp: rename the MPTCP path management spec We assume in handful of places that the name of the spec is the same as the name of the family. We could fix that but it seems like a fair assumption to make. Rename the MPTCP spec instead. Reviewed-by: Mat Martineau Reviewed-by: Donald Hunter Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/uapi/linux/mptcp_pm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/mptcp_pm.h b/include/uapi/linux/mptcp_pm.h index b5d11aece408..50589e5dd6a3 100644 --- a/include/uapi/linux/mptcp_pm.h +++ b/include/uapi/linux/mptcp_pm.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ /* Do not edit directly, auto-generated from: */ -/* Documentation/netlink/specs/mptcp.yaml */ +/* Documentation/netlink/specs/mptcp_pm.yaml */ /* YNL-GEN uapi header */ #ifndef _UAPI_LINUX_MPTCP_PM_H -- cgit v1.2.3 From 4382159696c9af67ee047ed55f2dbf05480f52f6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 Dec 2023 10:12:17 +0100 Subject: cfi: Flip headers Normal include order is that linux/foo.h should include asm/foo.h, CFI has it the wrong way around. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Sami Tolvanen Link: https://lore.kernel.org/r/20231215092707.231038174@infradead.org Signed-off-by: Alexei Starovoitov --- include/asm-generic/Kbuild | 1 + include/asm-generic/cfi.h | 5 +++++ include/linux/cfi.h | 1 + 3 files changed, 7 insertions(+) create mode 100644 include/asm-generic/cfi.h (limited to 'include') diff --git a/include/asm-generic/Kbuild b/include/asm-generic/Kbuild index def242528b1d..d436bee4d129 100644 --- a/include/asm-generic/Kbuild +++ b/include/asm-generic/Kbuild @@ -11,6 +11,7 @@ mandatory-y += bitops.h mandatory-y += bug.h mandatory-y += bugs.h mandatory-y += cacheflush.h +mandatory-y += cfi.h mandatory-y += checksum.h mandatory-y += compat.h mandatory-y += current.h diff --git a/include/asm-generic/cfi.h b/include/asm-generic/cfi.h new file mode 100644 index 000000000000..41fac3537bf9 --- /dev/null +++ b/include/asm-generic/cfi.h @@ -0,0 +1,5 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_GENERIC_CFI_H +#define __ASM_GENERIC_CFI_H + +#endif /* __ASM_GENERIC_CFI_H */ diff --git a/include/linux/cfi.h b/include/linux/cfi.h index 3552ec82b725..2309d74e77e6 100644 --- a/include/linux/cfi.h +++ b/include/linux/cfi.h @@ -9,6 +9,7 @@ #include #include +#include #ifdef CONFIG_CFI_CLANG enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr, -- cgit v1.2.3 From 4f9087f16651aca4a5f32da840a53f6660f0579a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 Dec 2023 10:12:18 +0100 Subject: x86/cfi,bpf: Fix BPF JIT call The current BPF call convention is __nocfi, except when it calls !JIT things, then it calls regular C functions. It so happens that with FineIBT the __nocfi and C calling conventions are incompatible. Specifically __nocfi will call at func+0, while FineIBT will have endbr-poison there, which is not a valid indirect target. Causing #CP. Notably this only triggers on IBT enabled hardware, which is probably why this hasn't been reported (also, most people will have JIT on anyway). Implement proper CFI prologues for the BPF JIT codegen and drop __nocfi for x86. Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20231215092707.345270396@infradead.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 12 ++++++++++-- include/linux/cfi.h | 7 +++++++ 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c87c608a3689..9d84c376851a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -29,6 +29,7 @@ #include #include #include +#include struct bpf_verifier_env; struct bpf_verifier_log; @@ -1211,7 +1212,11 @@ struct bpf_dispatcher { #endif }; -static __always_inline __nocfi unsigned int bpf_dispatcher_nop_func( +#ifndef __bpfcall +#define __bpfcall __nocfi +#endif + +static __always_inline __bpfcall unsigned int bpf_dispatcher_nop_func( const void *ctx, const struct bpf_insn *insnsi, bpf_func_t bpf_func) @@ -1303,7 +1308,7 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func #define DEFINE_BPF_DISPATCHER(name) \ __BPF_DISPATCHER_SC(name); \ - noinline __nocfi unsigned int bpf_dispatcher_##name##_func( \ + noinline __bpfcall unsigned int bpf_dispatcher_##name##_func( \ const void *ctx, \ const struct bpf_insn *insnsi, \ bpf_func_t bpf_func) \ @@ -1453,6 +1458,9 @@ struct bpf_prog_aux { struct bpf_kfunc_desc_tab *kfunc_tab; struct bpf_kfunc_btf_tab *kfunc_btf_tab; u32 size_poke_tab; +#ifdef CONFIG_FINEIBT + struct bpf_ksym ksym_prefix; +#endif struct bpf_ksym ksym; const struct bpf_prog_ops *ops; struct bpf_map **used_maps; diff --git a/include/linux/cfi.h b/include/linux/cfi.h index 2309d74e77e6..1ed2d96c0cfc 100644 --- a/include/linux/cfi.h +++ b/include/linux/cfi.h @@ -11,6 +11,13 @@ #include #include +#ifndef cfi_get_offset +static inline int cfi_get_offset(void) +{ + return 0; +} +#endif + #ifdef CONFIG_CFI_CLANG enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr, unsigned long *target, u32 type); -- cgit v1.2.3 From 2cd3e3772e41377f32d6eea643e0590774e9187c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 Dec 2023 10:12:20 +0100 Subject: x86/cfi,bpf: Fix bpf_struct_ops CFI BPF struct_ops uses __arch_prepare_bpf_trampoline() to write trampolines for indirect function calls. These tramplines much have matching CFI. In order to obtain the correct CFI hash for the various methods, add a matching structure that contains stub functions, the compiler will generate correct CFI which we can pilfer for the trampolines. Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20231215092707.566977112@infradead.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9d84c376851a..db46b3359bf5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1060,6 +1060,17 @@ struct btf_func_model { */ #define BPF_TRAMP_F_TAIL_CALL_CTX BIT(7) +/* + * Indicate the trampoline should be suitable to receive indirect calls; + * without this indirectly calling the generated code can result in #UD/#CP, + * depending on the CFI options. + * + * Used by bpf_struct_ops. + * + * Incompatible with FENTRY usage, overloads @func_addr argument. + */ +#define BPF_TRAMP_F_INDIRECT BIT(8) + /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50 * bytes on x86. */ @@ -1697,6 +1708,7 @@ struct bpf_struct_ops { struct btf_func_model func_models[BPF_STRUCT_OPS_MAX_NR_MEMBERS]; u32 type_id; u32 value_id; + void *cfi_stubs; }; #if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL) @@ -1710,6 +1722,7 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, struct bpf_tramp_link *link, const struct btf_func_model *model, + void *stub_func, void *image, void *image_end); static inline bool bpf_try_module_get(const void *data, struct module *owner) { -- cgit v1.2.3 From e9d13b9d2f99ccf7afeab490d97eaa5ac9846598 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 Dec 2023 10:12:21 +0100 Subject: cfi: Add CFI_NOSEAL() Add a CFI_NOSEAL() helper to mark functions that need to retain their CFI information, despite not otherwise leaking their address. Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20231215092707.669401084@infradead.org Signed-off-by: Alexei Starovoitov --- include/linux/cfi.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/cfi.h b/include/linux/cfi.h index 1ed2d96c0cfc..f0df518e11dd 100644 --- a/include/linux/cfi.h +++ b/include/linux/cfi.h @@ -46,4 +46,8 @@ static inline void module_cfi_finalize(const Elf_Ehdr *hdr, #endif /* CONFIG_ARCH_USES_CFI_TRAPS */ #endif /* CONFIG_MODULES */ +#ifndef CFI_NOSEAL +#define CFI_NOSEAL(x) +#endif + #endif /* _LINUX_CFI_H */ -- cgit v1.2.3 From 852486b35f344887786d63250946dd921a05d7e8 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 15 Dec 2023 10:12:23 +0100 Subject: x86/cfi,bpf: Fix bpf_exception_cb() signature As per the earlier patches, BPF sub-programs have bpf_callback_t signature and CFI expects callers to have matching signature. This is violated by bpf_prog_aux::bpf_exception_cb(). [peterz: Changelog] Reported-by: Peter Zijlstra Signed-off-by: Alexei Starovoitov Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/CAADnVQ+Z7UcXXBBhMubhcMM=R-dExk-uHtfOLtoLxQ1XxEpqEA@mail.gmail.com Link: https://lore.kernel.org/r/20231215092707.910319166@infradead.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index db46b3359bf5..5e694934cf37 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1484,7 +1484,7 @@ struct bpf_prog_aux { int cgroup_atype; /* enum cgroup_bpf_attach_type */ struct bpf_map *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]; char name[BPF_OBJ_NAME_LEN]; - unsigned int (*bpf_exception_cb)(u64 cookie, u64 sp, u64 bp); + u64 (*bpf_exception_cb)(u64 cookie, u64 sp, u64 bp, u64, u64); #ifdef CONFIG_SECURITY void *security; #endif -- cgit v1.2.3 From 41db7626b73210cb99eea9cf672dcfce58c68ab2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 14 Dec 2023 19:29:38 +0000 Subject: inet: returns a bool from inet_sk_get_local_port_range() Change inet_sk_get_local_port_range() to return a boolean, telling the callers if the port range was provided by IP_LOCAL_PORT_RANGE socket option. Adds documentation while we are at it. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20231214192939.1962891-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/ip.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/ip.h b/include/net/ip.h index b31be912489a..de0c69c57e3c 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -356,7 +356,7 @@ static inline void inet_get_local_port_range(const struct net *net, int *low, in *low = range & 0xffff; *high = range >> 16; } -void inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high); +bool inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high); #ifdef CONFIG_SYSCTL static inline bool inet_is_local_reserved_port(struct net *net, unsigned short port) -- cgit v1.2.3 From f7dc3248dcfbdd81b5be64272f38b87a8e8085e7 Mon Sep 17 00:00:00 2001 From: Liang Chen Date: Fri, 15 Dec 2023 11:30:11 +0800 Subject: skbuff: Optimization of SKB coalescing for page pool In order to address the issues encountered with commit 1effe8ca4e34 ("skbuff: fix coalescing for page_pool fragment recycling"), the combination of the following condition was excluded from skb coalescing: from->pp_recycle = 1 from->cloned = 1 to->pp_recycle = 1 However, with page pool environments, the aforementioned combination can be quite common(ex. NetworkMananger may lead to the additional packet_type being registered, thus the cloning). In scenarios with a higher number of small packets, it can significantly affect the success rate of coalescing. For example, considering packets of 256 bytes size, our comparison of coalescing success rate is as follows: Without page pool: 70% With page pool: 13% Consequently, this has an impact on performance: Without page pool: 2.57 Gbits/sec With page pool: 2.26 Gbits/sec Therefore, it seems worthwhile to optimize this scenario and enable coalescing of this particular combination. To achieve this, we need to ensure the correct increment of the "from" SKB page's page pool reference count (pp_ref_count). Following this optimization, the success rate of coalescing measured in our environment has improved as follows: With page pool: 60% This success rate is approaching the rate achieved without using page pool, and the performance has also been improved: With page pool: 2.52 Gbits/sec Below is the performance comparison for small packets before and after this optimization. We observe no impact to packets larger than 4K. packet size before after improved (bytes) (Gbits/sec) (Gbits/sec) 128 1.19 1.27 7.13% 256 2.26 2.52 11.75% 512 4.13 4.81 16.50% 1024 6.17 6.73 9.05% 2048 14.54 15.47 6.45% 4096 25.44 27.87 9.52% Signed-off-by: Liang Chen Reviewed-by: Yunsheng Lin Suggested-by: Jason Wang Reviewed-by: Mina Almasry Signed-off-by: David S. Miller --- include/net/page_pool/helpers.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h index ead2c0d24b2c..1d397c1a0043 100644 --- a/include/net/page_pool/helpers.h +++ b/include/net/page_pool/helpers.h @@ -277,6 +277,11 @@ static inline long page_pool_unref_page(struct page *page, long nr) return ret; } +static inline void page_pool_ref_page(struct page *page) +{ + atomic_long_inc(&page->pp_ref_count); +} + static inline bool page_pool_is_last_ref(struct page *page) { /* If page_pool_unref_page() returns 0, we were the last user */ -- cgit v1.2.3 From ebb30ccbbdbd6fae5177b676da4f4ac92bb4f635 Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Fri, 15 Dec 2023 14:15:31 +0100 Subject: net: phy: make addr type u8 in phy_package_shared struct Switch addr type in phy_package_shared struct to u8. The value is already checked to be non negative and to be less than PHY_MAX_ADDR, hence u8 is better suited than using int. Signed-off-by: Christian Marangi Reviewed-by: Russell King (Oracle) Signed-off-by: David S. Miller --- include/linux/phy.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index dbb5e13e3e1b..4b13cc85c4f5 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -338,7 +338,7 @@ struct mdio_bus_stats { * phy_package_leave(). */ struct phy_package_shared { - int addr; + u8 addr; refcount_t refcnt; unsigned long flags; size_t priv_size; -- cgit v1.2.3 From 9eea577eb1155fe4a183bc5e7bf269b0b2e7a6ba Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Fri, 15 Dec 2023 14:15:32 +0100 Subject: net: phy: extend PHY package API to support multiple global address Current API for PHY package are limited to single address to configure global settings for the PHY package. It was found that some PHY package (for example the qca807x, a PHY package that is shipped with a bundle of 5 PHY) requires multiple PHY address to configure global settings. An example scenario is a PHY that have a dedicated PHY for PSGMII/serdes calibrarion and have a specific PHY in the package where the global PHY mode is set and affects every other PHY in the package. Change the API in the following way: - Change phy_package_join() to take the base addr of the PHY package instead of the global PHY addr. - Make __/phy_package_write/read() require an additional arg that select what global PHY address to use by passing the offset from the base addr passed on phy_package_join(). Each user of this API is updated to follow this new implementation following a pattern where an enum is defined to declare the offset of the addr. We also drop the check if shared is defined as any user of the phy_package_read/write is expected to use phy_package_join first. Misuse of this will correctly trigger a kernel panic for NULL pointer exception. Signed-off-by: Christian Marangi Signed-off-by: David S. Miller --- include/linux/phy.h | 64 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 42 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 4b13cc85c4f5..d653f660c39d 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -327,7 +327,8 @@ struct mdio_bus_stats { /** * struct phy_package_shared - Shared information in PHY packages - * @addr: Common PHY address used to combine PHYs in one package + * @base_addr: Base PHY address of PHY package used to combine PHYs + * in one package and for offset calculation of phy_package_read/write * @refcnt: Number of PHYs connected to this shared data * @flags: Initialization of PHY package * @priv_size: Size of the shared private data @priv @@ -338,7 +339,7 @@ struct mdio_bus_stats { * phy_package_leave(). */ struct phy_package_shared { - u8 addr; + u8 base_addr; refcount_t refcnt; unsigned long flags; size_t priv_size; @@ -1976,10 +1977,10 @@ int phy_ethtool_get_link_ksettings(struct net_device *ndev, int phy_ethtool_set_link_ksettings(struct net_device *ndev, const struct ethtool_link_ksettings *cmd); int phy_ethtool_nway_reset(struct net_device *ndev); -int phy_package_join(struct phy_device *phydev, int addr, size_t priv_size); +int phy_package_join(struct phy_device *phydev, int base_addr, size_t priv_size); void phy_package_leave(struct phy_device *phydev); int devm_phy_package_join(struct device *dev, struct phy_device *phydev, - int addr, size_t priv_size); + int base_addr, size_t priv_size); int __init mdio_bus_init(void); void mdio_bus_exit(void); @@ -2002,46 +2003,65 @@ int __phy_hwtstamp_set(struct phy_device *phydev, struct kernel_hwtstamp_config *config, struct netlink_ext_ack *extack); -static inline int phy_package_read(struct phy_device *phydev, u32 regnum) +static inline int phy_package_address(struct phy_device *phydev, + unsigned int addr_offset) { struct phy_package_shared *shared = phydev->shared; + u8 base_addr = shared->base_addr; - if (!shared) + if (addr_offset >= PHY_MAX_ADDR - base_addr) return -EIO; - return mdiobus_read(phydev->mdio.bus, shared->addr, regnum); + /* we know that addr will be in the range 0..31 and thus the + * implicit cast to a signed int is not a problem. + */ + return base_addr + addr_offset; } -static inline int __phy_package_read(struct phy_device *phydev, u32 regnum) +static inline int phy_package_read(struct phy_device *phydev, + unsigned int addr_offset, u32 regnum) { - struct phy_package_shared *shared = phydev->shared; + int addr = phy_package_address(phydev, addr_offset); - if (!shared) - return -EIO; + if (addr < 0) + return addr; + + return mdiobus_read(phydev->mdio.bus, addr, regnum); +} + +static inline int __phy_package_read(struct phy_device *phydev, + unsigned int addr_offset, u32 regnum) +{ + int addr = phy_package_address(phydev, addr_offset); + + if (addr < 0) + return addr; - return __mdiobus_read(phydev->mdio.bus, shared->addr, regnum); + return __mdiobus_read(phydev->mdio.bus, addr, regnum); } static inline int phy_package_write(struct phy_device *phydev, - u32 regnum, u16 val) + unsigned int addr_offset, u32 regnum, + u16 val) { - struct phy_package_shared *shared = phydev->shared; + int addr = phy_package_address(phydev, addr_offset); - if (!shared) - return -EIO; + if (addr < 0) + return addr; - return mdiobus_write(phydev->mdio.bus, shared->addr, regnum, val); + return mdiobus_write(phydev->mdio.bus, addr, regnum, val); } static inline int __phy_package_write(struct phy_device *phydev, - u32 regnum, u16 val) + unsigned int addr_offset, u32 regnum, + u16 val) { - struct phy_package_shared *shared = phydev->shared; + int addr = phy_package_address(phydev, addr_offset); - if (!shared) - return -EIO; + if (addr < 0) + return addr; - return __mdiobus_write(phydev->mdio.bus, shared->addr, regnum, val); + return __mdiobus_write(phydev->mdio.bus, addr, regnum, val); } static inline bool __phy_package_set_once(struct phy_device *phydev, -- cgit v1.2.3 From d63710fc0f1a501fd75a7025e3070a96ffa1645f Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Fri, 15 Dec 2023 14:15:34 +0100 Subject: net: phy: add support for PHY package MMD read/write Some PHY in PHY package may require to read/write MMD regs to correctly configure the PHY package. Add support for these additional required function in both lock and no lock variant. It's assumed that the entire PHY package is either C22 or C45. We use C22 or C45 way of writing/reading to mmd regs based on the passed phydev whether it's C22 or C45. Signed-off-by: Christian Marangi Signed-off-by: David S. Miller --- include/linux/phy.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index d653f660c39d..e9e85d347587 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -2064,6 +2064,22 @@ static inline int __phy_package_write(struct phy_device *phydev, return __mdiobus_write(phydev->mdio.bus, addr, regnum, val); } +int __phy_package_read_mmd(struct phy_device *phydev, + unsigned int addr_offset, int devad, + u32 regnum); + +int phy_package_read_mmd(struct phy_device *phydev, + unsigned int addr_offset, int devad, + u32 regnum); + +int __phy_package_write_mmd(struct phy_device *phydev, + unsigned int addr_offset, int devad, + u32 regnum, u16 val); + +int phy_package_write_mmd(struct phy_device *phydev, + unsigned int addr_offset, int devad, + u32 regnum, u16 val); + static inline bool __phy_package_set_once(struct phy_device *phydev, unsigned int b) { -- cgit v1.2.3 From 32da0f00ddcb101730cf242289b2b10ede0e1156 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Fri, 15 Dec 2023 14:57:10 -0300 Subject: net: rtnl: introduce rcu_replace_pointer_rtnl Introduce the rcu_replace_pointer_rtnl helper to lockdep check rtnl lock rcu replacements, alongside the already existing helpers. This is a quality of life helper so instead of using: rcu_replace_pointer(rp, p, lockdep_rtnl_is_held()) .. or the open coded.. rtnl_dereference() / rcu_assign_pointer() .. or the lazy check version .. rcu_replace_pointer(rp, p, 1) Use: rcu_replace_pointer_rtnl(rp, p) Signed-off-by: Jamal Hadi Salim Signed-off-by: Victor Nogueira Signed-off-by: Pedro Tammela Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 6a8543b34e2c..410529fca18b 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -79,6 +79,18 @@ static inline bool lockdep_rtnl_is_held(void) #define rtnl_dereference(p) \ rcu_dereference_protected(p, lockdep_rtnl_is_held()) +/** + * rcu_replace_pointer_rtnl - replace an RCU pointer under rtnl_lock, returning + * its old value + * @rp: RCU pointer, whose value is returned + * @p: regular pointer + * + * Perform a replacement under rtnl_lock, where @rp is an RCU-annotated + * pointer. The old value of @rp is returned, and @rp is set to @p + */ +#define rcu_replace_pointer_rtnl(rp, p) \ + rcu_replace_pointer(rp, p, lockdep_rtnl_is_held()) + static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev) { return rtnl_dereference(dev->ingress_queue); -- cgit v1.2.3 From 3314f2097dee43defc20554f961a8b17f4787e2d Mon Sep 17 00:00:00 2001 From: Jesse Brandeburg Date: Tue, 5 Dec 2023 17:01:01 -0800 Subject: intel: add bit macro includes where needed This series is introducing the use of FIELD_GET and FIELD_PREP which requires bitfield.h to be included. Fix all the includes in this one change, and rearrange includes into alphabetical order to ease readability and future maintenance. virtchnl.h and it's usage was modified to have it's own includes as it should. This required including bits.h for virtchnl.h. Reviewed-by: Marcin Szycik Signed-off-by: Jesse Brandeburg Signed-off-by: Tony Nguyen --- include/linux/avf/virtchnl.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h index a44d9dc7e3eb..8e177b67e82f 100644 --- a/include/linux/avf/virtchnl.h +++ b/include/linux/avf/virtchnl.h @@ -5,6 +5,7 @@ #define _VIRTCHNL_H_ #include +#include #include #include -- cgit v1.2.3 From a731132424adeda4d5383ef61afae2e804063fb7 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sat, 16 Dec 2023 13:29:57 +0100 Subject: genetlink: introduce per-sock family private storage Introduce an xarray for Generic netlink family to store per-socket private. Initialize this xarray only if family uses per-socket privs. Introduce genl_sk_priv_get() to get the socket priv pointer for a family and initialize it in case it does not exist. Introduce __genl_sk_priv_get() to obtain socket priv pointer for a family under RCU read lock. Allow family to specify the priv size, init() and destroy() callbacks. Signed-off-by: Jiri Pirko Signed-off-by: Paolo Abeni --- include/net/genetlink.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index c53244f20437..6bc37f392a9a 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -51,6 +51,9 @@ struct genl_info; * @split_ops: the split do/dump form of operation definition * @n_split_ops: number of entries in @split_ops, not that with split do/dump * ops the number of entries is not the same as number of commands + * @sock_priv_size: the size of per-socket private memory + * @sock_priv_init: the per-socket private memory initializer + * @sock_priv_destroy: the per-socket private memory destructor * * Attribute policies (the combination of @policy and @maxattr fields) * can be attached at the family level or at the operation level. @@ -84,11 +87,17 @@ struct genl_family { const struct genl_multicast_group *mcgrps; struct module *module; + size_t sock_priv_size; + void (*sock_priv_init)(void *priv); + void (*sock_priv_destroy)(void *priv); + /* private: internal use only */ /* protocol family identifier */ int id; /* starting number of multicast group IDs in this family */ unsigned int mcgrp_offset; + /* list of per-socket privs */ + struct xarray *sock_privs; }; /** @@ -298,6 +307,8 @@ static inline bool genl_info_is_ntf(const struct genl_info *info) return !info->nlhdr; } +void *__genl_sk_priv_get(struct genl_family *family, struct sock *sk); +void *genl_sk_priv_get(struct genl_family *family, struct sock *sk); int genl_register_family(struct genl_family *family); int genl_unregister_family(const struct genl_family *family); void genl_notify(const struct genl_family *family, struct sk_buff *skb, -- cgit v1.2.3 From 403863e985e8eba608d53b2907caaf37b6176290 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sat, 16 Dec 2023 13:29:58 +0100 Subject: netlink: introduce typedef for filter function Make the code using filter function a bit nicer by consolidating the filter function arguments using typedef. Suggested-by: Andy Shevchenko Signed-off-by: Jiri Pirko Signed-off-by: Paolo Abeni --- include/linux/connector.h | 3 +-- include/linux/netlink.h | 6 ++++-- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/connector.h b/include/linux/connector.h index cec2d99ae902..70bc1160f3d8 100644 --- a/include/linux/connector.h +++ b/include/linux/connector.h @@ -100,8 +100,7 @@ void cn_del_callback(const struct cb_id *id); */ int cn_netlink_send_mult(struct cn_msg *msg, u16 len, u32 portid, u32 group, gfp_t gfp_mask, - int (*filter)(struct sock *dsk, struct sk_buff *skb, - void *data), + netlink_filter_fn filter, void *filter_data); /** diff --git a/include/linux/netlink.h b/include/linux/netlink.h index abe91ed6b9aa..1a4445bf2ab9 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -228,10 +228,12 @@ bool netlink_strict_get_check(struct sk_buff *skb); int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 portid, int nonblock); int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 portid, __u32 group, gfp_t allocation); + +typedef int (*netlink_filter_fn)(struct sock *dsk, struct sk_buff *skb, void *data); + int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, __u32 portid, __u32 group, gfp_t allocation, - int (*filter)(struct sock *dsk, - struct sk_buff *skb, void *data), + netlink_filter_fn filter, void *filter_data); int netlink_set_err(struct sock *ssk, __u32 portid, __u32 group, int code); int netlink_register_notifier(struct notifier_block *nb); -- cgit v1.2.3 From 971b4ad88293bef00160e1d38659077fe3a93af6 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sat, 16 Dec 2023 13:29:59 +0100 Subject: genetlink: introduce helpers to do filtered multicast Currently it is possible for netlink kernel user to pass custom filter function to broadcast send function netlink_broadcast_filtered(). However, this is not exposed to multicast send and to generic netlink users. Extend the api and introduce a netlink helper nlmsg_multicast_filtered() and a generic netlink helper genlmsg_multicast_netns_filtered() to allow generic netlink families to specify filter function while sending multicast messages. Signed-off-by: Jiri Pirko Signed-off-by: Paolo Abeni --- include/net/genetlink.h | 35 +++++++++++++++++++++++++++++++---- include/net/netlink.h | 31 +++++++++++++++++++++++++++---- 2 files changed, 58 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 6bc37f392a9a..85c63d4f16dd 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -448,6 +448,35 @@ static inline void genlmsg_cancel(struct sk_buff *skb, void *hdr) nlmsg_cancel(skb, hdr - GENL_HDRLEN - NLMSG_HDRLEN); } +/** + * genlmsg_multicast_netns_filtered - multicast a netlink message + * to a specific netns with filter + * function + * @family: the generic netlink family + * @net: the net namespace + * @skb: netlink message as socket buffer + * @portid: own netlink portid to avoid sending to yourself + * @group: offset of multicast group in groups array + * @flags: allocation flags + * @filter: filter function + * @filter_data: filter function private data + * + * Return: 0 on success, negative error code for failure. + */ +static inline int +genlmsg_multicast_netns_filtered(const struct genl_family *family, + struct net *net, struct sk_buff *skb, + u32 portid, unsigned int group, gfp_t flags, + netlink_filter_fn filter, + void *filter_data) +{ + if (WARN_ON_ONCE(group >= family->n_mcgrps)) + return -EINVAL; + group = family->mcgrp_offset + group; + return nlmsg_multicast_filtered(net->genl_sock, skb, portid, group, + flags, filter, filter_data); +} + /** * genlmsg_multicast_netns - multicast a netlink message to a specific netns * @family: the generic netlink family @@ -461,10 +490,8 @@ static inline int genlmsg_multicast_netns(const struct genl_family *family, struct net *net, struct sk_buff *skb, u32 portid, unsigned int group, gfp_t flags) { - if (WARN_ON_ONCE(group >= family->n_mcgrps)) - return -EINVAL; - group = family->mcgrp_offset + group; - return nlmsg_multicast(net->genl_sock, skb, portid, group, flags); + return genlmsg_multicast_netns_filtered(family, net, skb, portid, + group, flags, NULL, NULL); } /** diff --git a/include/net/netlink.h b/include/net/netlink.h index 28039e57070a..c19ff921b661 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -1087,27 +1087,50 @@ static inline void nlmsg_free(struct sk_buff *skb) } /** - * nlmsg_multicast - multicast a netlink message + * nlmsg_multicast_filtered - multicast a netlink message with filter function * @sk: netlink socket to spread messages to * @skb: netlink message as socket buffer * @portid: own netlink portid to avoid sending to yourself * @group: multicast group id * @flags: allocation flags + * @filter: filter function + * @filter_data: filter function private data + * + * Return: 0 on success, negative error code for failure. */ -static inline int nlmsg_multicast(struct sock *sk, struct sk_buff *skb, - u32 portid, unsigned int group, gfp_t flags) +static inline int nlmsg_multicast_filtered(struct sock *sk, struct sk_buff *skb, + u32 portid, unsigned int group, + gfp_t flags, + netlink_filter_fn filter, + void *filter_data) { int err; NETLINK_CB(skb).dst_group = group; - err = netlink_broadcast(sk, skb, portid, group, flags); + err = netlink_broadcast_filtered(sk, skb, portid, group, flags, + filter, filter_data); if (err > 0) err = 0; return err; } +/** + * nlmsg_multicast - multicast a netlink message + * @sk: netlink socket to spread messages to + * @skb: netlink message as socket buffer + * @portid: own netlink portid to avoid sending to yourself + * @group: multicast group id + * @flags: allocation flags + */ +static inline int nlmsg_multicast(struct sock *sk, struct sk_buff *skb, + u32 portid, unsigned int group, gfp_t flags) +{ + return nlmsg_multicast_filtered(sk, skb, portid, group, flags, + NULL, NULL); +} + /** * nlmsg_unicast - unicast a netlink message * @sk: netlink socket to spread message to -- cgit v1.2.3 From 13b127d2578432e1e521310b69944c5a1b30679c Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sat, 16 Dec 2023 13:30:00 +0100 Subject: devlink: add a command to set notification filter and use it for multicasts Currently the user listening on a socket for devlink notifications gets always all messages for all existing instances, even if he is interested only in one of those. That may cause unnecessary overhead on setups with thousands of instances present. User is currently able to narrow down the devlink objects replies to dump commands by specifying select attributes. Allow similar approach for notifications. Introduce a new devlink NOTIFY_FILTER_SET which the user passes the select attributes. Store these per-socket and use them for filtering messages during multicast send. Signed-off-by: Jiri Pirko Signed-off-by: Paolo Abeni --- include/uapi/linux/devlink.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index b3c8383d342d..130cae0d3e20 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -139,6 +139,8 @@ enum devlink_command { DEVLINK_CMD_SELFTESTS_GET, /* can dump */ DEVLINK_CMD_SELFTESTS_RUN, + DEVLINK_CMD_NOTIFY_FILTER_SET, + /* add new commands above here */ __DEVLINK_CMD_MAX, DEVLINK_CMD_MAX = __DEVLINK_CMD_MAX - 1 -- cgit v1.2.3 From d17aff807f845cf93926c28705216639c7279110 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 19 Dec 2023 07:37:35 -0800 Subject: Revert BPF token-related functionality This patch includes the following revert (one conflicting BPF FS patch and three token patch sets, represented by merge commits): - revert 0f5d5454c723 "Merge branch 'bpf-fs-mount-options-parsing-follow-ups'"; - revert 750e785796bb "bpf: Support uid and gid when mounting bpffs"; - revert 733763285acf "Merge branch 'bpf-token-support-in-libbpf-s-bpf-object'"; - revert c35919dcce28 "Merge branch 'bpf-token-and-bpf-fs-based-delegation'". Link: https://lore.kernel.org/bpf/CAHk-=wg7JuFYwGy=GOMbRCtOL+jwSQsdUaBsRWkDVYbxipbM5A@mail.gmail.com Signed-off-by: Andrii Nakryiko --- include/linux/bpf.h | 85 +++++-------------------------------------- include/linux/filter.h | 2 +- include/linux/lsm_hook_defs.h | 15 ++------ include/linux/security.h | 43 ++++------------------ include/uapi/linux/bpf.h | 42 --------------------- 5 files changed, 22 insertions(+), 165 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2f54cc0436c4..7a8d4c81a39a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -52,10 +52,6 @@ struct module; struct bpf_func_state; struct ftrace_ops; struct cgroup; -struct bpf_token; -struct user_namespace; -struct super_block; -struct inode; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; @@ -1488,7 +1484,6 @@ struct bpf_prog_aux { #ifdef CONFIG_SECURITY void *security; #endif - struct bpf_token *token; struct bpf_prog_offload *offload; struct btf *btf; struct bpf_func_info *func_info; @@ -1613,31 +1608,6 @@ struct bpf_link_primer { u32 id; }; -struct bpf_mount_opts { - kuid_t uid; - kgid_t gid; - umode_t mode; - - /* BPF token-related delegation options */ - u64 delegate_cmds; - u64 delegate_maps; - u64 delegate_progs; - u64 delegate_attachs; -}; - -struct bpf_token { - struct work_struct work; - atomic64_t refcnt; - struct user_namespace *userns; - u64 allowed_cmds; - u64 allowed_maps; - u64 allowed_progs; - u64 allowed_attachs; -#ifdef CONFIG_SECURITY - void *security; -#endif -}; - struct bpf_struct_ops_value; struct btf_member; @@ -2097,7 +2067,6 @@ static inline void bpf_enable_instrumentation(void) migrate_enable(); } -extern const struct super_operations bpf_super_ops; extern const struct file_operations bpf_map_fops; extern const struct file_operations bpf_prog_fops; extern const struct file_operations bpf_iter_fops; @@ -2232,26 +2201,24 @@ static inline void bpf_map_dec_elem_count(struct bpf_map *map) extern int sysctl_unprivileged_bpf_disabled; -bool bpf_token_capable(const struct bpf_token *token, int cap); - -static inline bool bpf_allow_ptr_leaks(const struct bpf_token *token) +static inline bool bpf_allow_ptr_leaks(void) { - return bpf_token_capable(token, CAP_PERFMON); + return perfmon_capable(); } -static inline bool bpf_allow_uninit_stack(const struct bpf_token *token) +static inline bool bpf_allow_uninit_stack(void) { - return bpf_token_capable(token, CAP_PERFMON); + return perfmon_capable(); } -static inline bool bpf_bypass_spec_v1(const struct bpf_token *token) +static inline bool bpf_bypass_spec_v1(void) { - return cpu_mitigations_off() || bpf_token_capable(token, CAP_PERFMON); + return cpu_mitigations_off() || perfmon_capable(); } -static inline bool bpf_bypass_spec_v4(const struct bpf_token *token) +static inline bool bpf_bypass_spec_v4(void) { - return cpu_mitigations_off() || bpf_token_capable(token, CAP_PERFMON); + return cpu_mitigations_off() || perfmon_capable(); } int bpf_map_new_fd(struct bpf_map *map, int flags); @@ -2268,21 +2235,8 @@ int bpf_link_new_fd(struct bpf_link *link); struct bpf_link *bpf_link_get_from_fd(u32 ufd); struct bpf_link *bpf_link_get_curr_or_next(u32 *id); -void bpf_token_inc(struct bpf_token *token); -void bpf_token_put(struct bpf_token *token); -int bpf_token_create(union bpf_attr *attr); -struct bpf_token *bpf_token_get_from_fd(u32 ufd); - -bool bpf_token_allow_cmd(const struct bpf_token *token, enum bpf_cmd cmd); -bool bpf_token_allow_map_type(const struct bpf_token *token, enum bpf_map_type type); -bool bpf_token_allow_prog_type(const struct bpf_token *token, - enum bpf_prog_type prog_type, - enum bpf_attach_type attach_type); - int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname); int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags); -struct inode *bpf_get_inode(struct super_block *sb, const struct inode *dir, - umode_t mode); #define BPF_ITER_FUNC_PREFIX "bpf_iter_" #define DEFINE_BPF_ITER_FUNC(target, args...) \ @@ -2526,8 +2480,7 @@ const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type struct bpf_prog *bpf_prog_by_id(u32 id); struct bpf_link *bpf_link_by_id(u32 id); -const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id, - const struct bpf_prog *prog); +const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id); void bpf_task_storage_free(struct task_struct *task); void bpf_cgrp_storage_free(struct cgroup *cgroup); bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog); @@ -2646,24 +2599,6 @@ static inline int bpf_obj_get_user(const char __user *pathname, int flags) return -EOPNOTSUPP; } -static inline bool bpf_token_capable(const struct bpf_token *token, int cap) -{ - return capable(cap) || (cap != CAP_SYS_ADMIN && capable(CAP_SYS_ADMIN)); -} - -static inline void bpf_token_inc(struct bpf_token *token) -{ -} - -static inline void bpf_token_put(struct bpf_token *token) -{ -} - -static inline struct bpf_token *bpf_token_get_from_fd(u32 ufd) -{ - return ERR_PTR(-EOPNOTSUPP); -} - static inline void __dev_flush(void) { } @@ -2787,7 +2722,7 @@ static inline int btf_struct_access(struct bpf_verifier_log *log, } static inline const struct bpf_func_proto * -bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +bpf_base_func_proto(enum bpf_func_id func_id) { return NULL; } diff --git a/include/linux/filter.h b/include/linux/filter.h index 12d907f17d36..68fb6c8142fe 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1139,7 +1139,7 @@ static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog) return false; if (!bpf_jit_harden) return false; - if (bpf_jit_harden == 1 && bpf_token_capable(prog->aux->token, CAP_BPF)) + if (bpf_jit_harden == 1 && bpf_capable()) return false; return true; diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 3fdd00b452ac..ff217a5ce552 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -398,17 +398,10 @@ LSM_HOOK(void, LSM_RET_VOID, audit_rule_free, void *lsmrule) LSM_HOOK(int, 0, bpf, int cmd, union bpf_attr *attr, unsigned int size) LSM_HOOK(int, 0, bpf_map, struct bpf_map *map, fmode_t fmode) LSM_HOOK(int, 0, bpf_prog, struct bpf_prog *prog) -LSM_HOOK(int, 0, bpf_map_create, struct bpf_map *map, union bpf_attr *attr, - struct bpf_token *token) -LSM_HOOK(void, LSM_RET_VOID, bpf_map_free, struct bpf_map *map) -LSM_HOOK(int, 0, bpf_prog_load, struct bpf_prog *prog, union bpf_attr *attr, - struct bpf_token *token) -LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free, struct bpf_prog *prog) -LSM_HOOK(int, 0, bpf_token_create, struct bpf_token *token, union bpf_attr *attr, - struct path *path) -LSM_HOOK(void, LSM_RET_VOID, bpf_token_free, struct bpf_token *token) -LSM_HOOK(int, 0, bpf_token_cmd, const struct bpf_token *token, enum bpf_cmd cmd) -LSM_HOOK(int, 0, bpf_token_capable, const struct bpf_token *token, int cap) +LSM_HOOK(int, 0, bpf_map_alloc_security, struct bpf_map *map) +LSM_HOOK(void, LSM_RET_VOID, bpf_map_free_security, struct bpf_map *map) +LSM_HOOK(int, 0, bpf_prog_alloc_security, struct bpf_prog_aux *aux) +LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free_security, struct bpf_prog_aux *aux) #endif /* CONFIG_BPF_SYSCALL */ LSM_HOOK(int, 0, locked_down, enum lockdown_reason what) diff --git a/include/linux/security.h b/include/linux/security.h index 00809d2d5c38..1d1df326c881 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -32,7 +32,6 @@ #include #include #include -#include struct linux_binprm; struct cred; @@ -2021,22 +2020,15 @@ static inline void securityfs_remove(struct dentry *dentry) union bpf_attr; struct bpf_map; struct bpf_prog; -struct bpf_token; +struct bpf_prog_aux; #ifdef CONFIG_SECURITY extern int security_bpf(int cmd, union bpf_attr *attr, unsigned int size); extern int security_bpf_map(struct bpf_map *map, fmode_t fmode); extern int security_bpf_prog(struct bpf_prog *prog); -extern int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr, - struct bpf_token *token); +extern int security_bpf_map_alloc(struct bpf_map *map); extern void security_bpf_map_free(struct bpf_map *map); -extern int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, - struct bpf_token *token); -extern void security_bpf_prog_free(struct bpf_prog *prog); -extern int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr, - struct path *path); -extern void security_bpf_token_free(struct bpf_token *token); -extern int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cmd); -extern int security_bpf_token_capable(const struct bpf_token *token, int cap); +extern int security_bpf_prog_alloc(struct bpf_prog_aux *aux); +extern void security_bpf_prog_free(struct bpf_prog_aux *aux); #else static inline int security_bpf(int cmd, union bpf_attr *attr, unsigned int size) @@ -2054,8 +2046,7 @@ static inline int security_bpf_prog(struct bpf_prog *prog) return 0; } -static inline int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr, - struct bpf_token *token) +static inline int security_bpf_map_alloc(struct bpf_map *map) { return 0; } @@ -2063,33 +2054,13 @@ static inline int security_bpf_map_create(struct bpf_map *map, union bpf_attr *a static inline void security_bpf_map_free(struct bpf_map *map) { } -static inline int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, - struct bpf_token *token) +static inline int security_bpf_prog_alloc(struct bpf_prog_aux *aux) { return 0; } -static inline void security_bpf_prog_free(struct bpf_prog *prog) +static inline void security_bpf_prog_free(struct bpf_prog_aux *aux) { } - -static inline int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr, - struct path *path) -{ - return 0; -} - -static inline void security_bpf_token_free(struct bpf_token *token) -{ } - -static inline int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cmd) -{ - return 0; -} - -static inline int security_bpf_token_capable(const struct bpf_token *token, int cap) -{ - return 0; -} #endif /* CONFIG_SECURITY */ #endif /* CONFIG_BPF_SYSCALL */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 42f4d3090efe..754e68ca8744 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -847,36 +847,6 @@ union bpf_iter_link_info { * Returns zero on success. On error, -1 is returned and *errno* * is set appropriately. * - * BPF_TOKEN_CREATE - * Description - * Create BPF token with embedded information about what - * BPF-related functionality it allows: - * - a set of allowed bpf() syscall commands; - * - a set of allowed BPF map types to be created with - * BPF_MAP_CREATE command, if BPF_MAP_CREATE itself is allowed; - * - a set of allowed BPF program types and BPF program attach - * types to be loaded with BPF_PROG_LOAD command, if - * BPF_PROG_LOAD itself is allowed. - * - * BPF token is created (derived) from an instance of BPF FS, - * assuming it has necessary delegation mount options specified. - * This BPF token can be passed as an extra parameter to various - * bpf() syscall commands to grant BPF subsystem functionality to - * unprivileged processes. - * - * When created, BPF token is "associated" with the owning - * user namespace of BPF FS instance (super block) that it was - * derived from, and subsequent BPF operations performed with - * BPF token would be performing capabilities checks (i.e., - * CAP_BPF, CAP_PERFMON, CAP_NET_ADMIN, CAP_SYS_ADMIN) within - * that user namespace. Without BPF token, such capabilities - * have to be granted in init user namespace, making bpf() - * syscall incompatible with user namespace, for the most part. - * - * Return - * A new file descriptor (a nonnegative integer), or -1 if an - * error occurred (in which case, *errno* is set appropriately). - * * NOTES * eBPF objects (maps and programs) can be shared between processes. * @@ -931,8 +901,6 @@ enum bpf_cmd { BPF_ITER_CREATE, BPF_LINK_DETACH, BPF_PROG_BIND_MAP, - BPF_TOKEN_CREATE, - __MAX_BPF_CMD, }; enum bpf_map_type { @@ -983,7 +951,6 @@ enum bpf_map_type { BPF_MAP_TYPE_BLOOM_FILTER, BPF_MAP_TYPE_USER_RINGBUF, BPF_MAP_TYPE_CGRP_STORAGE, - __MAX_BPF_MAP_TYPE }; /* Note that tracing related programs such as @@ -1028,7 +995,6 @@ enum bpf_prog_type { BPF_PROG_TYPE_SK_LOOKUP, BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */ BPF_PROG_TYPE_NETFILTER, - __MAX_BPF_PROG_TYPE }; enum bpf_attach_type { @@ -1437,7 +1403,6 @@ union bpf_attr { * to using 5 hash functions). */ __u64 map_extra; - __u32 map_token_fd; }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ @@ -1507,7 +1472,6 @@ union bpf_attr { * truncated), or smaller (if log buffer wasn't filled completely). */ __u32 log_true_size; - __u32 prog_token_fd; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ @@ -1620,7 +1584,6 @@ union bpf_attr { * truncated), or smaller (if log buffer wasn't filled completely). */ __u32 btf_log_true_size; - __u32 btf_token_fd; }; struct { @@ -1751,11 +1714,6 @@ union bpf_attr { __u32 flags; /* extra flags */ } prog_bind_map; - struct { /* struct used by BPF_TOKEN_CREATE command */ - __u32 flags; - __u32 bpffs_fd; - } token_create; - } __attribute__((aligned(8))); /* The description below is an attempt at providing documentation to eBPF -- cgit v1.2.3 From 1c1c2b37325934b4318c46966feb03ad9fe83d8a Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Fri, 15 Dec 2023 15:38:51 +0300 Subject: wifi: cfg80211: introduce cfg80211_ssid_eq() Since SSIDs comparison is commonly used across many drivers, introduce generic 'cfg80211_ssid_eq()' to replace driver-private implementations. Signed-off-by: Dmitry Antipov Link: https://msgid.link/20231215123859.196350-1-dmantipov@yandex.ru [fix kernel-doc return docs] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index ac1fb326dcda..033a0a25397f 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -7165,6 +7165,23 @@ enum cfg80211_bss_frame_type { int cfg80211_get_ies_channel_number(const u8 *ie, size_t ielen, enum nl80211_band band); +/** + * cfg80211_ssid_eq - compare two SSIDs + * @a: first SSID + * @b: second SSID + * + * Return: %true if SSIDs are equal, %false otherwise. + */ +static inline bool +cfg80211_ssid_eq(struct cfg80211_ssid *a, struct cfg80211_ssid *b) +{ + if (WARN_ON(!a || !b)) + return false; + if (a->ssid_len != b->ssid_len) + return false; + return memcmp(a->ssid, b->ssid, a->ssid_len) ? false : true; +} + /** * cfg80211_inform_bss_data - inform cfg80211 of a new BSS * -- cgit v1.2.3 From 4ba1d0f23414135e4f426dae4cb5cdc2ce246f89 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 14 Dec 2023 17:13:25 -0800 Subject: bpf: abstract away global subprog arg preparation logic from reg state setup btf_prepare_func_args() is used to understand expectations and restrictions on global subprog arguments. But current implementation is hard to extend, as it intermixes BTF-based func prototype parsing and interpretation logic with setting up register state at subprog entry. Worse still, those registers are not completely set up inside btf_prepare_func_args(), requiring some more logic later in do_check_common(). Like calling mark_reg_unknown() and similar initialization operations. This intermixing of BTF interpretation and register state setup is problematic. First, it causes duplication of BTF parsing logic for global subprog verification (to set up initial state of global subprog) and global subprog call sites analysis (when we need to check that whatever is being passed into global subprog matches expectations), performed in btf_check_subprog_call(). Given we want to extend global func argument with tags later, this duplication is problematic. So refactor btf_prepare_func_args() to do only BTF-based func proto and args parsing, returning high-level argument "expectations" only, with no regard to specifics of register state. I.e., if it's a context argument, instead of setting register state to PTR_TO_CTX, we return ARG_PTR_TO_CTX enum for that argument as "an argument specification" for further processing inside do_check_common(). Similarly for SCALAR arguments, PTR_TO_MEM, etc. This allows to reuse btf_prepare_func_args() in following patches at global subprog call site analysis time. It also keeps register setup code consistently in one place, do_check_common(). Besides all this, we cache this argument specs information inside env->subprog_info, eliminating the need to redo these potentially expensive BTF traversals, especially if BPF program's BTF is big and/or there are lots of global subprog calls. Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231215011334.2307144-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 +-- include/linux/bpf_verifier.h | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7a8d4c81a39a..c050c82cc9a5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2470,8 +2470,7 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs); int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs); -int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, - struct bpf_reg_state *reg, u32 *nargs); +int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog); int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog, struct btf *btf, const struct btf_type *t); const char *btf_find_decl_tag_value(const struct btf *btf, const struct btf_type *pt, diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c2819a6579a5..5742e9c0a7b8 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -606,6 +606,13 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) #define BPF_MAX_SUBPROGS 256 +struct bpf_subprog_arg_info { + enum bpf_arg_type arg_type; + union { + u32 mem_size; + }; +}; + struct bpf_subprog_info { /* 'start' has to be the first field otherwise find_subprog() won't work */ u32 start; /* insn idx of function entry point */ @@ -617,6 +624,10 @@ struct bpf_subprog_info { bool is_cb: 1; bool is_async_cb: 1; bool is_exception_cb: 1; + bool args_cached: 1; + + u8 arg_cnt; + struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS]; }; struct bpf_verifier_env; @@ -727,6 +738,11 @@ struct bpf_verifier_env { char tmp_str_buf[TMP_STR_BUF_LEN]; }; +static inline struct bpf_subprog_info *subprog_info(struct bpf_verifier_env *env, int subprog) +{ + return &env->subprog_info[subprog]; +} + __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, va_list args); __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, -- cgit v1.2.3 From 5eccd2db42d77e3570619c32d39e39bf486607cf Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 14 Dec 2023 17:13:26 -0800 Subject: bpf: reuse btf_prepare_func_args() check for main program BTF validation Instead of btf_check_subprog_arg_match(), use btf_prepare_func_args() logic to validate "trustworthiness" of main BPF program's BTF information, if it is present. We ignored results of original BTF check anyway, often times producing confusing and ominously-sounding "reg type unsupported for arg#0 function" message, which has no apparent effect on program correctness and verification process. All the -EFAULT returning sanity checks are already performed in check_btf_info_early(), so there is zero reason to have this duplication of logic between btf_check_subprog_call() and btf_check_subprog_arg_match(). Dropping btf_check_subprog_arg_match() simplifies btf_check_func_arg_match() further removing `bool processing_call` flag. One subtle bit that was done by btf_check_subprog_arg_match() was potentially marking main program's BTF as unreliable. We do this explicitly now with a dedicated simple check, preserving the original behavior, but now based on well factored btf_prepare_func_args() logic. Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231215011334.2307144-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c050c82cc9a5..d0d7eff22b8a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2466,8 +2466,6 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, struct btf_func_model *m); struct bpf_reg_state; -int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, - struct bpf_reg_state *regs); int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs); int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog); -- cgit v1.2.3 From e26080d0da87f20222ca6712b65f95a856fadee0 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 14 Dec 2023 17:13:27 -0800 Subject: bpf: prepare btf_prepare_func_args() for handling static subprogs Generalize btf_prepare_func_args() to support both global and static subprogs. We are going to utilize this property in the next patch, reusing btf_prepare_func_args() for subprog call logic instead of reparsing BTF information in a completely separate implementation. btf_prepare_func_args() now detects whether subprog is global or static makes slight logic adjustments for static func cases, like not failing fatally (-EFAULT) for conditions that are allowable for static subprogs. Somewhat subtle (but major!) difference is the handling of pointer arguments. Both global and static functions need to handle special context arguments (which are pointers to predefined type names), but static subprogs give up on any other pointers, falling back to marking subprog as "unreliable", disabling the use of BTF type information altogether. For global functions, though, we are assuming that such pointers to unrecognized types are just pointers to fixed-sized memory region (or error out if size cannot be established, like for `void *` pointers). This patch accommodates these small differences and sets up a stage for refactoring in the next patch, eliminating a separate BTF-based parsing logic in btf_check_func_arg_match(). Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231215011334.2307144-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 5742e9c0a7b8..d3ea9ef04767 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -738,6 +738,11 @@ struct bpf_verifier_env { char tmp_str_buf[TMP_STR_BUF_LEN]; }; +static inline struct bpf_func_info_aux *subprog_aux(struct bpf_verifier_env *env, int subprog) +{ + return &env->prog->aux->func_info_aux[subprog]; +} + static inline struct bpf_subprog_info *subprog_info(struct bpf_verifier_env *env, int subprog) { return &env->subprog_info[subprog]; -- cgit v1.2.3 From c5a7244759b1eeacc59d0426fb73859afa942d0d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 14 Dec 2023 17:13:28 -0800 Subject: bpf: move subprog call logic back to verifier.c Subprog call logic in btf_check_subprog_call() currently has both a lot of BTF parsing logic (which is, presumably, what justified putting it into btf.c), but also a bunch of register state checks, some of each utilize deep verifier logic helpers, necessarily exported from verifier.c: check_ptr_off_reg(), check_func_arg_reg_off(), and check_mem_reg(). Going forward, btf_check_subprog_call() will have a minimum of BTF-related logic, but will get more internal verifier logic related to register state manipulation. So move it into verifier.c to minimize amount of verifier-specific logic exposed to btf.c. We do this move before refactoring btf_check_func_arg_match() to preserve as much history post-refactoring as possible. No functional changes. Acked-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20231215011334.2307144-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 -- include/linux/bpf_verifier.h | 8 -------- 2 files changed, 10 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d0d7eff22b8a..7671530d6e4e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2466,8 +2466,6 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, struct btf_func_model *m); struct bpf_reg_state; -int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, - struct bpf_reg_state *regs); int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog); int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog, struct btf *btf, const struct btf_type *t); diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index d3ea9ef04767..d07d857ca67f 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -785,14 +785,6 @@ bpf_prog_offload_replace_insn(struct bpf_verifier_env *env, u32 off, void bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt); -int check_ptr_off_reg(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno); -int check_func_arg_reg_off(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno, - enum bpf_arg_type arg_type); -int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - u32 regno, u32 mem_size); - /* this lives here instead of in bpf.h because it needs to dereference tgt_prog */ static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog, struct btf *btf, u32 btf_id) -- cgit v1.2.3 From e37a11fca41864c9f652ff81296b82e6f65a4242 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 17 Dec 2023 10:32:36 +0200 Subject: bridge: add MDB state mask uAPI attribute Currently, the 'state' field in 'struct br_port_msg' can be set to 1 if the MDB entry is permanent or 0 if it is temporary. Additional states might be added in the future. In a similar fashion to 'NDA_NDM_STATE_MASK', add an MDB state mask uAPI attribute that will allow the upcoming bulk deletion API to bulk delete MDB entries with a certain state or any state. Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 2e23f99dc0f1..a5b743a2f775 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -757,6 +757,7 @@ enum { MDBE_ATTR_VNI, MDBE_ATTR_IFINDEX, MDBE_ATTR_SRC_VNI, + MDBE_ATTR_STATE_MASK, __MDBE_ATTR_MAX, }; #define MDBE_ATTR_MAX (__MDBE_ATTR_MAX - 1) -- cgit v1.2.3 From 1a36e0f50f963465e9b2b980d250ab38b8fcd7a3 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sun, 17 Dec 2023 10:32:38 +0200 Subject: net: Add MDB bulk deletion device operation Add MDB net device operation that will be invoked by rtnetlink code in response to received 'RTM_DELMDB' messages with the 'NLM_F_BULK' flag set. Subsequent patches will implement the operation in the bridge and VXLAN drivers. Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/netdevice.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1b935ee341b4..75c7725e5e4f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1329,6 +1329,9 @@ struct netdev_net_notifier { * int (*ndo_mdb_del)(struct net_device *dev, struct nlattr *tb[], * struct netlink_ext_ack *extack); * Deletes the MDB entry from dev. + * int (*ndo_mdb_del_bulk)(struct net_device *dev, struct nlattr *tb[], + * struct netlink_ext_ack *extack); + * Bulk deletes MDB entries from dev. * int (*ndo_mdb_dump)(struct net_device *dev, struct sk_buff *skb, * struct netlink_callback *cb); * Dumps MDB entries from dev. The first argument (marker) in the netlink @@ -1611,6 +1614,9 @@ struct net_device_ops { int (*ndo_mdb_del)(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack); + int (*ndo_mdb_del_bulk)(struct net_device *dev, + struct nlattr *tb[], + struct netlink_ext_ack *extack); int (*ndo_mdb_dump)(struct net_device *dev, struct sk_buff *skb, struct netlink_callback *cb); -- cgit v1.2.3 From fb2780721ca5e9f78bbe4544b819b929a982df9c Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Sat, 16 Dec 2023 17:44:34 -0300 Subject: net: sched: Move drop_reason to struct tc_skb_cb Move drop_reason from struct tcf_result to skb cb - more specifically to struct tc_skb_cb. With that, we'll be able to also set the drop reason for the remaining qdiscs (aside from clsact) that do not have access to tcf_result when time comes to set the skb drop reason. Signed-off-by: Victor Nogueira Acked-by: Daniel Borkmann Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 14 ++++++++++++-- include/net/pkt_sched.h | 3 ++- include/net/sch_generic.h | 1 - 3 files changed, 14 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index a76c9171db0e..761e4500cca0 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -154,10 +154,20 @@ __cls_set_class(unsigned long *clp, unsigned long cl) return xchg(clp, cl); } -static inline void tcf_set_drop_reason(struct tcf_result *res, +struct tc_skb_cb; + +static inline struct tc_skb_cb *tc_skb_cb(const struct sk_buff *skb); + +static inline enum skb_drop_reason +tcf_get_drop_reason(const struct sk_buff *skb) +{ + return tc_skb_cb(skb)->drop_reason; +} + +static inline void tcf_set_drop_reason(const struct sk_buff *skb, enum skb_drop_reason reason) { - res->drop_reason = reason; + tc_skb_cb(skb)->drop_reason = reason; } static inline void diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 9fa1d0794dfa..9b559aa5c079 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -277,12 +277,13 @@ static inline void skb_txtime_consumed(struct sk_buff *skb) struct tc_skb_cb { struct qdisc_skb_cb qdisc_cb; + u32 drop_reason; + u16 zone; /* Only valid if post_ct = true */ u16 mru; u8 post_ct:1; u8 post_ct_snat:1; u8 post_ct_dnat:1; - u16 zone; /* Only valid if post_ct = true */ }; static inline struct tc_skb_cb *tc_skb_cb(const struct sk_buff *skb) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index dcb9160e6467..c499b56bb215 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -332,7 +332,6 @@ struct tcf_result { }; const struct tcf_proto *goto_tp; }; - enum skb_drop_reason drop_reason; }; struct tcf_chain; -- cgit v1.2.3 From b6a3c6066afc2cb7b92f45c67ab0b12ded81cb11 Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Sat, 16 Dec 2023 17:44:35 -0300 Subject: net: sched: Make tc-related drop reason more flexible for remaining qdiscs Incrementing on Daniel's patch[1], make tc-related drop reason more flexible for remaining qdiscs - that is, all qdiscs aside from clsact. In essence, the drop reason will be set by cls_api and act_api in case any error occurred in the data path. With that, we can give the user more detailed information so that they can distinguish between a policy drop or an error drop. [1] https://lore.kernel.org/all/20231009092655.22025-1-daniel@iogearbox.net Signed-off-by: Victor Nogueira Acked-by: Daniel Borkmann Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 16 ---------------- include/net/pkt_sched.h | 19 ------------------- include/net/sch_generic.h | 31 +++++++++++++++++++++++++++++++ 3 files changed, 31 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 761e4500cca0..f308e8268651 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -154,22 +154,6 @@ __cls_set_class(unsigned long *clp, unsigned long cl) return xchg(clp, cl); } -struct tc_skb_cb; - -static inline struct tc_skb_cb *tc_skb_cb(const struct sk_buff *skb); - -static inline enum skb_drop_reason -tcf_get_drop_reason(const struct sk_buff *skb) -{ - return tc_skb_cb(skb)->drop_reason; -} - -static inline void tcf_set_drop_reason(const struct sk_buff *skb, - enum skb_drop_reason reason) -{ - tc_skb_cb(skb)->drop_reason = reason; -} - static inline void __tcf_bind_filter(struct Qdisc *q, struct tcf_result *r, unsigned long base) { diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 9b559aa5c079..1e200d9a066d 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -275,25 +275,6 @@ static inline void skb_txtime_consumed(struct sk_buff *skb) skb->tstamp = ktime_set(0, 0); } -struct tc_skb_cb { - struct qdisc_skb_cb qdisc_cb; - u32 drop_reason; - - u16 zone; /* Only valid if post_ct = true */ - u16 mru; - u8 post_ct:1; - u8 post_ct_snat:1; - u8 post_ct_dnat:1; -}; - -static inline struct tc_skb_cb *tc_skb_cb(const struct sk_buff *skb) -{ - struct tc_skb_cb *cb = (struct tc_skb_cb *)skb->cb; - - BUILD_BUG_ON(sizeof(*cb) > sizeof_field(struct sk_buff, cb)); - return cb; -} - static inline bool tc_qdisc_stats_dump(struct Qdisc *sch, unsigned long cl, struct qdisc_walker *arg) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index c499b56bb215..1d70c2c1572f 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -1036,6 +1036,37 @@ static inline struct sk_buff *qdisc_dequeue_head(struct Qdisc *sch) return skb; } +struct tc_skb_cb { + struct qdisc_skb_cb qdisc_cb; + u32 drop_reason; + + u16 zone; /* Only valid if post_ct = true */ + u16 mru; + u8 post_ct:1; + u8 post_ct_snat:1; + u8 post_ct_dnat:1; +}; + +static inline struct tc_skb_cb *tc_skb_cb(const struct sk_buff *skb) +{ + struct tc_skb_cb *cb = (struct tc_skb_cb *)skb->cb; + + BUILD_BUG_ON(sizeof(*cb) > sizeof_field(struct sk_buff, cb)); + return cb; +} + +static inline enum skb_drop_reason +tcf_get_drop_reason(const struct sk_buff *skb) +{ + return tc_skb_cb(skb)->drop_reason; +} + +static inline void tcf_set_drop_reason(const struct sk_buff *skb, + enum skb_drop_reason reason) +{ + tc_skb_cb(skb)->drop_reason = reason; +} + /* Instead of calling kfree_skb() while root qdisc lock is held, * queue the skb for future freeing at end of __dev_xmit_skb() */ -- cgit v1.2.3 From 4cf24dc8934074725042c0bd10b91f4d4b5269bb Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Sat, 16 Dec 2023 17:44:36 -0300 Subject: net: sched: Add initial TC error skb drop reasons Continue expanding Daniel's patch by adding new skb drop reasons that are idiosyncratic to TC. More specifically: - SKB_DROP_REASON_TC_COOKIE_ERROR: An error occurred whilst processing a tc ext cookie. - SKB_DROP_REASON_TC_CHAIN_NOTFOUND: tc chain lookup failed. - SKB_DROP_REASON_TC_RECLASSIFY_LOOP: tc exceeded max reclassify loop iterations Signed-off-by: Victor Nogueira Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/net/dropreason-core.h | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index 278e4c7d465c..6d3a20163260 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -85,8 +85,10 @@ FN(IPV6_NDISC_BAD_OPTIONS) \ FN(IPV6_NDISC_NS_OTHERHOST) \ FN(QUEUE_PURGE) \ - FN(TC_ERROR) \ + FN(TC_COOKIE_ERROR) \ FN(PACKET_SOCK_ERROR) \ + FN(TC_CHAIN_NOTFOUND) \ + FN(TC_RECLASSIFY_LOOP) \ FNe(MAX) /** @@ -377,13 +379,23 @@ enum skb_drop_reason { SKB_DROP_REASON_IPV6_NDISC_NS_OTHERHOST, /** @SKB_DROP_REASON_QUEUE_PURGE: bulk free. */ SKB_DROP_REASON_QUEUE_PURGE, - /** @SKB_DROP_REASON_TC_ERROR: generic internal tc error. */ - SKB_DROP_REASON_TC_ERROR, + /** + * @SKB_DROP_REASON_TC_COOKIE_ERROR: An error occurred whilst + * processing a tc ext cookie. + */ + SKB_DROP_REASON_TC_COOKIE_ERROR, /** * @SKB_DROP_REASON_PACKET_SOCK_ERROR: generic packet socket errors * after its filter matches an incoming packet. */ SKB_DROP_REASON_PACKET_SOCK_ERROR, + /** @SKB_DROP_REASON_TC_CHAIN_NOTFOUND: tc chain lookup failed. */ + SKB_DROP_REASON_TC_CHAIN_NOTFOUND, + /** + * @SKB_DROP_REASON_TC_RECLASSIFY_LOOP: tc exceeded max reclassify loop + * iterations. + */ + SKB_DROP_REASON_TC_RECLASSIFY_LOOP, /** * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which * shouldn't be used as a real 'reason' - only for tracing code gen -- cgit v1.2.3 From a7e7b40c4bc115dbf2a2bb453d7bbb2e0ea99703 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Fri, 15 Dec 2023 19:31:14 -0800 Subject: net/mlx5e: Use the correct lag ports number when creating TISes The cited commit moved the code of mlx5e_create_tises() and changed the loop to create TISes over MLX5_MAX_PORTS constant value, instead of getting the correct lag ports supported by the device, which can cause FW errors on devices with less than MLX5_MAX_PORTS ports. Change that back to mlx5e_get_num_lag_ports(mdev). Also IPoIB interfaces create there own TISes, they don't use the eth TISes, pass a flag to indicate that. Fixes: b25bd37c859f ("net/mlx5: Move TISes from priv to mdev HW resources") Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 7ee5b79ff3d6..aafb36c9e5d9 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -681,6 +681,7 @@ struct mlx5e_resources { struct mlx5_sq_bfreg bfreg; #define MLX5_MAX_NUM_TC 8 u32 tisn[MLX5_MAX_PORTS][MLX5_MAX_NUM_TC]; + bool tisn_valid; } hw_objs; struct net_device *uplink_netdev; struct mutex uplink_netdev_lock; -- cgit v1.2.3 From e04984a37398b3f4f5a79c993b94c6b1224184cc Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Tue, 19 Dec 2023 14:46:20 +0200 Subject: net/mlx5: Fix query of sd_group field The sd_group field moved in the HW spec from the MPIR register to the vport context. Align the query accordingly. Fixes: f5e956329960 ("net/mlx5: Expose Management PCIe Index Register (MPIR)") Signed-off-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 10 +++++++--- include/linux/mlx5/vport.h | 1 + 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index fee20fc010c2..bf2d51952e48 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -4030,8 +4030,13 @@ struct mlx5_ifc_nic_vport_context_bits { u8 affiliation_criteria[0x4]; u8 affiliated_vhca_id[0x10]; - u8 reserved_at_60[0xd0]; + u8 reserved_at_60[0xa0]; + u8 reserved_at_100[0x1]; + u8 sd_group[0x3]; + u8 reserved_at_104[0x1c]; + + u8 reserved_at_120[0x10]; u8 mtu[0x10]; u8 system_image_guid[0x40]; @@ -10116,8 +10121,7 @@ struct mlx5_ifc_mpir_reg_bits { u8 reserved_at_20[0x20]; u8 local_port[0x8]; - u8 reserved_at_28[0x15]; - u8 sd_group[0x3]; + u8 reserved_at_28[0x18]; u8 reserved_at_60[0x20]; }; diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index fbb9bf447889..c36cc6d82926 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -72,6 +72,7 @@ int mlx5_query_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 *mtu); int mlx5_modify_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 mtu); int mlx5_query_nic_vport_system_image_guid(struct mlx5_core_dev *mdev, u64 *system_image_guid); +int mlx5_query_nic_vport_sd_group(struct mlx5_core_dev *mdev, u8 *sd_group); int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid); int mlx5_modify_nic_vport_node_guid(struct mlx5_core_dev *mdev, u16 vport, u64 node_guid); -- cgit v1.2.3 From c88c49ac9c18fb7c3fa431126de1d8f8f555e912 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Tue, 5 Dec 2023 23:54:21 +0200 Subject: net/mlx5: Enable SD feature Have an actual mlx5_sd instance in the core device, and fix the getter accordingly. This allows SD stuff to flow, the feature becomes supported only here. Signed-off-by: Tariq Toukan Reviewed-by: Gal Pressman Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index aafb36c9e5d9..cd286b681970 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -822,6 +822,7 @@ struct mlx5_core_dev { struct blocking_notifier_head macsec_nh; #endif u64 num_ipsec_offloads; + struct mlx5_sd *sd; }; struct mlx5_db { -- cgit v1.2.3 From 22c4640698a1d47606b5a4264a584e8046641784 Mon Sep 17 00:00:00 2001 From: Armen Ratner Date: Fri, 8 Sep 2023 14:53:09 -0500 Subject: net/mlx5: Implement management PF Ethernet profile Add management PF modules, which introduce support for the structures needed to create the resources for the MGMT PF to work. Also, add the necessary calls and functions to establish this functionality. Signed-off-by: Armen Ratner Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed Reviewed-by: Daniel Jurgens --- include/linux/mlx5/driver.h | 8 ++++++++ include/linux/mlx5/mlx5_ifc.h | 14 +++++++++++--- 2 files changed, 19 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index cd286b681970..2bba88c67f58 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1224,6 +1224,14 @@ static inline bool mlx5_core_is_ecpf(const struct mlx5_core_dev *dev) return dev->caps.embedded_cpu; } +static inline bool mlx5_core_is_mgmt_pf(const struct mlx5_core_dev *dev) +{ + if (!MLX5_CAP_GEN_2(dev, local_mng_port_valid)) + return false; + + return MLX5_CAP_GEN_2(dev, local_mng_port); +} + static inline bool mlx5_core_is_ecpf_esw_manager(const struct mlx5_core_dev *dev) { diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index bf2d51952e48..586569209254 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1954,8 +1954,10 @@ enum { struct mlx5_ifc_cmd_hca_cap_2_bits { u8 reserved_at_0[0x80]; - u8 migratable[0x1]; - u8 reserved_at_81[0x1f]; + u8 migratable[0x1]; + u8 reserved_at_81[0x19]; + u8 local_mng_port[0x1]; + u8 reserved_at_9b[0x5]; u8 max_reformat_insert_size[0x8]; u8 max_reformat_insert_offset[0x8]; @@ -1973,7 +1975,13 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 allowed_object_for_other_vhca_access[0x40]; - u8 reserved_at_140[0x60]; + u8 reserved_at_140[0x20]; + + u8 reserved_at_160[0xa]; + u8 local_mng_port_valid[0x1]; + u8 reserved_at_16b[0x15]; + + u8 reserved_at_180[0x20]; u8 flow_table_type_2_type[0x8]; u8 reserved_at_1a8[0x3]; -- cgit v1.2.3 From 3361597890ba901e4b666c2467b2cba349b0f63b Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Tue, 19 Dec 2023 17:01:39 -0700 Subject: wifi: cfg80211: address several kerneldoc warnings include/net/cfg80211.h includes a number of kerneldoc entries for struct members that do not exist, leading to these warnings: ./include/net/cfg80211.h:3192: warning: Excess struct member 'band_pref' description in 'cfg80211_bss_selection' ./include/net/cfg80211.h:3192: warning: Excess struct member 'adjust' description in 'cfg80211_bss_selection' ./include/net/cfg80211.h:6181: warning: Excess struct member 'bssid' description in 'wireless_dev' ./include/net/cfg80211.h:6181: warning: Excess struct member 'beacon_interval' description in 'wireless_dev' ./include/net/cfg80211.h:7299: warning: Excess struct member 'bss' description in 'cfg80211_rx_assoc_resp_data' Remove and/or repair each entry to address the warnings and ensure a proper docs build for the affected structures. Signed-off-by: Jonathan Corbet Reviewed-by: Randy Dunlap Link: https://msgid.link/87plz1g2sc.fsf@meer.lwn.net Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 033a0a25397f..92b956944c9f 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3225,8 +3225,8 @@ struct cfg80211_ibss_params { * * @behaviour: requested BSS selection behaviour. * @param: parameters for requestion behaviour. - * @band_pref: preferred band for %NL80211_BSS_SELECT_ATTR_BAND_PREF. - * @adjust: parameters for %NL80211_BSS_SELECT_ATTR_RSSI_ADJUST. + * @param.band_pref: preferred band for %NL80211_BSS_SELECT_ATTR_BAND_PREF. + * @param.adjust: parameters for %NL80211_BSS_SELECT_ATTR_RSSI_ADJUST. */ struct cfg80211_bss_selection { enum nl80211_bss_select_attr behaviour; @@ -6063,7 +6063,6 @@ void wiphy_delayed_work_flush(struct wiphy *wiphy, * wireless device if it has no netdev * @u: union containing data specific to @iftype * @connected: indicates if connected or not (STA mode) - * @bssid: (private) Used by the internal configuration code * @wext: (private) Used by the internal wireless extensions compat code * @wext.ibss: (private) IBSS data part of wext handling * @wext.connect: (private) connection handling data @@ -6083,8 +6082,6 @@ void wiphy_delayed_work_flush(struct wiphy *wiphy, * @mgmt_registrations: list of registrations for management frames * @mgmt_registrations_need_update: mgmt registrations were updated, * need to propagate the update to the driver - * @beacon_interval: beacon interval used on this device for transmitting - * beacons, 0 when not valid * @address: The address for this device, valid only if @netdev is %NULL * @is_running: true if this is a non-netdev device that has been started, e.g. * the P2P Device. @@ -7363,8 +7360,6 @@ void cfg80211_auth_timeout(struct net_device *dev, const u8 *addr); /** * struct cfg80211_rx_assoc_resp_data - association response data - * @bss: the BSS that association was requested with, ownership of the pointer - * moves to cfg80211 in the call to cfg80211_rx_assoc_resp() * @buf: (Re)Association Response frame (header + body) * @len: length of the frame data * @uapsd_queues: bitmap of queues configured for uapsd. Same format @@ -7374,6 +7369,8 @@ void cfg80211_auth_timeout(struct net_device *dev, const u8 *addr); * @ap_mld_addr: AP MLD address (in case of MLO) * @links: per-link information indexed by link ID, use links[0] for * non-MLO connections + * @links.bss: the BSS that association was requested with, ownership of the + * pointer moves to cfg80211 in the call to cfg80211_rx_assoc_resp() * @links.status: Set this (along with a BSS pointer) for links that * were rejected by the AP. */ -- cgit v1.2.3 From 756df9853491e28acbe7fca9f8146a784f026117 Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Wed, 20 Dec 2023 15:57:55 -0700 Subject: wifi: mac80211: address some kerneldoc warnings include/net/mac80111.h contains a number of either excess or incorrect kerneldoc entries for structure members, leading to these warnings: ./include/net/mac80211.h:491: warning: Excess struct member 'rssi' description in 'ieee80211_event' ./include/net/mac80211.h:491: warning: Excess struct member 'mlme' description in 'ieee80211_event' ./include/net/mac80211.h:491: warning: Excess struct member 'ba' description in 'ieee80211_event' ./include/net/mac80211.h:777: warning: Excess struct member 'ack_enabled' description in 'ieee80211_bss_conf' ./include/net/mac80211.h:1222: warning: Excess struct member 'ampdu_ack_len' description in 'ieee80211_tx_info' ./include/net/mac80211.h:1222: warning: Excess struct member 'ampdu_len' description in 'ieee80211_tx_info' ./include/net/mac80211.h:1222: warning: Excess struct member 'ack_signal' description in 'ieee80211_tx_info' ./include/net/mac80211.h:2920: warning: Excess struct member 'radiotap_he' description in 'ieee80211_hw' Fix or remove the entries as needed. This change removes 208 warnings from a "make htmldocs" build. Signed-off-by: Jonathan Corbet Reviewed-by: Simon Horman Link: https://msgid.link/87zfy4bhxo.fsf@meer.lwn.net Signed-off-by: Johannes Berg --- include/net/mac80211.h | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 77a71b1396b1..6d2e94bc841c 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -476,9 +476,9 @@ struct ieee80211_ba_event { /** * struct ieee80211_event - event to be sent to the driver * @type: The event itself. See &enum ieee80211_event_type. - * @rssi: relevant if &type is %RSSI_EVENT - * @mlme: relevant if &type is %AUTH_EVENT - * @ba: relevant if &type is %BAR_RX_EVENT or %BA_FRAME_TIMEOUT + * @u.rssi: relevant if &type is %RSSI_EVENT + * @u.mlme: relevant if &type is %AUTH_EVENT + * @u.ba: relevant if &type is %BAR_RX_EVENT or %BA_FRAME_TIMEOUT * @u:union holding the fields above */ struct ieee80211_event { @@ -541,8 +541,6 @@ struct ieee80211_fils_discovery { * @link_id: link ID, or 0 for non-MLO * @htc_trig_based_pkt_ext: default PE in 4us units, if BSS supports HE * @uora_exists: is the UORA element advertised by AP - * @ack_enabled: indicates support to receive a multi-TID that solicits either - * ACK, BACK or both * @uora_ocw_range: UORA element's OCW Range field * @frame_time_rts_th: HE duration RTS threshold, in units of 32us * @he_support: does this BSS support HE @@ -1150,11 +1148,6 @@ ieee80211_rate_get_vht_nss(const struct ieee80211_tx_rate *rate) * @ack: union part for pure ACK data * @ack.cookie: cookie for the ACK * @driver_data: array of driver_data pointers - * @ampdu_ack_len: number of acked aggregated frames. - * relevant only if IEEE80211_TX_STAT_AMPDU was set. - * @ampdu_len: number of aggregated frames. - * relevant only if IEEE80211_TX_STAT_AMPDU was set. - * @ack_signal: signal strength of the ACK frame */ struct ieee80211_tx_info { /* common information */ @@ -2835,8 +2828,6 @@ enum ieee80211_hw_flags { * the default is _GI | _BANDWIDTH. * Use the %IEEE80211_RADIOTAP_VHT_KNOWN_\* values. * - * @radiotap_he: HE radiotap validity flags - * * @radiotap_timestamp: Information for the radiotap timestamp field; if the * @units_pos member is set to a non-negative value then the timestamp * field will be added and populated from the &struct ieee80211_rx_status -- cgit v1.2.3 From 41a313d875e0c5822efb50e8221b8d58811609bb Mon Sep 17 00:00:00 2001 From: Andrei Otcheretianski Date: Wed, 20 Dec 2023 13:41:34 +0200 Subject: wifi: cfg80211: reg: Support P2P operation on DFS channels FCC-594280 D01 Section B.3 allows peer-to-peer and ad hoc devices to operate on DFS channels while they operate under the control of a concurrent DFS master. For example, it is possible to have a P2P GO on a DFS channel as long as BSS connection is active on the same channel. Allow such operation by adding additional regulatory flags to indicate DFS concurrent channels and capable devices. Add the required relaxations in DFS regulatory checks. Signed-off-by: Andrei Otcheretianski Reviewed-by: Gregory Greenman Signed-off-by: Miri Korenblit Link: https://msgid.link/20231220133549.bdfb8a9c7c54.I973563562969a27fea8ec5685b96a3a47afe142f@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ include/uapi/linux/nl80211.h | 16 ++++++++++++++++ 2 files changed, 18 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 92b956944c9f..501d4421514f 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -117,6 +117,7 @@ struct wiphy; * This may be due to the driver or due to regulatory bandwidth * restrictions. * @IEEE80211_CHAN_NO_EHT: EHT operation is not permitted on this channel. + * @IEEE80211_CHAN_DFS_CONCURRENT: See %NL80211_RRF_DFS_CONCURRENT */ enum ieee80211_channel_flags { IEEE80211_CHAN_DISABLED = 1<<0, @@ -140,6 +141,7 @@ enum ieee80211_channel_flags { IEEE80211_CHAN_16MHZ = 1<<18, IEEE80211_CHAN_NO_320MHZ = 1<<19, IEEE80211_CHAN_NO_EHT = 1<<20, + IEEE80211_CHAN_DFS_CONCURRENT = 1<<21, }; #define IEEE80211_CHAN_NO_HT40 \ diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index a682b54bd3ba..466da830e65f 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4256,6 +4256,10 @@ enum nl80211_wmm_rule { * in current regulatory domain. * @NL80211_FREQUENCY_ATTR_PSD: Power spectral density (in dBm) that * is allowed on this channel in current regulatory domain. + * @NL80211_FREQUENCY_ATTR_DFS_CONCURRENT: Operation on this channel is + * allowed for peer-to-peer or adhoc communication under the control + * of a DFS master which operates on the same channel (FCC-594280 D01 + * Section B.3). Should be used together with %NL80211_RRF_DFS only. * @NL80211_FREQUENCY_ATTR_MAX: highest frequency attribute number * currently defined * @__NL80211_FREQUENCY_ATTR_AFTER_LAST: internal use @@ -4295,6 +4299,7 @@ enum nl80211_frequency_attr { NL80211_FREQUENCY_ATTR_NO_320MHZ, NL80211_FREQUENCY_ATTR_NO_EHT, NL80211_FREQUENCY_ATTR_PSD, + NL80211_FREQUENCY_ATTR_DFS_CONCURRENT, /* keep last */ __NL80211_FREQUENCY_ATTR_AFTER_LAST, @@ -4500,6 +4505,10 @@ enum nl80211_sched_scan_match_attr { * @NL80211_RRF_NO_320MHZ: 320MHz operation not allowed * @NL80211_RRF_NO_EHT: EHT operation not allowed * @NL80211_RRF_PSD: Ruleset has power spectral density value + * @NL80211_RRF_DFS_CONCURRENT: Operation on this channel is allowed for + peer-to-peer or adhoc communication under the control of a DFS master + which operates on the same channel (FCC-594280 D01 Section B.3). + Should be used together with %NL80211_RRF_DFS only. */ enum nl80211_reg_rule_flags { NL80211_RRF_NO_OFDM = 1<<0, @@ -4521,6 +4530,7 @@ enum nl80211_reg_rule_flags { NL80211_RRF_NO_320MHZ = 1<<18, NL80211_RRF_NO_EHT = 1<<19, NL80211_RRF_PSD = 1<<20, + NL80211_RRF_DFS_CONCURRENT = 1<<21, }; #define NL80211_RRF_PASSIVE_SCAN NL80211_RRF_NO_IR @@ -6492,6 +6502,11 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_OWE_OFFLOAD_AP: Driver/Device wants to do OWE DH IE * handling in AP mode. * + * @NL80211_EXT_FEATURE_DFS_CONCURRENT: The device supports peer-to-peer or + * ad hoc operation on DFS channels under the control of a concurrent + * DFS master on the same channel as described in FCC-594280 D01 + * (Section B.3). This, for example, allows P2P GO and P2P clients to + * operate on DFS channels as long as there's a concurrent BSS connection. * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -6565,6 +6580,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_AUTH_AND_DEAUTH_RANDOM_TA, NL80211_EXT_FEATURE_OWE_OFFLOAD, NL80211_EXT_FEATURE_OWE_OFFLOAD_AP, + NL80211_EXT_FEATURE_DFS_CONCURRENT, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, -- cgit v1.2.3 From 9be61558dec0af359ce3139c8450228de7f0796d Mon Sep 17 00:00:00 2001 From: Andrei Otcheretianski Date: Wed, 20 Dec 2023 13:41:35 +0200 Subject: wifi: cfg80211: Schedule regulatory check on BSS STA channel change Due to different relaxation policies it may be needed to re-check channels after a BSS station interface is disconnected or performed a channel switch. Signed-off-by: Andrei Otcheretianski Reviewed-by: Gregory Greenman Signed-off-by: Miri Korenblit Link: https://msgid.link/20231220133549.1f2f8475bcf1.I1879d259d8d756159c8060f61f4bce172e6d323e@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 501d4421514f..745974d45ea4 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -9391,6 +9391,16 @@ bool cfg80211_valid_disable_subchannel_bitmap(u16 *bitmap, */ void cfg80211_links_removed(struct net_device *dev, u16 link_mask); +/** + * cfg80211_schedule_channels_check - schedule regulatory check if needed + * @wdev: the wireless device to check + * + * In case the device supports NO_IR or DFS relaxations, schedule regulatory + * channels check, as previous concurrent operation conditions may not + * hold anymore. + */ +void cfg80211_schedule_channels_check(struct wireless_dev *wdev); + #ifdef CONFIG_CFG80211_DEBUGFS /** * wiphy_locked_debugfs_read - do a locked read in debugfs -- cgit v1.2.3 From 645f3d85129d8aac3b896ba685fbc20a31c2c036 Mon Sep 17 00:00:00 2001 From: Mukesh Sisodiya Date: Wed, 20 Dec 2023 13:41:38 +0200 Subject: wifi: cfg80211: handle UHB AP and STA power type UHB AP send supported power type(LPI, SP, VLP) in beacon and probe response IE and STA should connect to these AP only if their regulatory support the AP power type. Beacon/Probe response are reported to userspace with reason "STA regulatory not supporting to connect to AP based on transmitted power type" and it should not connect to AP. Signed-off-by: Mukesh Sisodiya Reviewed-by: Gregory Greenman Signed-off-by: Miri Korenblit Link: https://msgid.link/20231220133549.cbfbef9170a9.I432f78438de18aa9f5c9006be12e41dc34cc47c5@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 1 + include/net/cfg80211.h | 6 ++++++ include/uapi/linux/nl80211.h | 13 +++++++++++++ 3 files changed, 20 insertions(+) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 8ad008591e32..2f5554482047 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2720,6 +2720,7 @@ static inline bool ieee80211_he_capa_size_ok(const u8 *data, u8 len) #define IEEE80211_6GHZ_CTRL_REG_LPI_AP 0 #define IEEE80211_6GHZ_CTRL_REG_SP_AP 1 +#define IEEE80211_6GHZ_CTRL_REG_VLP_AP 2 /** * struct ieee80211_he_6ghz_oper - HE 6 GHz operation Information field diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 745974d45ea4..cf79656ce09c 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -118,6 +118,10 @@ struct wiphy; * restrictions. * @IEEE80211_CHAN_NO_EHT: EHT operation is not permitted on this channel. * @IEEE80211_CHAN_DFS_CONCURRENT: See %NL80211_RRF_DFS_CONCURRENT + * @IEEE80211_CHAN_NO_UHB_VLP_CLIENT: Client connection with VLP AP + * not permitted using this channel + * @IEEE80211_CHAN_NO_UHB_AFC_CLIENT: Client connection with AFC AP + * not permitted using this channel */ enum ieee80211_channel_flags { IEEE80211_CHAN_DISABLED = 1<<0, @@ -142,6 +146,8 @@ enum ieee80211_channel_flags { IEEE80211_CHAN_NO_320MHZ = 1<<19, IEEE80211_CHAN_NO_EHT = 1<<20, IEEE80211_CHAN_DFS_CONCURRENT = 1<<21, + IEEE80211_CHAN_NO_UHB_VLP_CLIENT= 1<<22, + IEEE80211_CHAN_NO_UHB_AFC_CLIENT= 1<<23, }; #define IEEE80211_CHAN_NO_HT40 \ diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 466da830e65f..1ccdcae24372 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4260,6 +4260,10 @@ enum nl80211_wmm_rule { * allowed for peer-to-peer or adhoc communication under the control * of a DFS master which operates on the same channel (FCC-594280 D01 * Section B.3). Should be used together with %NL80211_RRF_DFS only. + * @NL80211_FREQUENCY_ATTR_NO_UHB_VLP_CLIENT: Client connection to VLP AP + * not allowed using this channel + * @NL80211_FREQUENCY_ATTR_NO_UHB_AFC_CLIENT: Client connection to AFC AP + * not allowed using this channel * @NL80211_FREQUENCY_ATTR_MAX: highest frequency attribute number * currently defined * @__NL80211_FREQUENCY_ATTR_AFTER_LAST: internal use @@ -4300,6 +4304,8 @@ enum nl80211_frequency_attr { NL80211_FREQUENCY_ATTR_NO_EHT, NL80211_FREQUENCY_ATTR_PSD, NL80211_FREQUENCY_ATTR_DFS_CONCURRENT, + NL80211_FREQUENCY_ATTR_NO_UHB_VLP_CLIENT, + NL80211_FREQUENCY_ATTR_NO_UHB_AFC_CLIENT, /* keep last */ __NL80211_FREQUENCY_ATTR_AFTER_LAST, @@ -4509,6 +4515,8 @@ enum nl80211_sched_scan_match_attr { peer-to-peer or adhoc communication under the control of a DFS master which operates on the same channel (FCC-594280 D01 Section B.3). Should be used together with %NL80211_RRF_DFS only. + * @NL80211_RRF_NO_UHB_VLP_CLIENT: Client connection to VLP AP not allowed + * @NL80211_RRF_NO_UHB_AFC_CLIENT: Client connection to AFC AP not allowed */ enum nl80211_reg_rule_flags { NL80211_RRF_NO_OFDM = 1<<0, @@ -4531,6 +4539,8 @@ enum nl80211_reg_rule_flags { NL80211_RRF_NO_EHT = 1<<19, NL80211_RRF_PSD = 1<<20, NL80211_RRF_DFS_CONCURRENT = 1<<21, + NL80211_RRF_NO_UHB_VLP_CLIENT = 1<<22, + NL80211_RRF_NO_UHB_AFC_CLIENT = 1<<23, }; #define NL80211_RRF_PASSIVE_SCAN NL80211_RRF_NO_IR @@ -5086,9 +5096,12 @@ enum nl80211_bss_use_for { * BSS isn't possible * @NL80211_BSS_CANNOT_USE_NSTR_NONPRIMARY: NSTR nonprimary links aren't * supported by the device, and this BSS entry represents one. + * @NL80211_BSS_CANNOT_USE_UHB_PWR_MISMATCH: STA is not supporting + * the AP power type (SP, VLP, AP) that the AP uses. */ enum nl80211_bss_cannot_use_reasons { NL80211_BSS_CANNOT_USE_NSTR_NONPRIMARY = 1 << 0, + NL80211_BSS_CANNOT_USE_UHB_PWR_MISMATCH = 1 << 1, }; /** -- cgit v1.2.3 From d5b6f6d595b446c1693fdaa6aeb4a65b94d5fc90 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 20 Dec 2023 13:41:39 +0200 Subject: wifi: mac80211: rework RX timestamp flags We only have a single flag free, and before using that for another mactime flag, instead refactor the mactime flags to use a 2-bit field. Signed-off-by: Johannes Berg Reviewed-by: Gregory Greenman Reviewed-by: Benjamin Berg Signed-off-by: Miri Korenblit Link: https://msgid.link/20231220133549.d0e664832d14.I20c8900106f9bf81316bed778b1e3ce145785274@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 6d2e94bc841c..941980c7aa21 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1355,6 +1355,9 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info) * the frame. * @RX_FLAG_FAILED_PLCP_CRC: Set this flag if the PCLP check failed on * the frame. + * @RX_FLAG_MACTIME: The timestamp passed in the RX status (@mactime + * field) is valid if this field is non-zero, and the position + * where the timestamp was sampled depends on the value. * @RX_FLAG_MACTIME_START: The timestamp passed in the RX status (@mactime * field) is valid and contains the time the first symbol of the MPDU * was received. This is useful in monitor mode and for proper IBSS @@ -1434,12 +1437,12 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info) enum mac80211_rx_flags { RX_FLAG_MMIC_ERROR = BIT(0), RX_FLAG_DECRYPTED = BIT(1), - RX_FLAG_MACTIME_PLCP_START = BIT(2), + RX_FLAG_ONLY_MONITOR = BIT(2), RX_FLAG_MMIC_STRIPPED = BIT(3), RX_FLAG_IV_STRIPPED = BIT(4), RX_FLAG_FAILED_FCS_CRC = BIT(5), RX_FLAG_FAILED_PLCP_CRC = BIT(6), - RX_FLAG_MACTIME_START = BIT(7), + /* one free bit at 7 */ RX_FLAG_NO_SIGNAL_VAL = BIT(8), RX_FLAG_AMPDU_DETAILS = BIT(9), RX_FLAG_PN_VALIDATED = BIT(10), @@ -1448,8 +1451,10 @@ enum mac80211_rx_flags { RX_FLAG_AMPDU_IS_LAST = BIT(13), RX_FLAG_AMPDU_DELIM_CRC_ERROR = BIT(14), RX_FLAG_AMPDU_DELIM_CRC_KNOWN = BIT(15), - RX_FLAG_MACTIME_END = BIT(16), - RX_FLAG_ONLY_MONITOR = BIT(17), + RX_FLAG_MACTIME = BIT(16) | BIT(17), + RX_FLAG_MACTIME_PLCP_START = 1 << 16, + RX_FLAG_MACTIME_START = 2 << 16, + RX_FLAG_MACTIME_END = 3 << 16, RX_FLAG_SKIP_MONITOR = BIT(18), RX_FLAG_AMSDU_MORE = BIT(19), RX_FLAG_RADIOTAP_TLV_AT_END = BIT(20), -- cgit v1.2.3 From e62c0fcc0e06a36b40615885eb86eb407ac8d0dd Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 20 Dec 2023 13:41:40 +0200 Subject: wifi: mac80211: allow 64-bit radiotap timestamps When reporting the radiotap timestamp, the mactime field is usually unused, we take the data from the device_timestamp. However, there can be cases where the radiotap timestamp is better reported as a 64-bit value, so since the mactime is free, add a flag to support using the mactime as a 64-bit radiotap timestamp. Signed-off-by: Johannes Berg Reviewed-by: Gregory Greenman Reviewed-by: Benjamin Berg Signed-off-by: Miri Korenblit Link: https://msgid.link/20231220133549.00c8b9234f0c.Ie3ce5eae33cce88fa01178e7aea94661ded1ac24@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 941980c7aa21..46cc44430216 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1367,6 +1367,11 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info) * (including FCS) was received. * @RX_FLAG_MACTIME_PLCP_START: The timestamp passed in the RX status (@mactime * field) is valid and contains the time the SYNC preamble was received. + * @RX_FLAG_MACTIME_IS_RTAP_TS64: The timestamp passed in the RX status @mactime + * is only for use in the radiotap timestamp header, not otherwise a valid + * @mactime value. Note this is a separate flag so that we continue to see + * %RX_FLAG_MACTIME as unset. Also note that in this case the timestamp is + * reported to be 64 bits wide, not just 32. * @RX_FLAG_NO_SIGNAL_VAL: The signal strength value is not present. * Valid only for data frames (mainly A-MPDU) * @RX_FLAG_AMPDU_DETAILS: A-MPDU details are known, in particular the reference @@ -1442,7 +1447,7 @@ enum mac80211_rx_flags { RX_FLAG_IV_STRIPPED = BIT(4), RX_FLAG_FAILED_FCS_CRC = BIT(5), RX_FLAG_FAILED_PLCP_CRC = BIT(6), - /* one free bit at 7 */ + RX_FLAG_MACTIME_IS_RTAP_TS64 = BIT(7), RX_FLAG_NO_SIGNAL_VAL = BIT(8), RX_FLAG_AMPDU_DETAILS = BIT(9), RX_FLAG_PN_VALIDATED = BIT(10), -- cgit v1.2.3 From e993af2ed2882f5dade10474d7fd1e2403aa0ab7 Mon Sep 17 00:00:00 2001 From: Miri Korenblit Date: Wed, 20 Dec 2023 13:41:45 +0200 Subject: wifi: mac80211: add a driver callback to check active_links During ieee80211_set_active_links() we do (among the others): 1. Call drv_change_vif_links() with both old_active and new_active 2. Unassign the chanctx for the removed link(s) (if any) 3. Assign chanctx to the added link(s) (if any) 4. Call drv_change_vif_links() with the new_active links bitmap The problem here is that during step #1 the driver doesn't know whether we will activate multiple links simultaneously or are just doing a link switch, so it can't check there if multiple links are supported/enabled. (Some of the drivers might enable/disable this option dynamically) And during step #3, in which the driver already knows that, returning an error code (for example when multiple links are not supported or disabled), will cause a warning, and we will still complete the transition to the new_active links. (It is hard to undo things in that stage, since we released channels etc.) Therefore add a driver callback to check if the desired new_active links will be supported by the driver or not. This callback will be called in the beginning of ieee80211_set_active_links() so we won't do anything before we are sure it is supported. Signed-off-by: Miri Korenblit Reviewed-by: Gregory Greenman Reviewed-by: Johannes Berg Link: https://msgid.link/20231220133549.64c4d70b33b8.I79708619be76b8ecd4ef3975205b8f903e24a2cd@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 46cc44430216..d400fe2e8668 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -4272,6 +4272,8 @@ struct ieee80211_prep_tx_info { * disable background CAC/radar detection. * @net_fill_forward_path: Called from .ndo_fill_forward_path in order to * resolve a path for hardware flow offloading + * @can_activate_links: Checks if a specific active_links bitmap is + * supported by the driver. * @change_vif_links: Change the valid links on an interface, note that while * removing the old link information is still valid (link_conf pointer), * but may immediately disappear after the function returns. The old or @@ -4652,6 +4654,9 @@ struct ieee80211_ops { struct ieee80211_sta *sta, struct net_device_path_ctx *ctx, struct net_device_path *path); + bool (*can_activate_links)(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + u16 active_links); int (*change_vif_links)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u16 old_links, u16 new_links, -- cgit v1.2.3 From 5a78a8121c4d8e37035274c094e3af15fb79f004 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 18 Dec 2023 20:07:42 -0700 Subject: net/ipv6: Remove gc_link warn on in fib6_info_release A revert of 3dec89b14d37 ("net/ipv6: Remove expired routes with a separated list of routes") was sent for net-next. Revert the remainder of 5a08d0065a915 which added a warn on if a fib entry is still on the gc_link list to avoid compile failures when net is merged to net-next Signed-off-by: David Ahern Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20231219030742.25715-1-dsahern@kernel.org Signed-off-by: Paolo Abeni --- include/net/ip6_fib.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 95ed495c3a40..1ba9f4ddf2f6 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -328,10 +328,8 @@ static inline bool fib6_info_hold_safe(struct fib6_info *f6i) static inline void fib6_info_release(struct fib6_info *f6i) { - if (f6i && refcount_dec_and_test(&f6i->fib6_ref)) { - DEBUG_NET_WARN_ON_ONCE(!hlist_unhashed(&f6i->gc_link)); + if (f6i && refcount_dec_and_test(&f6i->fib6_ref)) call_rcu(&f6i->rcu, fib6_info_destroy_rcu); - } } enum fib6_walk_state { -- cgit v1.2.3 From 3fde94b6e930f5a0fd4f6458da8d559c898f2322 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 20 Nov 2023 17:29:58 +0100 Subject: netfilter: flowtable: reorder nf_flowtable struct members Place the read-mostly parts accessed by the datapath first. In particular, we do access ->flags member (to see if HW offload is enabled) for every single packet, but this is placed in the 5th cacheline. priority could stay where it is, but move it too to cover a hole. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 692d5955911c..956c752ceb31 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -74,12 +74,13 @@ enum nf_flowtable_flags { }; struct nf_flowtable { - struct list_head list; - struct rhashtable rhashtable; - int priority; + unsigned int flags; /* readonly in datapath */ + int priority; /* control path (padding hole) */ + struct rhashtable rhashtable; /* datapath, read-mostly members come first */ + + struct list_head list; /* slowpath parts */ const struct nf_flowtable_type *type; struct delayed_work gc_work; - unsigned int flags; struct flow_block flow_block; struct rw_semaphore flow_block_lock; /* Guards flow_block */ possible_net_t net; -- cgit v1.2.3 From fa224d0c094a458e9ebf5ea9b1c696136b7af427 Mon Sep 17 00:00:00 2001 From: Iulia Tanasescu Date: Mon, 13 Nov 2023 17:38:00 +0200 Subject: Bluetooth: ISO: Reassociate a socket with an active BIS For ISO Broadcast, all BISes from a BIG have the same lifespan - they cannot be created or terminated independently from each other. This links together all BIS hcons that are part of the same BIG, so all hcons are kept alive as long as the BIG is active. If multiple BIS sockets are opened for a BIG handle, and only part of them are closed at some point, the associated hcons will be marked as open. If new sockets will later be opened for the same BIG, they will be reassociated with the open BIS hcons. All BIS hcons will be cleaned up and the BIG will be terminated when the last BIS socket is closed from userspace. Signed-off-by: Iulia Tanasescu Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index a3a1ea2696a8..6a32c06d7fdc 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1297,6 +1297,30 @@ static inline struct hci_conn *hci_conn_hash_lookup_big(struct hci_dev *hdev, return NULL; } +static inline struct hci_conn * +hci_conn_hash_lookup_big_state(struct hci_dev *hdev, __u8 handle, __u16 state) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct hci_conn *c; + + rcu_read_lock(); + + list_for_each_entry_rcu(c, &h->list, list) { + if (bacmp(&c->dst, BDADDR_ANY) || c->type != ISO_LINK || + c->state != state) + continue; + + if (handle == c->iso_qos.bcast.big) { + rcu_read_unlock(); + return c; + } + } + + rcu_read_unlock(); + + return NULL; +} + static inline struct hci_conn * hci_conn_hash_lookup_pa_sync_big_handle(struct hci_dev *hdev, __u8 big) { -- cgit v1.2.3 From 78db544b5d276b70c6ea2c2909ffed96b10229a3 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 6 Sep 2023 14:13:35 -0700 Subject: Bluetooth: hci_core: Remove le_restart_scan work This removes le_restart_scan work and instead just disables controller duplicate filtering when discovery result_filtering is enabled and HCI_QUIRK_STRICT_DUPLICATE_FILTER is set. Link: https://github.com/bluez/bluez/issues/573 Link: https://github.com/bluez/bluez/issues/572 Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 6a32c06d7fdc..cf560a9e16b6 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -539,7 +539,6 @@ struct hci_dev { struct work_struct tx_work; struct delayed_work le_scan_disable; - struct delayed_work le_scan_restart; struct sk_buff_head rx_q; struct sk_buff_head raw_q; -- cgit v1.2.3 From d03376c185926098cb4d668d6458801eb785c0a5 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 30 Nov 2023 14:58:03 +0100 Subject: Bluetooth: Fix bogus check for re-auth no supported with non-ssp This reverts 19f8def031bfa50c579149b200bfeeb919727b27 "Bluetooth: Fix auth_complete_evt for legacy units" which seems to be working around a bug on a broken controller rather then any limitation imposed by the Bluetooth spec, in fact if there ws not possible to re-auth the command shall fail not succeed. Fixes: 19f8def031bf ("Bluetooth: Fix auth_complete_evt for legacy units") Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index cf560a9e16b6..8f8dd9173714 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -956,7 +956,6 @@ void hci_inquiry_cache_flush(struct hci_dev *hdev); /* ----- HCI Connections ----- */ enum { HCI_CONN_AUTH_PEND, - HCI_CONN_REAUTH_PEND, HCI_CONN_ENCRYPT_PEND, HCI_CONN_RSWITCH_PEND, HCI_CONN_MODE_CHANGE_PEND, -- cgit v1.2.3 From 06a8c04f89949576b715f11350f1896b62cecb0a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 19 Dec 2023 09:18:24 +0900 Subject: tcp: Save v4 address as v4-mapped-v6 in inet_bind2_bucket.v6_rcv_saddr. In bhash2, IPv4/IPv6 addresses are saved in two union members, which complicate address checks in inet_bind2_bucket_addr_match() and inet_bind2_bucket_match_addr_any() considering uninitialised memory and v4-mapped-v6 conflicts. Let's simplify that by saving IPv4 address as v4-mapped-v6 address and defining tb2.rcv_saddr as tb2.v6_rcv_saddr.s6_addr32[3]. Then, we can compare v6 address as is, and after checking v4-mapped-v6, we can compare v4 address easily. Also, we can remove tb2->family. Note these functions will be further refactored in the next patch. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_hashtables.h | 11 ++++------- include/net/ipv6.h | 5 ----- 2 files changed, 4 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 3ecfeadbfa06..171cc235d045 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -96,14 +96,11 @@ struct inet_bind2_bucket { int l3mdev; unsigned short port; #if IS_ENABLED(CONFIG_IPV6) - unsigned short family; -#endif - union { -#if IS_ENABLED(CONFIG_IPV6) - struct in6_addr v6_rcv_saddr; + struct in6_addr v6_rcv_saddr; +#define rcv_saddr v6_rcv_saddr.s6_addr32[3] +#else + __be32 rcv_saddr; #endif - __be32 rcv_saddr; - }; /* Node in the bhash2 inet_bind_hashbucket chain */ struct hlist_node node; /* List of sockets hashed to this bucket */ diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 78d38dd88aba..cf25ea21d770 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -784,11 +784,6 @@ static inline bool ipv6_addr_v4mapped(const struct in6_addr *a) cpu_to_be32(0x0000ffff))) == 0UL; } -static inline bool ipv6_addr_v4mapped_any(const struct in6_addr *a) -{ - return ipv6_addr_v4mapped(a) && ipv4_is_zeronet(a->s6_addr32[3]); -} - static inline bool ipv6_addr_v4mapped_loopback(const struct in6_addr *a) { return ipv6_addr_v4mapped(a) && ipv4_is_loopback(a->s6_addr32[3]); -- cgit v1.2.3 From 5a22bba13d0162d278d6e31232259ddcce196b8e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 19 Dec 2023 09:18:25 +0900 Subject: tcp: Save address type in inet_bind2_bucket. inet_bind2_bucket_addr_match() and inet_bind2_bucket_match_addr_any() are called for each bhash2 bucket to check conflicts. Thus, we call ipv6_addr_any() and ipv6_addr_v4mapped() over and over during bind(). Let's avoid calling them by saving the address type in inet_bind2_bucket. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_hashtables.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 171cc235d045..260e673ede22 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -96,6 +96,7 @@ struct inet_bind2_bucket { int l3mdev; unsigned short port; #if IS_ENABLED(CONFIG_IPV6) + unsigned short addr_type; struct in6_addr v6_rcv_saddr; #define rcv_saddr v6_rcv_saddr.s6_addr32[3] #else -- cgit v1.2.3 From 822fb91fc724786372eed4f4397409c70afc08b8 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 19 Dec 2023 09:18:27 +0900 Subject: tcp: Link bhash2 to bhash. bhash2 added a new member sk_bind2_node in struct sock to link sockets to bhash2 in addition to bhash. bhash is still needed to search conflicting sockets efficiently from a port for the wildcard address. However, bhash itself need not have sockets. If we link each bhash2 bucket to the corresponding bhash bucket, we can iterate the same set of the sockets from bhash2 via bhash. This patch links bhash2 to bhash only, and the actual use will be in the later patches. Finally, we will remove sk_bind2_node. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_hashtables.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 260e673ede22..25ba471ba161 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -89,6 +89,7 @@ struct inet_bind_bucket { bool fast_ipv6_only; struct hlist_node node; struct hlist_head owners; + struct hlist_head bhash2; }; struct inet_bind2_bucket { @@ -104,6 +105,7 @@ struct inet_bind2_bucket { #endif /* Node in the bhash2 inet_bind_hashbucket chain */ struct hlist_node node; + struct hlist_node bhash_node; /* List of sockets hashed to this bucket */ struct hlist_head owners; /* bhash has twsk in owners, but bhash2 has twsk in @@ -239,7 +241,7 @@ bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, struct inet_bind2_bucket * inet_bind2_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, - unsigned short port, int l3mdev, + struct inet_bind_bucket *tb, const struct sock *sk); void inet_bind2_bucket_destroy(struct kmem_cache *cachep, -- cgit v1.2.3 From b2cb9f9ef240b4afaa1838b74fad077f17682f61 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 19 Dec 2023 09:18:31 +0900 Subject: tcp: Unlink sk from bhash. Now we do not use tb->owners and can unlink sockets from bhash. sk_bind_node/tw_bind_node are available for bhash2 and will be used in the following patch. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_hashtables.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 25ba471ba161..98ba728aec08 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -88,7 +88,6 @@ struct inet_bind_bucket { unsigned short fast_sk_family; bool fast_ipv6_only; struct hlist_node node; - struct hlist_head owners; struct hlist_head bhash2; }; -- cgit v1.2.3 From 770041d337a85923810dd3aeebea38037a28d534 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 19 Dec 2023 09:18:32 +0900 Subject: tcp: Link sk and twsk to tb2->owners using skc_bind_node. Now we can use sk_bind_node/tw_bind_node for bhash2, which means we need not link TIME_WAIT sockets separately. The dead code and sk_bind2_node will be removed in the next patch. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index ba6642811db4..21c86e4c71fc 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -873,16 +873,6 @@ static inline void sk_add_bind_node(struct sock *sk, hlist_add_head(&sk->sk_bind_node, list); } -static inline void __sk_del_bind2_node(struct sock *sk) -{ - __hlist_del(&sk->sk_bind2_node); -} - -static inline void sk_add_bind2_node(struct sock *sk, struct hlist_head *list) -{ - hlist_add_head(&sk->sk_bind2_node, list); -} - #define sk_for_each(__sk, list) \ hlist_for_each_entry(__sk, list, sk_node) #define sk_for_each_rcu(__sk, list) \ -- cgit v1.2.3 From 8191792c18c55ee444588100cf0464bd8c0bf5f3 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 19 Dec 2023 09:18:33 +0900 Subject: tcp: Remove dead code and fields for bhash2. Now all sockets including TIME_WAIT are linked to bhash2 using sock_common.skc_bind_node. We no longer use inet_bind2_bucket.deathrow, sock.sk_bind2_node, and inet_timewait_sock.tw_bind2_node. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_hashtables.h | 4 ---- include/net/inet_timewait_sock.h | 4 ---- include/net/sock.h | 4 ---- 3 files changed, 12 deletions(-) (limited to 'include') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 98ba728aec08..7f1b38458743 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -107,10 +107,6 @@ struct inet_bind2_bucket { struct hlist_node bhash_node; /* List of sockets hashed to this bucket */ struct hlist_head owners; - /* bhash has twsk in owners, but bhash2 has twsk in - * deathrow not to add a member in struct sock_common. - */ - struct hlist_head deathrow; }; static inline struct net *ib_net(const struct inet_bind_bucket *ib) diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index b14999ff55db..f28da08a37b4 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -75,13 +75,9 @@ struct inet_timewait_sock { struct timer_list tw_timer; struct inet_bind_bucket *tw_tb; struct inet_bind2_bucket *tw_tb2; - struct hlist_node tw_bind2_node; }; #define tw_tclass tw_tos -#define twsk_for_each_bound_bhash2(__tw, list) \ - hlist_for_each_entry(__tw, list, tw_bind2_node) - static inline struct inet_timewait_sock *inet_twsk(const struct sock *sk) { return (struct inet_timewait_sock *)sk; diff --git a/include/net/sock.h b/include/net/sock.h index 21c86e4c71fc..2fb3ea2d2ec3 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -352,7 +352,6 @@ struct sk_filter; * @sk_txtime_report_errors: set report errors mode for SO_TXTIME * @sk_txtime_unused: unused txtime flags * @ns_tracker: tracker for netns reference - * @sk_bind2_node: bind node in the bhash2 table */ struct sock { /* @@ -544,7 +543,6 @@ struct sock { #endif struct rcu_head sk_rcu; netns_tracker ns_tracker; - struct hlist_node sk_bind2_node; }; enum sk_pacing { @@ -890,8 +888,6 @@ static inline void sk_add_bind_node(struct sock *sk, hlist_for_each_entry_safe(__sk, tmp, list, sk_node) #define sk_for_each_bound(__sk, list) \ hlist_for_each_entry(__sk, list, sk_bind_node) -#define sk_for_each_bound_bhash2(__sk, list) \ - hlist_for_each_entry(__sk, list, sk_bind2_node) /** * sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset -- cgit v1.2.3 From dcc3e46472d678f4af5ce1194a23649231c5d241 Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Mon, 18 Dec 2023 17:26:26 -0700 Subject: net: skbuff: Remove some excess struct-member documentation Remove documentation for nonexistent structure members, addressing these warnings: ./include/linux/skbuff.h:1063: warning: Excess struct member 'sp' description in 'sk_buff' ./include/linux/skbuff.h:1063: warning: Excess struct member 'nf_bridge' description in 'sk_buff' Signed-off-by: Jonathan Corbet Reviewed-by: Randy Dunlap Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ea5c8ab3ed00..50e92c8471dc 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -754,7 +754,6 @@ typedef unsigned char *sk_buff_data_t; * @dev_scratch: (aka @dev) alternate use of @dev when @dev would be %NULL * @cb: Control buffer. Free for use by every layer. Put private vars here * @_skb_refdst: destination entry (with norefcount bit) - * @sp: the security path, used for xfrm * @len: Length of actual data * @data_len: Data length * @mac_len: Length of link layer header @@ -788,7 +787,6 @@ typedef unsigned char *sk_buff_data_t; * @tcp_tsorted_anchor: list structure for TCP (tp->tsorted_sent_queue) * @_sk_redir: socket redirection information for skmsg * @_nfct: Associated connection, if any (with nfctinfo bits) - * @nf_bridge: Saved data about a bridged frame - see br_netfilter.c * @skb_iif: ifindex of device we arrived on * @tc_index: Traffic control index * @hash: the packet hash -- cgit v1.2.3 From b40584d145700addc70cc29e4f0850a4ed955b1c Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Tue, 19 Dec 2023 22:26:13 +0800 Subject: net/smc: compatible with 128-bits extended GID of virtual ISM device According to virtual ISM support feature defined by SMCv2.1, GIDs of virtual ISM device are UUIDs defined by RFC4122, which are 128-bits long. So some adaptation work is required. And note that the GIDs of existing platform firmware ISM devices still remain 64-bits long. Signed-off-by: Wen Gu Reviewed-by: Alexandra Winter Signed-off-by: David S. Miller --- include/net/smc.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/smc.h b/include/net/smc.h index a002552be29c..a0dc1187e96e 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -52,9 +52,14 @@ struct smcd_dmb { struct smcd_dev; struct ism_client; +struct smcd_gid { + u64 gid; + u64 gid_ext; +}; + struct smcd_ops { - int (*query_remote_gid)(struct smcd_dev *dev, u64 rgid, u32 vid_valid, - u32 vid); + int (*query_remote_gid)(struct smcd_dev *dev, struct smcd_gid *rgid, + u32 vid_valid, u32 vid); int (*register_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb, struct ism_client *client); int (*unregister_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb); @@ -62,14 +67,14 @@ struct smcd_ops { int (*del_vlan_id)(struct smcd_dev *dev, u64 vlan_id); int (*set_vlan_required)(struct smcd_dev *dev); int (*reset_vlan_required)(struct smcd_dev *dev); - int (*signal_event)(struct smcd_dev *dev, u64 rgid, u32 trigger_irq, - u32 event_code, u64 info); + int (*signal_event)(struct smcd_dev *dev, struct smcd_gid *rgid, + u32 trigger_irq, u32 event_code, u64 info); int (*move_data)(struct smcd_dev *dev, u64 dmb_tok, unsigned int idx, bool sf, unsigned int offset, void *data, unsigned int size); int (*supports_v2)(void); u8* (*get_system_eid)(void); - u64 (*get_local_gid)(struct smcd_dev *dev); + void (*get_local_gid)(struct smcd_dev *dev, struct smcd_gid *gid); u16 (*get_chid)(struct smcd_dev *dev); struct device* (*get_dev)(struct smcd_dev *dev); }; -- cgit v1.2.3 From 01fd1617dbc6f558efd1811f2bc433659d1e8304 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Tue, 19 Dec 2023 22:26:14 +0800 Subject: net/smc: support extended GID in SMC-D lgr netlink attribute Virtual ISM devices introduced in SMCv2.1 requires a 128 bit extended GID vs. the existing ISM 64bit GID. So the 2nd 64 bit of extended GID should be included in SMC-D linkgroup netlink attribute as well. Signed-off-by: Wen Gu Signed-off-by: David S. Miller --- include/uapi/linux/smc.h | 2 ++ include/uapi/linux/smc_diag.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/smc.h b/include/uapi/linux/smc.h index 837fcd4b0abc..b531e3ef011a 100644 --- a/include/uapi/linux/smc.h +++ b/include/uapi/linux/smc.h @@ -160,6 +160,8 @@ enum { SMC_NLA_LGR_D_CHID, /* u16 */ SMC_NLA_LGR_D_PAD, /* flag */ SMC_NLA_LGR_D_V2_COMMON, /* nest */ + SMC_NLA_LGR_D_EXT_GID, /* u64 */ + SMC_NLA_LGR_D_PEER_EXT_GID, /* u64 */ __SMC_NLA_LGR_D_MAX, SMC_NLA_LGR_D_MAX = __SMC_NLA_LGR_D_MAX - 1 }; diff --git a/include/uapi/linux/smc_diag.h b/include/uapi/linux/smc_diag.h index 8cb3a6fef553..58eceb7f5df2 100644 --- a/include/uapi/linux/smc_diag.h +++ b/include/uapi/linux/smc_diag.h @@ -107,6 +107,8 @@ struct smcd_diag_dmbinfo { /* SMC-D Socket internals */ __aligned_u64 my_gid; /* My GID */ __aligned_u64 token; /* Token of DMB */ __aligned_u64 peer_token; /* Token of remote DMBE */ + __aligned_u64 peer_gid_ext; /* Peer GID (extended part) */ + __aligned_u64 my_gid_ext; /* My GID (extended part) */ }; #endif /* _UAPI_SMC_DIAG_H_ */ -- cgit v1.2.3 From b3bf76024f645369e1fc45e0b08a2bd24f200d9b Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Tue, 19 Dec 2023 22:26:16 +0800 Subject: net/smc: manage system EID in SMC stack instead of ISM driver The System EID (SEID) is an internal EID that is used by the SMCv2 software stack that has a predefined and constant value representing the s390 physical machine that the OS is executing on. So it should be managed by SMC stack instead of ISM driver and be consistent for all ISMv2 device (including virtual ISM devices) on s390 architecture. Suggested-by: Alexandra Winter Signed-off-by: Wen Gu Reviewed-and-tested-by: Wenjia Zhang Reviewed-by: Alexandra Winter Signed-off-by: David S. Miller --- include/linux/ism.h | 1 - include/net/smc.h | 1 - 2 files changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/ism.h b/include/linux/ism.h index 9a4c204df3da..5428edd90982 100644 --- a/include/linux/ism.h +++ b/include/linux/ism.h @@ -86,7 +86,6 @@ int ism_register_dmb(struct ism_dev *dev, struct ism_dmb *dmb, int ism_unregister_dmb(struct ism_dev *dev, struct ism_dmb *dmb); int ism_move(struct ism_dev *dev, u64 dmb_tok, unsigned int idx, bool sf, unsigned int offset, void *data, unsigned int size); -u8 *ism_get_seid(void); const struct smcd_ops *ism_get_smcd_ops(void); diff --git a/include/net/smc.h b/include/net/smc.h index a0dc1187e96e..c9dcb30e3fd9 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -73,7 +73,6 @@ struct smcd_ops { bool sf, unsigned int offset, void *data, unsigned int size); int (*supports_v2)(void); - u8* (*get_system_eid)(void); void (*get_local_gid)(struct smcd_dev *dev, struct smcd_gid *gid); u16 (*get_chid)(struct smcd_dev *dev); struct device* (*get_dev)(struct smcd_dev *dev); -- cgit v1.2.3 From b1dffcf0da221d1f9d8007dfa2a41a325921d7fa Mon Sep 17 00:00:00 2001 From: Denis Kirjanov Date: Tue, 19 Dec 2023 17:38:20 +0300 Subject: net: remove SOCK_DEBUG macro Since there are no more users of the macro let's finally burn it Signed-off-by: Denis Kirjanov Signed-off-by: David S. Miller --- include/net/sock.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 2fb3ea2d2ec3..30d3980dd57c 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -76,19 +76,6 @@ * the other protocols. */ -/* Define this to get the SOCK_DBG debugging facility. */ -#define SOCK_DEBUGGING -#ifdef SOCK_DEBUGGING -#define SOCK_DEBUG(sk, msg...) do { if ((sk) && sock_flag((sk), SOCK_DBG)) \ - printk(KERN_DEBUG msg); } while (0) -#else -/* Validate arguments and do nothing */ -static inline __printf(2, 3) -void SOCK_DEBUG(const struct sock *sk, const char *msg, ...) -{ -} -#endif - /* This is the per-socket lock. The spinlock provides a synchronization * between user contexts and software interrupt processing, whereas the * mini-semaphore synchronizes multiple users amongst themselves. -- cgit v1.2.3 From 913b47d3424e7d99eaf34b798c47dfa840c64a08 Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Tue, 19 Dec 2023 15:16:19 -0300 Subject: net/sched: Introduce tc block netdev tracking infra This commit makes tc blocks track which ports have been added to them. And, with that, we'll be able to use this new information to send packets to the block's ports. Which will be done in the patch #3 of this series. Suggested-by: Jiri Pirko Co-developed-by: Jamal Hadi Salim Signed-off-by: Jamal Hadi Salim Co-developed-by: Pedro Tammela Signed-off-by: Pedro Tammela Signed-off-by: Victor Nogueira Signed-off-by: David S. Miller --- include/net/sch_generic.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 1d70c2c1572f..aca11127154f 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -19,6 +19,7 @@ #include #include #include +#include struct Qdisc_ops; struct qdisc_walker; @@ -456,6 +457,7 @@ struct tcf_chain { }; struct tcf_block { + struct xarray ports; /* datapath accessible */ /* Lock protects tcf_block and lifetime-management data of chains * attached to the block (refcnt, action_refcnt, explicitly_created). */ -- cgit v1.2.3 From a7042cf8f23191c3a460c627c0c39463afb5d335 Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Tue, 19 Dec 2023 15:16:20 -0300 Subject: net/sched: cls_api: Expose tc block to the datapath The datapath can now find the block of the port in which the packet arrived at. In the next patch we show a possible usage of this patch in a new version of mirred that multicasts to all ports except for the port in which the packet arrived on. Co-developed-by: Jamal Hadi Salim Signed-off-by: Jamal Hadi Salim Co-developed-by: Pedro Tammela Signed-off-by: Pedro Tammela Signed-off-by: Victor Nogueira Signed-off-by: David S. Miller --- include/net/sch_generic.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index aca11127154f..ba3e1b315de8 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -484,6 +484,8 @@ struct tcf_block { struct mutex proto_destroy_lock; /* Lock for proto_destroy hashtable. */ }; +struct tcf_block *tcf_block_lookup(struct net *net, u32 block_index); + static inline bool lockdep_tcf_chain_is_locked(struct tcf_chain *chain) { return lockdep_is_held(&chain->filter_chain_lock); -- cgit v1.2.3 From 42f39036cda808d3de243192a2cf5125f12f3047 Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Tue, 19 Dec 2023 15:16:23 -0300 Subject: net/sched: act_mirred: Allow mirred to block So far the mirred action has dealt with syntax that handles mirror/redirection for netdev. A matching packet is redirected or mirrored to a target netdev. In this patch we enable mirred to mirror to a tc block as well. IOW, the new syntax looks as follows: ... mirred [index INDEX] < | > > Examples of mirroring or redirecting to a tc block: $ tc filter add block 22 protocol ip pref 25 \ flower dst_ip 192.168.0.0/16 action mirred egress mirror blockid 22 $ tc filter add block 22 protocol ip pref 25 \ flower dst_ip 10.10.10.10/32 action mirred egress redirect blockid 22 Co-developed-by: Jamal Hadi Salim Signed-off-by: Jamal Hadi Salim Co-developed-by: Pedro Tammela Signed-off-by: Pedro Tammela Signed-off-by: Victor Nogueira Signed-off-by: David S. Miller --- include/net/tc_act/tc_mirred.h | 1 + include/uapi/linux/tc_act/tc_mirred.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/tc_act/tc_mirred.h b/include/net/tc_act/tc_mirred.h index 32ce8ea36950..75722d967bf2 100644 --- a/include/net/tc_act/tc_mirred.h +++ b/include/net/tc_act/tc_mirred.h @@ -8,6 +8,7 @@ struct tcf_mirred { struct tc_action common; int tcfm_eaction; + u32 tcfm_blockid; bool tcfm_mac_header_xmit; struct net_device __rcu *tcfm_dev; netdevice_tracker tcfm_dev_tracker; diff --git a/include/uapi/linux/tc_act/tc_mirred.h b/include/uapi/linux/tc_act/tc_mirred.h index 2500a0005d05..c61e76f3c23b 100644 --- a/include/uapi/linux/tc_act/tc_mirred.h +++ b/include/uapi/linux/tc_act/tc_mirred.h @@ -21,6 +21,7 @@ enum { TCA_MIRRED_TM, TCA_MIRRED_PARMS, TCA_MIRRED_PAD, + TCA_MIRRED_BLOCKID, __TCA_MIRRED_MAX }; #define TCA_MIRRED_MAX (__TCA_MIRRED_MAX - 1) -- cgit v1.2.3 From 90abde49ea85a8af9a56bbab8c419aefc77f919a Mon Sep 17 00:00:00 2001 From: "Radu Pirea (NXP OSS)" Date: Tue, 19 Dec 2023 16:53:25 +0200 Subject: net: rename dsa_realloc_skb to skb_ensure_writable_head_tail Rename dsa_realloc_skb to skb_ensure_writable_head_tail and move it to skbuff.c to use it as helper. Signed-off-by: Radu Pirea (NXP OSS) Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 50e92c8471dc..a5ae952454c8 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4007,6 +4007,7 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, netdev_features_t features unsigned int offset); struct sk_buff *skb_vlan_untag(struct sk_buff *skb); int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len); +int skb_ensure_writable_head_tail(struct sk_buff *skb, struct net_device *dev); int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci); int skb_vlan_pop(struct sk_buff *skb); int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci); -- cgit v1.2.3 From b1c036e835b67320316e20e562cc3b4daf8fa08b Mon Sep 17 00:00:00 2001 From: "Radu Pirea (NXP OSS)" Date: Tue, 19 Dec 2023 16:53:27 +0200 Subject: net: macsec: move sci_to_cpu to macsec header Move sci_to_cpu to the MACsec header to use it in drivers. Signed-off-by: Radu Pirea (NXP OSS) Signed-off-by: David S. Miller --- include/net/macsec.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/macsec.h b/include/net/macsec.h index ebf9bc54036a..a5665e9623f2 100644 --- a/include/net/macsec.h +++ b/include/net/macsec.h @@ -325,4 +325,9 @@ static inline void *macsec_netdev_priv(const struct net_device *dev) return netdev_priv(dev); } +static inline u64 sci_to_cpu(sci_t sci) +{ + return be64_to_cpu((__force __be64)sci); +} + #endif /* _NET_MACSEC_H_ */ -- cgit v1.2.3 From eb97b9bd38f93ab7cab0be2a36a7f6180c004ac2 Mon Sep 17 00:00:00 2001 From: "Radu Pirea (NXP OSS)" Date: Tue, 19 Dec 2023 16:53:28 +0200 Subject: net: macsec: documentation for macsec_context and macsec_ops Add description for fields of struct macsec_context and struct macsec_ops. Signed-off-by: Radu Pirea (NXP OSS) Signed-off-by: David S. Miller --- include/net/macsec.h | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'include') diff --git a/include/net/macsec.h b/include/net/macsec.h index a5665e9623f2..0821fa5088c0 100644 --- a/include/net/macsec.h +++ b/include/net/macsec.h @@ -247,6 +247,23 @@ struct macsec_secy { /** * struct macsec_context - MACsec context for hardware offloading + * @netdev: a valid pointer to a struct net_device if @offload == + * MACSEC_OFFLOAD_MAC + * @phydev: a valid pointer to a struct phy_device if @offload == + * MACSEC_OFFLOAD_PHY + * @offload: MACsec offload status + * @secy: pointer to a MACsec SecY + * @rx_sc: pointer to a RX SC + * @update_pn: when updating the SA, update the next PN + * @assoc_num: association number of the target SA + * @key: key of the target SA + * @rx_sa: pointer to an RX SA if a RX SA is added/updated/removed + * @tx_sa: pointer to an TX SA if a TX SA is added/updated/removed + * @tx_sc_stats: pointer to TX SC stats structure + * @tx_sa_stats: pointer to TX SA stats structure + * @rx_sc_stats: pointer to RX SC stats structure + * @rx_sa_stats: pointer to RX SA stats structure + * @dev_stats: pointer to dev stats structure */ struct macsec_context { union { @@ -277,6 +294,28 @@ struct macsec_context { /** * struct macsec_ops - MACsec offloading operations + * @mdo_dev_open: called when the MACsec interface transitions to the up state + * @mdo_dev_stop: called when the MACsec interface transitions to the down + * state + * @mdo_add_secy: called when a new SecY is added + * @mdo_upd_secy: called when the SecY flags are changed or the MAC address of + * the MACsec interface is changed + * @mdo_del_secy: called when the hw offload is disabled or the MACsec + * interface is removed + * @mdo_add_rxsc: called when a new RX SC is added + * @mdo_upd_rxsc: called when a certain RX SC is updated + * @mdo_del_rxsc: called when a certain RX SC is removed + * @mdo_add_rxsa: called when a new RX SA is added + * @mdo_upd_rxsa: called when a certain RX SA is updated + * @mdo_del_rxsa: called when a certain RX SA is removed + * @mdo_add_txsa: called when a new TX SA is added + * @mdo_upd_txsa: called when a certain TX SA is updated + * @mdo_del_txsa: called when a certain TX SA is removed + * @mdo_get_dev_stats: called when dev stats are read + * @mdo_get_tx_sc_stats: called when TX SC stats are read + * @mdo_get_tx_sa_stats: called when TX SA stats are read + * @mdo_get_rx_sc_stats: called when RX SC stats are read + * @mdo_get_rx_sa_stats: called when RX SA stats are read */ struct macsec_ops { /* Device wide */ -- cgit v1.2.3 From a73d8779d61ad99b966c932a1715bd4d9006a9de Mon Sep 17 00:00:00 2001 From: "Radu Pirea (NXP OSS)" Date: Tue, 19 Dec 2023 16:53:30 +0200 Subject: net: macsec: introduce mdo_insert_tx_tag Offloading MACsec in PHYs requires inserting the SecTAG and the ICV in the ethernet frame. This operation will increase the frame size with up to 32 bytes. If the frames are sent at line rate, the PHY will not have enough room to insert the SecTAG and the ICV. Some PHYs use a hardware buffer to store a number of ethernet frames and, if it fills up, a pause frame is sent to the MAC to control the flow. This HW implementation does not need any modification in the stack. Other PHYs might offer to use a specific ethertype with some padding bytes present in the ethernet frame. This ethertype and its associated bytes will be replaced by the SecTAG and ICV. mdo_insert_tx_tag allows the PHY drivers to add any specific tag in the skb. Signed-off-by: Radu Pirea (NXP OSS) Signed-off-by: David S. Miller --- include/net/macsec.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/net/macsec.h b/include/net/macsec.h index 0821fa5088c0..dbd22180cc5c 100644 --- a/include/net/macsec.h +++ b/include/net/macsec.h @@ -316,6 +316,11 @@ struct macsec_context { * @mdo_get_tx_sa_stats: called when TX SA stats are read * @mdo_get_rx_sc_stats: called when RX SC stats are read * @mdo_get_rx_sa_stats: called when RX SA stats are read + * @mdo_insert_tx_tag: called to insert the TX tag + * @needed_headroom: number of bytes reserved at the beginning of the sk_buff + * for the TX tag + * @needed_tailroom: number of bytes reserved at the end of the sk_buff for the + * TX tag */ struct macsec_ops { /* Device wide */ @@ -342,6 +347,11 @@ struct macsec_ops { int (*mdo_get_tx_sa_stats)(struct macsec_context *ctx); int (*mdo_get_rx_sc_stats)(struct macsec_context *ctx); int (*mdo_get_rx_sa_stats)(struct macsec_context *ctx); + /* Offload tag */ + int (*mdo_insert_tx_tag)(struct phy_device *phydev, + struct sk_buff *skb); + unsigned int needed_headroom; + unsigned int needed_tailroom; }; void macsec_pn_wrapped(struct macsec_secy *secy, struct macsec_tx_sa *tx_sa); -- cgit v1.2.3 From 144377c340f292c5d8c039eb790318754eec557e Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Tue, 19 Dec 2023 16:51:12 -0700 Subject: net: sock: remove excess structure-member documentation Remove a couple of kerneldoc entries for struct members that do not exist, addressing these warnings: ./include/net/sock.h:548: warning: Excess struct member '__sk_flags_offset' description in 'sock' ./include/net/sock.h:548: warning: Excess struct member 'sk_padding' description in 'sock' Signed-off-by: Jonathan Corbet Reviewed-by: Randy Dunlap Signed-off-by: David S. Miller --- include/net/sock.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 30d3980dd57c..a7f815c7cfdf 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -264,8 +264,6 @@ struct sk_filter; * @sk_pacing_status: Pacing status (requested, handled by sch_fq) * @sk_max_pacing_rate: Maximum pacing rate (%SO_MAX_PACING_RATE) * @sk_sndbuf: size of send buffer in bytes - * @__sk_flags_offset: empty field used to determine location of bitfield - * @sk_padding: unused element for alignment * @sk_no_check_tx: %SO_NO_CHECK setting, set checksum in TX packets * @sk_no_check_rx: allow zero checksum in RX packets * @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO) -- cgit v1.2.3 From d0c3891db2d279b2f5ff8fd174e0b09e75dea039 Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Tue, 19 Dec 2023 16:53:46 -0700 Subject: ethtool: reformat kerneldoc for struct ethtool_link_settings The kernel doc comments for struct ethtool_link_settings includes documentation for three fields that were never present there, leading to these docs-build warnings: ./include/uapi/linux/ethtool.h:2207: warning: Excess struct member 'supported' description in 'ethtool_link_settings' ./include/uapi/linux/ethtool.h:2207: warning: Excess struct member 'advertising' description in 'ethtool_link_settings' ./include/uapi/linux/ethtool.h:2207: warning: Excess struct member 'lp_advertising' description in 'ethtool_link_settings' Remove the entries to make the warnings go away. There was some information there on how data in >link_mode_masks is formatted; move that to the body of the comment to preserve it. Signed-off-by: Jonathan Corbet Reviewed-by: Randy Dunlap Signed-off-by: David S. Miller --- include/uapi/linux/ethtool.h | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 0787d561ace0..85c412c23ab5 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -2139,18 +2139,6 @@ enum ethtool_reset_flags { * refused. For drivers: ignore this field (use kernel's * __ETHTOOL_LINK_MODE_MASK_NBITS instead), any change to it will * be overwritten by kernel. - * @supported: Bitmap with each bit meaning given by - * %ethtool_link_mode_bit_indices for the link modes, physical - * connectors and other link features for which the interface - * supports autonegotiation or auto-detection. Read-only. - * @advertising: Bitmap with each bit meaning given by - * %ethtool_link_mode_bit_indices for the link modes, physical - * connectors and other link features that are advertised through - * autonegotiation or enabled for auto-detection. - * @lp_advertising: Bitmap with each bit meaning given by - * %ethtool_link_mode_bit_indices for the link modes, and other - * link features that the link partner advertised through - * autonegotiation; 0 if unknown or not applicable. Read-only. * @transceiver: Used to distinguish different possible PHY types, * reported consistently by PHYLIB. Read-only. * @master_slave_cfg: Master/slave port mode. @@ -2192,6 +2180,21 @@ enum ethtool_reset_flags { * %set_link_ksettings() should validate all fields other than @cmd * and @link_mode_masks_nwords that are not described as read-only or * deprecated, and must ignore all fields described as read-only. + * + * @link_mode_masks is divided into three bitfields, each of length + * @link_mode_masks_nwords: + * - supported: Bitmap with each bit meaning given by + * %ethtool_link_mode_bit_indices for the link modes, physical + * connectors and other link features for which the interface + * supports autonegotiation or auto-detection. Read-only. + * - advertising: Bitmap with each bit meaning given by + * %ethtool_link_mode_bit_indices for the link modes, physical + * connectors and other link features that are advertised through + * autonegotiation or enabled for auto-detection. + * - lp_advertising: Bitmap with each bit meaning given by + * %ethtool_link_mode_bit_indices for the link modes, and other + * link features that the link partner advertised through + * autonegotiation; 0 if unknown or not applicable. Read-only. */ struct ethtool_link_settings { __u32 cmd; -- cgit v1.2.3 From 1271ca00aa7f9bb3fd94cb7ac8f654de71099580 Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Tue, 19 Dec 2023 16:55:31 -0700 Subject: ethtool: reformat kerneldoc for struct ethtool_fec_stats The kerneldoc comment for struct ethtool_fec_stats attempts to describe the "total" and "lanes" fields of the ethtool_fec_stat substructure in a way leading to these warnings: ./include/linux/ethtool.h:424: warning: Excess struct member 'lane' description in 'ethtool_fec_stats' ./include/linux/ethtool.h:424: warning: Excess struct member 'total' description in 'ethtool_fec_stats' Reformat the comment to retain the information while eliminating the warnings. Signed-off-by: Jonathan Corbet Reviewed-by: Randy Dunlap Signed-off-by: David S. Miller --- include/linux/ethtool.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index cfcd952a1d4f..325e0778e937 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -411,8 +411,10 @@ struct ethtool_pause_stats { * not entire FEC data blocks. This is a non-standard statistic. * Reported to user space as %ETHTOOL_A_FEC_STAT_CORR_BITS. * - * @lane: per-lane/PCS-instance counts as defined by the standard - * @total: error counts for the entire port, for drivers incapable of reporting + * For each of the above fields, the two substructure members are: + * + * - @lanes: per-lane/PCS-instance counts as defined by the standard + * - @total: error counts for the entire port, for drivers incapable of reporting * per-lane stats * * Drivers should fill in either only total or per-lane statistics, core -- cgit v1.2.3 From f732ba4ac9f3626e235cc33a4f1756f17884dd32 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 20 Dec 2023 08:41:18 +0100 Subject: iucv: make iucv_bus const Now that the driver core can properly handle constant struct bus_type, move the iucv_bus variable to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. Cc: Wenjia Zhang Cc: "David S. Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: linux-s390@vger.kernel.org Cc: netdev@vger.kernel.org Acked-by: Alexandra Winter Signed-off-by: Greg Kroah-Hartman Signed-off-by: David S. Miller --- include/net/iucv/iucv.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/iucv/iucv.h b/include/net/iucv/iucv.h index f9e88401d7da..8b2055d64a6b 100644 --- a/include/net/iucv/iucv.h +++ b/include/net/iucv/iucv.h @@ -80,7 +80,7 @@ struct iucv_array { u32 length; } __attribute__ ((aligned (8))); -extern struct bus_type iucv_bus; +extern const struct bus_type iucv_bus; extern struct device *iucv_root; /* @@ -489,7 +489,7 @@ struct iucv_interface { int (*path_sever)(struct iucv_path *path, u8 userdata[16]); int (*iucv_register)(struct iucv_handler *handler, int smp); void (*iucv_unregister)(struct iucv_handler *handler, int smp); - struct bus_type *bus; + const struct bus_type *bus; struct device *root; }; -- cgit v1.2.3 From cd4d7263d58ab98fd4dee876776e4da6c328faa3 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 20 Dec 2023 17:43:58 +0200 Subject: genetlink: Use internal flags for multicast groups As explained in commit e03781879a0d ("drop_monitor: Require 'CAP_SYS_ADMIN' when joining "events" group"), the "flags" field in the multicast group structure reuses uAPI flags despite the field not being exposed to user space. This makes it impossible to extend its use without adding new uAPI flags, which is inappropriate for internal kernel checks. Solve this by adding internal flags (i.e., "GENL_MCAST_*") and convert the existing users to use them instead of the uAPI flags. Tested using the reproducers in commit 44ec98ea5ea9 ("psample: Require 'CAP_NET_ADMIN' when joining "packets" group") and commit e03781879a0d ("drop_monitor: Require 'CAP_SYS_ADMIN' when joining "events" group"). No functional changes intended. Signed-off-by: Ido Schimmel Reviewed-by: Mat Martineau Reviewed-by: Andy Shevchenko Signed-off-by: David S. Miller --- include/net/genetlink.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 85c63d4f16dd..e61469129402 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -8,16 +8,19 @@ #define GENLMSG_DEFAULT_SIZE (NLMSG_DEFAULT_SIZE - GENL_HDRLEN) +/* Binding to multicast group requires %CAP_NET_ADMIN */ +#define GENL_MCAST_CAP_NET_ADMIN BIT(0) +/* Binding to multicast group requires %CAP_SYS_ADMIN */ +#define GENL_MCAST_CAP_SYS_ADMIN BIT(1) + /** * struct genl_multicast_group - generic netlink multicast group * @name: name of the multicast group, names are per-family - * @flags: GENL_* flags (%GENL_ADMIN_PERM or %GENL_UNS_ADMIN_PERM) - * @cap_sys_admin: whether %CAP_SYS_ADMIN is required for binding + * @flags: GENL_MCAST_* flags */ struct genl_multicast_group { char name[GENL_NAMSIZ]; u8 flags; - u8 cap_sys_admin:1; }; struct genl_split_ops; -- cgit v1.2.3 From cff9c565e65f3622e8dc1dcc21c1520a083dff35 Mon Sep 17 00:00:00 2001 From: Luiz Angelo Daros de Luca Date: Wed, 20 Dec 2023 01:52:29 -0300 Subject: net: mdio: get/put device node during (un)registration The __of_mdiobus_register() function was storing the device node in dev.of_node without increasing its reference count. It implicitly relied on the caller to maintain the allocated node until the mdiobus was unregistered. Now, __of_mdiobus_register() will acquire the node before assigning it, and of_mdiobus_unregister_callback() will be called at the end of mdio_unregister(). Drivers can now release the node immediately after MDIO registration. Some of them are already doing that even before this patch. Signed-off-by: Luiz Angelo Daros de Luca Signed-off-by: David S. Miller --- include/linux/phy.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index e9e85d347587..ede891776d8b 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -434,6 +434,9 @@ struct mii_bus { /** @shared: shared state across different PHYs */ struct phy_package_shared *shared[PHY_MAX_ADDR]; + + /** @__unregister_callback: called at the last step of unregistration */ + void (*__unregister_callback)(struct mii_bus *bus); }; #define to_mii_bus(d) container_of(d, struct mii_bus, dev) -- cgit v1.2.3 From 02018c544ef113e980a2349eba89003d6f399d22 Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Thu, 21 Dec 2023 19:00:34 +0100 Subject: net: phy: Introduce ethernet link topology representation Link topologies containing multiple network PHYs attached to the same net_device can be found when using a PHY as a media converter for use with an SFP connector, on which an SFP transceiver containing a PHY can be used. With the current model, the transceiver's PHY can't be used for operations such as cable testing, timestamping, macsec offload, etc. The reason being that most of the logic for these configuration, coming from either ethtool netlink or ioctls tend to use netdev->phydev, which in multi-phy systems will reference the PHY closest to the MAC. Introduce a numbering scheme allowing to enumerate PHY devices that belong to any netdev, which can in turn allow userspace to take more precise decisions with regard to each PHY's configuration. The numbering is maintained per-netdev, in a phy_device_list. The numbering works similarly to a netdevice's ifindex, with identifiers that are only recycled once INT_MAX has been reached. This prevents races that could occur between PHY listing and SFP transceiver removal/insertion. The identifiers are assigned at phy_attach time, as the numbering depends on the netdevice the phy is attached to. Signed-off-by: Maxime Chevallier Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 +- include/linux/phy.h | 4 ++ include/linux/phy_link_topology.h | 67 ++++++++++++++++++++++++++++++++++ include/linux/phy_link_topology_core.h | 19 ++++++++++ include/uapi/linux/ethtool.h | 16 ++++++++ 5 files changed, 109 insertions(+), 1 deletion(-) create mode 100644 include/linux/phy_link_topology.h create mode 100644 include/linux/phy_link_topology_core.h (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 75c7725e5e4f..5baa5517f533 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -40,7 +40,6 @@ #include #endif #include - #include #include #include @@ -52,6 +51,7 @@ #include #include #include +#include struct netpoll_info; struct device; @@ -2047,6 +2047,7 @@ enum netdev_stat_type { * @fcoe_ddp_xid: Max exchange id for FCoE LRO by ddp * * @priomap: XXX: need comments on this one + * @link_topo: Physical link topology tracking attached PHYs * @phydev: Physical device may attach itself * for hardware timestamping * @sfp_bus: attached &struct sfp_bus structure. @@ -2441,6 +2442,7 @@ struct net_device { #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) struct netprio_map __rcu *priomap; #endif + struct phy_link_topology link_topo; struct phy_device *phydev; struct sfp_bus *sfp_bus; struct lock_class_key *qdisc_tx_busylock; diff --git a/include/linux/phy.h b/include/linux/phy.h index ede891776d8b..ea9416797b89 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -547,6 +547,9 @@ struct macsec_ops; * @drv: Pointer to the driver for this PHY instance * @devlink: Create a link between phy dev and mac dev, if the external phy * used by current mac interface is managed by another mac interface. + * @phyindex: Unique id across the phy's parent tree of phys to address the PHY + * from userspace, similar to ifindex. A zero index means the PHY + * wasn't assigned an id yet. * @phy_id: UID for this device found during discovery * @c45_ids: 802.3-c45 Device Identifiers if is_c45. * @is_c45: Set to true if this PHY uses clause 45 addressing. @@ -646,6 +649,7 @@ struct phy_device { struct device_link *devlink; + u32 phyindex; u32 phy_id; struct phy_c45_device_ids c45_ids; diff --git a/include/linux/phy_link_topology.h b/include/linux/phy_link_topology.h new file mode 100644 index 000000000000..91902263ec0e --- /dev/null +++ b/include/linux/phy_link_topology.h @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * PHY device list allow maintaining a list of PHY devices that are + * part of a netdevice's link topology. PHYs can for example be chained, + * as is the case when using a PHY that exposes an SFP module, on which an + * SFP transceiver that embeds a PHY is connected. + * + * This list can then be used by userspace to leverage individual PHY + * capabilities. + */ +#ifndef __PHY_LINK_TOPOLOGY_H +#define __PHY_LINK_TOPOLOGY_H + +#include +#include + +struct xarray; +struct phy_device; +struct net_device; +struct sfp_bus; + +struct phy_device_node { + enum phy_upstream upstream_type; + + union { + struct net_device *netdev; + struct phy_device *phydev; + } upstream; + + struct sfp_bus *parent_sfp_bus; + + struct phy_device *phy; +}; + +static inline struct phy_device * +phy_link_topo_get_phy(struct phy_link_topology *topo, u32 phyindex) +{ + struct phy_device_node *pdn = xa_load(&topo->phys, phyindex); + + if (pdn) + return pdn->phy; + + return NULL; +} + +#if IS_ENABLED(CONFIG_PHYLIB) +int phy_link_topo_add_phy(struct phy_link_topology *topo, + struct phy_device *phy, + enum phy_upstream upt, void *upstream); + +void phy_link_topo_del_phy(struct phy_link_topology *lt, struct phy_device *phy); + +#else +static inline int phy_link_topo_add_phy(struct phy_link_topology *topo, + struct phy_device *phy, + enum phy_upstream upt, void *upstream) +{ + return 0; +} + +static inline void phy_link_topo_del_phy(struct phy_link_topology *topo, + struct phy_device *phy) +{ +} +#endif + +#endif /* __PHY_LINK_TOPOLOGY_H */ diff --git a/include/linux/phy_link_topology_core.h b/include/linux/phy_link_topology_core.h new file mode 100644 index 000000000000..78c75f909489 --- /dev/null +++ b/include/linux/phy_link_topology_core.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __PHY_LINK_TOPOLOGY_CORE_H +#define __PHY_LINK_TOPOLOGY_CORE_H + +struct xarray; + +struct phy_link_topology { + struct xarray phys; + + u32 next_phy_index; +}; + +static inline void phy_link_topo_init(struct phy_link_topology *topo) +{ + xa_init_flags(&topo->phys, XA_FLAGS_ALLOC1); + topo->next_phy_index = 1; +} + +#endif /* __PHY_LINK_TOPOLOGY_CORE_H */ diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 85c412c23ab5..60801df9d8c0 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -2219,4 +2219,20 @@ struct ethtool_link_settings { * __u32 map_lp_advertising[link_mode_masks_nwords]; */ }; + +/** + * enum phy_upstream - Represents the upstream component a given PHY device + * is connected to, as in what is on the other end of the MII bus. Most PHYs + * will be attached to an Ethernet MAC controller, but in some cases, there's + * an intermediate PHY used as a media-converter, which will driver another + * MII interface as its output. + * @PHY_UPSTREAM_MAC: Upstream component is a MAC (a switch port, + * or ethernet controller) + * @PHY_UPSTREAM_PHY: Upstream component is a PHY (likely a media converter) + */ +enum phy_upstream { + PHY_UPSTREAM_MAC, + PHY_UPSTREAM_PHY, +}; + #endif /* _UAPI_LINUX_ETHTOOL_H */ -- cgit v1.2.3 From 9c5625f559ad6fe9f6f733c11475bf470e637d34 Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Thu, 21 Dec 2023 19:00:35 +0100 Subject: net: sfp: pass the phy_device when disconnecting an sfp module's PHY Pass the phy_device as a parameter to the sfp upstream .disconnect_phy operation. This is preparatory work to help track phy devices across a net_device's link. Signed-off-by: Maxime Chevallier Signed-off-by: David S. Miller --- include/linux/sfp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sfp.h b/include/linux/sfp.h index 9346cd44814d..0573e53b0c11 100644 --- a/include/linux/sfp.h +++ b/include/linux/sfp.h @@ -544,7 +544,7 @@ struct sfp_upstream_ops { void (*link_down)(void *priv); void (*link_up)(void *priv); int (*connect_phy)(void *priv, struct phy_device *); - void (*disconnect_phy)(void *priv); + void (*disconnect_phy)(void *priv, struct phy_device *); }; #if IS_ENABLED(CONFIG_SFP) -- cgit v1.2.3 From 034fcc210349b873ece7356905be5c6ca11eef2a Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Thu, 21 Dec 2023 19:00:36 +0100 Subject: net: phy: add helpers to handle sfp phy connect/disconnect There are a few PHY drivers that can handle SFP modules through their sfp_upstream_ops. Introduce Phylib helpers to keep track of connected SFP PHYs in a netdevice's namespace, by adding the SFP PHY to the upstream PHY's netdev's namespace. By doing so, these SFP PHYs can be enumerated and exposed to users, which will be able to use their capabilities. Signed-off-by: Maxime Chevallier Signed-off-by: David S. Miller --- include/linux/phy.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index ea9416797b89..ac22b8e28a85 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1729,6 +1729,8 @@ int phy_suspend(struct phy_device *phydev); int phy_resume(struct phy_device *phydev); int __phy_resume(struct phy_device *phydev); int phy_loopback(struct phy_device *phydev, bool enable); +int phy_sfp_connect_phy(void *upstream, struct phy_device *phy); +void phy_sfp_disconnect_phy(void *upstream, struct phy_device *phy); void phy_sfp_attach(void *upstream, struct sfp_bus *bus); void phy_sfp_detach(void *upstream, struct sfp_bus *bus); int phy_sfp_probe(struct phy_device *phydev, -- cgit v1.2.3 From dedd702a35793ab462fce4c737eeba0badf9718e Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Thu, 21 Dec 2023 19:00:37 +0100 Subject: net: sfp: Add helper to return the SFP bus name Knowing the bus name is helpful when we want to expose the link topology to userspace, add a helper to return the SFP bus name. Signed-off-by: Maxime Chevallier Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/sfp.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/sfp.h b/include/linux/sfp.h index 0573e53b0c11..55c0ab17c9e2 100644 --- a/include/linux/sfp.h +++ b/include/linux/sfp.h @@ -570,6 +570,7 @@ struct sfp_bus *sfp_bus_find_fwnode(const struct fwnode_handle *fwnode); int sfp_bus_add_upstream(struct sfp_bus *bus, void *upstream, const struct sfp_upstream_ops *ops); void sfp_bus_del_upstream(struct sfp_bus *bus); +const char *sfp_get_name(struct sfp_bus *bus); #else static inline int sfp_parse_port(struct sfp_bus *bus, const struct sfp_eeprom_id *id, @@ -648,6 +649,11 @@ static inline int sfp_bus_add_upstream(struct sfp_bus *bus, void *upstream, static inline void sfp_bus_del_upstream(struct sfp_bus *bus) { } + +static inline const char *sfp_get_name(struct sfp_bus *bus) +{ + return NULL; +} #endif #endif -- cgit v1.2.3 From 2ab0edb505faa9ac90dee1732571390f074e8113 Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Thu, 21 Dec 2023 19:00:38 +0100 Subject: net: ethtool: Allow passing a phy index for some commands Some netlink commands are target towards ethernet PHYs, to control some of their features. As there's several such commands, add the ability to pass a PHY index in the ethnl request, which will populate the generic ethnl_req_info with the relevant phydev when the command targets a PHY. Signed-off-by: Maxime Chevallier Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/uapi/linux/ethtool_netlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 3f89074aa06c..422e8cfdd98c 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -133,6 +133,7 @@ enum { ETHTOOL_A_HEADER_DEV_INDEX, /* u32 */ ETHTOOL_A_HEADER_DEV_NAME, /* string */ ETHTOOL_A_HEADER_FLAGS, /* u32 - ETHTOOL_FLAG_* */ + ETHTOOL_A_HEADER_PHY_INDEX, /* u32 */ /* add new constants above here */ __ETHTOOL_A_HEADER_CNT, -- cgit v1.2.3 From 63d5eaf35ac36cad00cfb3809d794ef0078c822b Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Thu, 21 Dec 2023 19:00:40 +0100 Subject: net: ethtool: Introduce a command to list PHYs on an interface As we have the ability to track the PHYs connected to a net_device through the link_topology, we can expose this list to userspace. This allows userspace to use these identifiers for phy-specific commands and take the decision of which PHY to target by knowing the link topology. Add PHY_GET and PHY_DUMP, which can be a filtered DUMP operation to list devices on only one interface. Signed-off-by: Maxime Chevallier Signed-off-by: David S. Miller --- include/uapi/linux/ethtool_netlink.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 422e8cfdd98c..00cd7ad16709 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -57,6 +57,7 @@ enum { ETHTOOL_MSG_PLCA_GET_STATUS, ETHTOOL_MSG_MM_GET, ETHTOOL_MSG_MM_SET, + ETHTOOL_MSG_PHY_GET, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, @@ -109,6 +110,8 @@ enum { ETHTOOL_MSG_PLCA_NTF, ETHTOOL_MSG_MM_GET_REPLY, ETHTOOL_MSG_MM_NTF, + ETHTOOL_MSG_PHY_GET_REPLY, + ETHTOOL_MSG_PHY_NTF, /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, @@ -977,6 +980,32 @@ enum { ETHTOOL_A_MM_MAX = (__ETHTOOL_A_MM_CNT - 1) }; +enum { + ETHTOOL_A_PHY_UPSTREAM_UNSPEC, + ETHTOOL_A_PHY_UPSTREAM_INDEX, /* u32 */ + ETHTOOL_A_PHY_UPSTREAM_SFP_NAME, /* string */ + + /* add new constants above here */ + __ETHTOOL_A_PHY_UPSTREAM_CNT, + ETHTOOL_A_PHY_UPSTREAM_MAX = (__ETHTOOL_A_PHY_UPSTREAM_CNT - 1) +}; + +enum { + ETHTOOL_A_PHY_UNSPEC, + ETHTOOL_A_PHY_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_PHY_INDEX, /* u32 */ + ETHTOOL_A_PHY_DRVNAME, /* string */ + ETHTOOL_A_PHY_NAME, /* string */ + ETHTOOL_A_PHY_UPSTREAM_TYPE, /* u8 */ + ETHTOOL_A_PHY_UPSTREAM, /* nest - _A_PHY_UPSTREAM_* */ + ETHTOOL_A_PHY_DOWNSTREAM_SFP_NAME, /* string */ + ETHTOOL_A_PHY_ID, /* u32 */ + + /* add new constants above here */ + __ETHTOOL_A_PHY_CNT, + ETHTOOL_A_PHY_MAX = (__ETHTOOL_A_PHY_CNT - 1) +}; + /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 -- cgit v1.2.3 From 993498e537af9260e697219ce41b41b22b6199cc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 21 Dec 2023 14:07:47 +0000 Subject: net-device: move gso_partial_features to net_device_read_tx dev->gso_partial_features is read from tx fast path for GSO packets. Move it to appropriate section to avoid a cache line miss. Fixes: 43a71cd66b9c ("net-device: reorganize net_device fast path variables") Signed-off-by: Eric Dumazet Cc: Coco Li Cc: David Ahern Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5baa5517f533..d59db9adcc96 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2115,6 +2115,7 @@ struct net_device { const struct net_device_ops *netdev_ops; const struct header_ops *header_ops; struct netdev_queue *_tx; + netdev_features_t gso_partial_features; unsigned int real_num_tx_queues; unsigned int gso_max_size; unsigned int gso_ipv4_max_size; @@ -2211,7 +2212,6 @@ struct net_device { netdev_features_t vlan_features; netdev_features_t hw_enc_features; netdev_features_t mpls_features; - netdev_features_t gso_partial_features; unsigned int min_mtu; unsigned int max_mtu; -- cgit v1.2.3 From ba24ea129126362e7139fed4e13701ca5b71ac0b Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Thu, 21 Dec 2023 16:31:03 -0500 Subject: net/sched: Retire ipt action The tc ipt action was intended to run all netfilter/iptables target. Unfortunately it has not benefitted over the years from proper updates when netfilter changes, and for that reason it has remained rudimentary. Pinging a bunch of people that i was aware were using this indicates that removing it wont affect them. Retire it to reduce maintenance efforts. Buh-bye. Reviewed-by: Victor Noguiera Reviewed-by: Pedro Tammela Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/tc_act/tc_ipt.h | 17 ----------------- include/net/tc_wrapper.h | 4 ---- include/uapi/linux/pkt_cls.h | 4 ++-- include/uapi/linux/tc_act/tc_ipt.h | 20 -------------------- 4 files changed, 2 insertions(+), 43 deletions(-) delete mode 100644 include/net/tc_act/tc_ipt.h delete mode 100644 include/uapi/linux/tc_act/tc_ipt.h (limited to 'include') diff --git a/include/net/tc_act/tc_ipt.h b/include/net/tc_act/tc_ipt.h deleted file mode 100644 index 4225fcb1c6ba..000000000000 --- a/include/net/tc_act/tc_ipt.h +++ /dev/null @@ -1,17 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __NET_TC_IPT_H -#define __NET_TC_IPT_H - -#include - -struct xt_entry_target; - -struct tcf_ipt { - struct tc_action common; - u32 tcfi_hook; - char *tcfi_tname; - struct xt_entry_target *tcfi_t; -}; -#define to_ipt(a) ((struct tcf_ipt *)a) - -#endif /* __NET_TC_IPT_H */ diff --git a/include/net/tc_wrapper.h b/include/net/tc_wrapper.h index a6d481b5bcbc..a608546bcefc 100644 --- a/include/net/tc_wrapper.h +++ b/include/net/tc_wrapper.h @@ -117,10 +117,6 @@ static inline int tc_act(struct sk_buff *skb, const struct tc_action *a, if (a->ops->act == tcf_ife_act) return tcf_ife_act(skb, a, res); #endif -#if IS_BUILTIN(CONFIG_NET_ACT_IPT) - if (a->ops->act == tcf_ipt_act) - return tcf_ipt_act(skb, a, res); -#endif #if IS_BUILTIN(CONFIG_NET_ACT_SIMP) if (a->ops->act == tcf_simp_act) return tcf_simp_act(skb, a, res); diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index c7082cc60d21..2fec9b51d28d 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -99,7 +99,7 @@ enum { * versions. */ #define TCA_ACT_GACT 5 -#define TCA_ACT_IPT 6 +#define TCA_ACT_IPT 6 /* obsoleted, can be reused */ #define TCA_ACT_PEDIT 7 #define TCA_ACT_MIRRED 8 #define TCA_ACT_NAT 9 @@ -120,7 +120,7 @@ enum tca_id { TCA_ID_UNSPEC = 0, TCA_ID_POLICE = 1, TCA_ID_GACT = TCA_ACT_GACT, - TCA_ID_IPT = TCA_ACT_IPT, + TCA_ID_IPT = TCA_ACT_IPT, /* Obsoleted, can be reused */ TCA_ID_PEDIT = TCA_ACT_PEDIT, TCA_ID_MIRRED = TCA_ACT_MIRRED, TCA_ID_NAT = TCA_ACT_NAT, diff --git a/include/uapi/linux/tc_act/tc_ipt.h b/include/uapi/linux/tc_act/tc_ipt.h deleted file mode 100644 index c48d7da6750d..000000000000 --- a/include/uapi/linux/tc_act/tc_ipt.h +++ /dev/null @@ -1,20 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef __LINUX_TC_IPT_H -#define __LINUX_TC_IPT_H - -#include - -enum { - TCA_IPT_UNSPEC, - TCA_IPT_TABLE, - TCA_IPT_HOOK, - TCA_IPT_INDEX, - TCA_IPT_CNT, - TCA_IPT_TM, - TCA_IPT_TARG, - TCA_IPT_PAD, - __TCA_IPT_MAX -}; -#define TCA_IPT_MAX (__TCA_IPT_MAX - 1) - -#endif -- cgit v1.2.3 From 41bc3e8fc1f728085da0ca6dbc1bef4a2ddb543c Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sat, 23 Dec 2023 09:01:50 -0500 Subject: net/sched: Remove uapi support for rsvp classifier commit 265b4da82dbf ("net/sched: Retire rsvp classifier") retired the TC RSVP classifier. Remove UAPI for it. Iproute2 will sync by equally removing it from user space. Reviewed-by: Victor Nogueira Reviewed-by: Pedro Tammela Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 31 ------------------------------- 1 file changed, 31 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 2fec9b51d28d..fe922b61b99e 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -280,37 +280,6 @@ struct tc_u32_pcnt { #define TC_U32_MAXDEPTH 8 - -/* RSVP filter */ - -enum { - TCA_RSVP_UNSPEC, - TCA_RSVP_CLASSID, - TCA_RSVP_DST, - TCA_RSVP_SRC, - TCA_RSVP_PINFO, - TCA_RSVP_POLICE, - TCA_RSVP_ACT, - __TCA_RSVP_MAX -}; - -#define TCA_RSVP_MAX (__TCA_RSVP_MAX - 1 ) - -struct tc_rsvp_gpi { - __u32 key; - __u32 mask; - int offset; -}; - -struct tc_rsvp_pinfo { - struct tc_rsvp_gpi dpi; - struct tc_rsvp_gpi spi; - __u8 protocol; - __u8 tunnelid; - __u8 tunnelhdr; - __u8 pad; -}; - /* ROUTE filter */ enum { -- cgit v1.2.3 From 82b2545ed9a465e4c470d2dbbb461522f767c56f Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sat, 23 Dec 2023 09:01:51 -0500 Subject: net/sched: Remove uapi support for tcindex classifier commit 8c710f75256b ("net/sched: Retire tcindex classifier") retired the TC tcindex classifier. Remove UAPI for it. Iproute2 will sync by equally removing it from user space. Reviewed-by: Victor Nogueira Reviewed-by: Pedro Tammela Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index fe922b61b99e..ea277039f89d 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -310,22 +310,6 @@ enum { #define TCA_FW_MAX (__TCA_FW_MAX - 1) -/* TC index filter */ - -enum { - TCA_TCINDEX_UNSPEC, - TCA_TCINDEX_HASH, - TCA_TCINDEX_MASK, - TCA_TCINDEX_SHIFT, - TCA_TCINDEX_FALL_THROUGH, - TCA_TCINDEX_CLASSID, - TCA_TCINDEX_POLICE, - TCA_TCINDEX_ACT, - __TCA_TCINDEX_MAX -}; - -#define TCA_TCINDEX_MAX (__TCA_TCINDEX_MAX - 1) - /* Flow filter */ enum { -- cgit v1.2.3 From fe3b739a5472968d8d349522b6816bc4db82bc0f Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sat, 23 Dec 2023 09:01:52 -0500 Subject: net/sched: Remove uapi support for dsmark qdisc Commit bbe77c14ee61 ("net/sched: Retire dsmark qdisc") retired the dsmark classifier. Remove UAPI support for it. Iproute2 will sync by equally removing it from user space. Reviewed-by: Victor Nogueira Reviewed-by: Pedro Tammela Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index f762a10bfb78..1e3a2b9ddf7e 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -557,20 +557,6 @@ enum { #define TCA_CBQ_MAX (__TCA_CBQ_MAX - 1) -/* dsmark section */ - -enum { - TCA_DSMARK_UNSPEC, - TCA_DSMARK_INDICES, - TCA_DSMARK_DEFAULT_INDEX, - TCA_DSMARK_SET_TC_INDEX, - TCA_DSMARK_MASK, - TCA_DSMARK_VALUE, - __TCA_DSMARK_MAX, -}; - -#define TCA_DSMARK_MAX (__TCA_DSMARK_MAX - 1) - /* ATM section */ enum { -- cgit v1.2.3 From 26cc8714fc7f79a806c3d7ffa215b984c384ab4d Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sat, 23 Dec 2023 09:01:53 -0500 Subject: net/sched: Remove uapi support for ATM qdisc Commit fb38306ceb9e ("net/sched: Retire ATM qdisc") retired the ATM qdisc. Remove UAPI for it. Iproute2 will sync by equally removing it from user space. Reviewed-by: Victor Nogueira Reviewed-by: Pedro Tammela Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 1e3a2b9ddf7e..28f08acdad52 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -557,21 +557,6 @@ enum { #define TCA_CBQ_MAX (__TCA_CBQ_MAX - 1) -/* ATM section */ - -enum { - TCA_ATM_UNSPEC, - TCA_ATM_FD, /* file/socket descriptor */ - TCA_ATM_PTR, /* pointer to descriptor - later */ - TCA_ATM_HDR, /* LL header */ - TCA_ATM_EXCESS, /* excess traffic class (0 for CLP) */ - TCA_ATM_ADDR, /* PVC address (for output only) */ - TCA_ATM_STATE, /* VC state (ATM_VS_*; for output only) */ - __TCA_ATM_MAX, -}; - -#define TCA_ATM_MAX (__TCA_ATM_MAX - 1) - /* Network emulator */ enum { -- cgit v1.2.3 From 33241dca486264193ed68167c8eeae1fb197f3df Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sat, 23 Dec 2023 09:01:54 -0500 Subject: net/sched: Remove uapi support for CBQ qdisc Commit 051d44209842 ("net/sched: Retire CBQ qdisc") retired the CBQ qdisc. Remove UAPI for it. Iproute2 will sync by equally removing it from user space. Reviewed-by: Victor Nogueira Reviewed-by: Pedro Tammela Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 80 ------------------------------------------ 1 file changed, 80 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 28f08acdad52..a3cd0c2dc995 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -477,86 +477,6 @@ enum { #define TCA_HFSC_MAX (__TCA_HFSC_MAX - 1) - -/* CBQ section */ - -#define TC_CBQ_MAXPRIO 8 -#define TC_CBQ_MAXLEVEL 8 -#define TC_CBQ_DEF_EWMA 5 - -struct tc_cbq_lssopt { - unsigned char change; - unsigned char flags; -#define TCF_CBQ_LSS_BOUNDED 1 -#define TCF_CBQ_LSS_ISOLATED 2 - unsigned char ewma_log; - unsigned char level; -#define TCF_CBQ_LSS_FLAGS 1 -#define TCF_CBQ_LSS_EWMA 2 -#define TCF_CBQ_LSS_MAXIDLE 4 -#define TCF_CBQ_LSS_MINIDLE 8 -#define TCF_CBQ_LSS_OFFTIME 0x10 -#define TCF_CBQ_LSS_AVPKT 0x20 - __u32 maxidle; - __u32 minidle; - __u32 offtime; - __u32 avpkt; -}; - -struct tc_cbq_wrropt { - unsigned char flags; - unsigned char priority; - unsigned char cpriority; - unsigned char __reserved; - __u32 allot; - __u32 weight; -}; - -struct tc_cbq_ovl { - unsigned char strategy; -#define TC_CBQ_OVL_CLASSIC 0 -#define TC_CBQ_OVL_DELAY 1 -#define TC_CBQ_OVL_LOWPRIO 2 -#define TC_CBQ_OVL_DROP 3 -#define TC_CBQ_OVL_RCLASSIC 4 - unsigned char priority2; - __u16 pad; - __u32 penalty; -}; - -struct tc_cbq_police { - unsigned char police; - unsigned char __res1; - unsigned short __res2; -}; - -struct tc_cbq_fopt { - __u32 split; - __u32 defmap; - __u32 defchange; -}; - -struct tc_cbq_xstats { - __u32 borrows; - __u32 overactions; - __s32 avgidle; - __s32 undertime; -}; - -enum { - TCA_CBQ_UNSPEC, - TCA_CBQ_LSSOPT, - TCA_CBQ_WRROPT, - TCA_CBQ_FOPT, - TCA_CBQ_OVL_STRATEGY, - TCA_CBQ_RATE, - TCA_CBQ_RTAB, - TCA_CBQ_POLICE, - __TCA_CBQ_MAX, -}; - -#define TCA_CBQ_MAX (__TCA_CBQ_MAX - 1) - /* Network emulator */ enum { -- cgit v1.2.3 From 3e64db35bc37edbe9e37aaa987df92cde12ddb6c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 2 Jan 2024 14:23:34 -0800 Subject: Revert "net: mdio: get/put device node during (un)registration" This reverts commit cff9c565e65f3622e8dc1dcc21c1520a083dff35. Revert based on feedback from Russell. Link: https://lore.kernel.org/all/ZZPtUIRerqTI2%2Fyh@shell.armlinux.org.uk/ Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index ac22b8e28a85..6cb9d843aee9 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -434,9 +434,6 @@ struct mii_bus { /** @shared: shared state across different PHYs */ struct phy_package_shared *shared[PHY_MAX_ADDR]; - - /** @__unregister_callback: called at the last step of unregistration */ - void (*__unregister_callback)(struct mii_bus *bus); }; #define to_mii_bus(d) container_of(d, struct mii_bus, dev) -- cgit v1.2.3 From 8dc4c410006531ddad981ca9578f83cb93860819 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 19 Dec 2023 13:02:05 +0200 Subject: xsk: make struct xsk_cb_desc available outside CONFIG_XDP_SOCKETS The ice driver fails to build when CONFIG_XDP_SOCKETS is disabled. drivers/net/ethernet/intel/ice/ice_base.c:533:21: error: variable has incomplete type 'struct xsk_cb_desc' struct xsk_cb_desc desc = {}; ^ include/net/xsk_buff_pool.h:15:8: note: forward declaration of 'struct xsk_cb_desc' struct xsk_cb_desc; ^ Fixes: d68d707dcbbf ("ice: Support XDP hints in AF_XDP ZC mode") Closes: https://lore.kernel.org/netdev/8b76dad3-8847-475b-aa17-613c9c978f7a@infradead.org/ Signed-off-by: Vladimir Oltean Acked-by: Larysa Zaremba Reviewed-by: Maciej Fijalkowski Acked-by: Randy Dunlap Tested-by: Randy Dunlap # build-tested Link: https://lore.kernel.org/r/20231219110205.1289506-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- include/net/xdp_sock_drv.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index b62bb8525a5f..526c1e7f505e 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -12,14 +12,14 @@ #define XDP_UMEM_MIN_CHUNK_SHIFT 11 #define XDP_UMEM_MIN_CHUNK_SIZE (1 << XDP_UMEM_MIN_CHUNK_SHIFT) -#ifdef CONFIG_XDP_SOCKETS - struct xsk_cb_desc { void *src; u8 off; u8 bytes; }; +#ifdef CONFIG_XDP_SOCKETS + void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries); bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc); u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max); -- cgit v1.2.3 From 0dd415d155050f5c1cf360b97f905d42d44f33ed Mon Sep 17 00:00:00 2001 From: Ahmed Zaki Date: Thu, 21 Dec 2023 11:42:35 -0700 Subject: net: ethtool: add a NO_CHANGE uAPI for new RXFH's input_xfrm Add a NO_CHANGE uAPI value for the new RXFH/RSS input_xfrm uAPI field. This needed so that user-space can set other RSS values (hkey or indir table) without affecting input_xfrm. Should have been part of [1]. Link: https://lore.kernel.org/netdev/20231213003321.605376-1-ahmed.zaki@intel.com/ [1] Fixes: 13e59344fb9d ("net: ethtool: add support for symmetric-xor RSS hash") Reviewed-by: Jacob Keller Signed-off-by: Ahmed Zaki Link: https://lore.kernel.org/r/20231221184235.9192-3-ahmed.zaki@intel.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/ethtool.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 60801df9d8c0..01ba529dbb6d 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -2002,6 +2002,7 @@ static inline int ethtool_validate_duplex(__u8 duplex) * be exploited to reduce the RSS queue spread. */ #define RXH_XFRM_SYM_XOR (1 << 0) +#define RXH_XFRM_NO_CHANGE 0xff /* L2-L4 network traffic flow types */ #define TCP_V4_FLOW 0x01 /* hash or spec (tcp_ip4_spec) */ -- cgit v1.2.3 From 292010ee509443a1383bb079a4fa80515aa89a7f Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Wed, 20 Dec 2023 16:19:47 +0100 Subject: kunit: add parameter generation macro using description from array The existing KUNIT_ARRAY_PARAM macro requires a separate function to get the description. However, in a lot of cases the description can just be copied directly from the array. Add a second macro that avoids having to write a static function just for a single strscpy. Signed-off-by: Benjamin Berg Reviewed-by: David Gow Link: https://msgid.link/20231220151952.415232-2-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- include/kunit/test.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include') diff --git a/include/kunit/test.h b/include/kunit/test.h index 20ed9f9275c9..2dfa851e1f88 100644 --- a/include/kunit/test.h +++ b/include/kunit/test.h @@ -1514,6 +1514,25 @@ do { \ return NULL; \ } +/** + * KUNIT_ARRAY_PARAM_DESC() - Define test parameter generator from an array. + * @name: prefix for the test parameter generator function. + * @array: array of test parameters. + * @desc_member: structure member from array element to use as description + * + * Define function @name_gen_params which uses @array to generate parameters. + */ +#define KUNIT_ARRAY_PARAM_DESC(name, array, desc_member) \ + static const void *name##_gen_params(const void *prev, char *desc) \ + { \ + typeof((array)[0]) *__next = prev ? ((typeof(__next)) prev) + 1 : (array); \ + if (__next - (array) < ARRAY_SIZE((array))) { \ + strscpy(desc, __next->desc_member, KUNIT_PARAM_DESC_SIZE); \ + return __next; \ + } \ + return NULL; \ + } + // TODO(dlatypov@google.com): consider eventually migrating users to explicitly // include resource.h themselves if they need it. #include -- cgit v1.2.3 From b3231d353a51ddfb6ef7d17877d029790190f746 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Wed, 20 Dec 2023 16:19:48 +0100 Subject: kunit: add a convenience allocation wrapper for SKBs Add a simple convenience helper to allocate and zero fill an SKB for the use by a kunit test. Also provide a way to free it again in case that may be desirable. This simply mirrors the kunit_kmalloc API. Signed-off-by: Benjamin Berg Reviewed-by: David Gow Link: https://msgid.link/20231220151952.415232-3-benjamin@sipsolutions.net [adjust file description as discussed] Signed-off-by: Johannes Berg --- include/kunit/skbuff.h | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 include/kunit/skbuff.h (limited to 'include') diff --git a/include/kunit/skbuff.h b/include/kunit/skbuff.h new file mode 100644 index 000000000000..44d12370939a --- /dev/null +++ b/include/kunit/skbuff.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * KUnit resource management helpers for SKBs (skbuff). + * + * Copyright (C) 2023 Intel Corporation + */ + +#ifndef _KUNIT_SKBUFF_H +#define _KUNIT_SKBUFF_H + +#include +#include + +static void kunit_action_kfree_skb(void *p) +{ + kfree_skb((struct sk_buff *)p); +} + +/** + * kunit_zalloc_skb() - Allocate and initialize a resource managed skb. + * @test: The test case to which the skb belongs + * @len: size to allocate + * + * Allocate a new struct sk_buff with GFP_KERNEL, zero fill the give length + * and add it as a resource to the kunit test for automatic cleanup. + * + * Returns: newly allocated SKB, or %NULL on error + */ +static inline struct sk_buff *kunit_zalloc_skb(struct kunit *test, int len, + gfp_t gfp) +{ + struct sk_buff *res = alloc_skb(len, GFP_KERNEL); + + if (!res || skb_pad(res, len)) + return NULL; + + if (kunit_add_action_or_reset(test, kunit_action_kfree_skb, res)) + return NULL; + + return res; +} + +/** + * kunit_kfree_skb() - Like kfree_skb except for allocations managed by KUnit. + * @test: The test case to which the resource belongs. + * @skb: The SKB to free. + */ +static inline void kunit_kfree_skb(struct kunit *test, struct sk_buff *skb) +{ + if (!skb) + return; + + kunit_release_action(test, kunit_action_kfree_skb, (void *)skb); +} + +#endif /* _KUNIT_SKBUFF_H */ -- cgit v1.2.3 From b4c1d4d9734cda4394da5b59ebf7d9ca3579561a Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Tue, 2 Jan 2024 15:15:19 +0800 Subject: fib: remove unnecessary input parameters in fib_default_rule_add When fib_default_rule_add is invoked, the value of the input parameter 'flags' is always 0. Rules uses kzalloc to allocate memory, so 'flags' has been initialized to 0. Therefore, remove the input parameter 'flags' in fib_default_rule_add. Signed-off-by: Zhengchao Shao Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20240102071519.3781384-1-shaozhengchao@huawei.com Signed-off-by: Jakub Kicinski --- include/net/fib_rules.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index 82da359bca03..d17855c52ef9 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -172,8 +172,7 @@ void fib_rules_unregister(struct fib_rules_ops *); int fib_rules_lookup(struct fib_rules_ops *, struct flowi *, int flags, struct fib_lookup_arg *); -int fib_default_rule_add(struct fib_rules_ops *, u32 pref, u32 table, - u32 flags); +int fib_default_rule_add(struct fib_rules_ops *, u32 pref, u32 table); bool fib_rule_matchall(const struct fib_rule *rule); int fib_rules_dump(struct net *net, struct notifier_block *nb, int family, struct netlink_ext_ack *extack); -- cgit v1.2.3 From 7865dfb1eb941ddd25802a9e13b6ff5f3f4dc02f Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Thu, 21 Dec 2023 15:23:24 -0800 Subject: bpf: sockmap, added comments describing update proto rules Add a comment describing that the psock update proto callbback can be called multiple times and this must be safe. Signed-off-by: John Fastabend Signed-off-by: Martin KaFai Lau Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/r/20231221232327.43678-3-john.fastabend@gmail.com --- include/linux/skmsg.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index c953b8c0d2f4..888a4b217829 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -100,6 +100,11 @@ struct sk_psock { void (*saved_close)(struct sock *sk, long timeout); void (*saved_write_space)(struct sock *sk); void (*saved_data_ready)(struct sock *sk); + /* psock_update_sk_prot may be called with restore=false many times + * so the handler must be safe for this case. It will be called + * exactly once with restore=true when the psock is being destroyed + * and psock refcnt is zero, but before an RCU grace period. + */ int (*psock_update_sk_prot)(struct sock *sk, struct sk_psock *psock, bool restore); struct proto *sk_proto; -- cgit v1.2.3 From d3d344a1ca69d8fb2413e29e6400f3ad58a05c06 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 2 Jan 2024 16:22:20 +0000 Subject: net-device: move xdp_prog to net_device_read_rx xdp_prog is used in receive path, both from XDP enabled drivers and from netif_elide_gro(). This patch also removes two 4-bytes holes. Fixes: 43a71cd66b9c ("net-device: reorganize net_device fast path variables") Signed-off-by: Eric Dumazet Cc: Coco Li Cc: Simon Horman Link: https://lore.kernel.org/r/20240102162220.750823-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d59db9adcc96..e265aa1f2169 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2150,6 +2150,7 @@ struct net_device { /* RX read-mostly hotpath */ __cacheline_group_begin(net_device_read_rx); + struct bpf_prog __rcu *xdp_prog; struct list_head ptype_specific; int ifindex; unsigned int real_num_rx_queues; @@ -2325,7 +2326,6 @@ struct net_device { const unsigned char *dev_addr; unsigned int num_rx_queues; - struct bpf_prog __rcu *xdp_prog; #define GRO_LEGACY_MAX_SIZE 65536u /* TCP minimal MSS is 8 (TCP_MIN_GSO_SIZE), * and shinfo->gso_segs is a 16bit field. -- cgit v1.2.3 From c2a67de9bb543394aee869d1c68b5fbcd8a89dcb Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Fri, 29 Dec 2023 10:26:41 -0300 Subject: net/sched: introduce ACT_P_BOUND return code Bound actions always return '0' and as of today we rely on '0' being returned in order to properly skip bound actions in tcf_idr_insert_many. In order to further improve maintainability, introduce the ACT_P_BOUND return code. Actions are updated to return 'ACT_P_BOUND' instead of plain '0'. tcf_idr_insert_many is then updated to check for 'ACT_P_BOUND'. Signed-off-by: Pedro Tammela Acked-by: Jamal Hadi Salim Link: https://lore.kernel.org/r/20231229132642.1489088-1-pctammela@mojatatu.com Signed-off-by: Jakub Kicinski --- include/net/act_api.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/act_api.h b/include/net/act_api.h index ea13e1e4a7c2..447985a45ef6 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -137,6 +137,7 @@ struct tc_action_ops { #ifdef CONFIG_NET_CLS_ACT +#define ACT_P_BOUND 0 #define ACT_P_CREATED 1 #define ACT_P_DELETED 1 -- cgit v1.2.3 From 9fc8e802048ad150e8032c4f3dbf40112160cfe9 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 21 Dec 2023 19:17:39 -0800 Subject: bpf: Add objcg to bpf_mem_alloc The objcg is a bpf_mem_alloc level property since all bpf_mem_cache's are with the same objcg. This patch made such a property explicit. The next patch will use this property to save and restore objcg for percpu unit allocator. Acked-by: Hou Tao Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20231222031739.1288590-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_mem_alloc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h index bb1223b21308..acef8c808599 100644 --- a/include/linux/bpf_mem_alloc.h +++ b/include/linux/bpf_mem_alloc.h @@ -11,6 +11,7 @@ struct bpf_mem_caches; struct bpf_mem_alloc { struct bpf_mem_caches __percpu *caches; struct bpf_mem_cache __percpu *cache; + struct obj_cgroup *objcg; bool percpu; struct work_struct work; }; -- cgit v1.2.3 From c39aa3b289e9c10d0d246cd919b06809f13b72b8 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 21 Dec 2023 19:17:45 -0800 Subject: bpf: Allow per unit prefill for non-fix-size percpu memory allocator Commit 41a5db8d8161 ("Add support for non-fix-size percpu mem allocation") added support for non-fix-size percpu memory allocation. Such allocation will allocate percpu memory for all buckets on all cpus and the memory consumption is in the order to quadratic. For example, let us say, 4 cpus, unit size 16 bytes, so each cpu has 16 * 4 = 64 bytes, with 4 cpus, total will be 64 * 4 = 256 bytes. Then let us say, 8 cpus with the same unit size, each cpu has 16 * 8 = 128 bytes, with 8 cpus, total will be 128 * 8 = 1024 bytes. So if the number of cpus doubles, the number of memory consumption will be 4 times. So for a system with large number of cpus, the memory consumption goes up quickly with quadratic order. For example, for 4KB percpu allocation, 128 cpus. The total memory consumption will 4KB * 128 * 128 = 64MB. Things will become worse if the number of cpus is bigger (e.g., 512, 1024, etc.) In Commit 41a5db8d8161, the non-fix-size percpu memory allocation is done in boot time, so for system with large number of cpus, the initial percpu memory consumption is very visible. For example, for 128 cpu system, the total percpu memory allocation will be at least (16 + 32 + 64 + 96 + 128 + 196 + 256 + 512 + 1024 + 2048 + 4096) * 128 * 128 = ~138MB. which is pretty big. It will be even bigger for larger number of cpus. Note that the current prefill also allocates 4 entries if the unit size is less than 256. So on top of 138MB memory consumption, this will add more consumption with 3 * (16 + 32 + 64 + 96 + 128 + 196 + 256) * 128 * 128 = ~38MB. Next patch will try to reduce this memory consumption. Later on, Commit 1fda5bb66ad8 ("bpf: Do not allocate percpu memory at init stage") moved the non-fix-size percpu memory allocation to bpf verificaiton stage. Once a particular bpf_percpu_obj_new() is called by bpf program, the memory allocator will try to fill in the cache with all sizes, causing the same amount of percpu memory consumption as in the boot stage. To reduce the initial percpu memory consumption for non-fix-size percpu memory allocation, instead of filling the cache with all supported allocation sizes, this patch intends to fill the cache only for the requested size. As typically users will not use large percpu data structure, this can save memory significantly. For example, the allocation size is 64 bytes with 128 cpus. Then total percpu memory amount will be 64 * 128 * 128 = 1MB, much less than previous 138MB. Signed-off-by: Yonghong Song Acked-by: Hou Tao Link: https://lore.kernel.org/r/20231222031745.1289082-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_mem_alloc.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h index acef8c808599..aaf004d94322 100644 --- a/include/linux/bpf_mem_alloc.h +++ b/include/linux/bpf_mem_alloc.h @@ -22,8 +22,15 @@ struct bpf_mem_alloc { * 'size = 0' is for bpf_mem_alloc which manages many fixed-size objects. * Alloc and free are done with bpf_mem_{alloc,free}() and the size of * the returned object is given by the size argument of bpf_mem_alloc(). + * If percpu equals true, error will be returned in order to avoid + * large memory consumption and the below bpf_mem_alloc_percpu_unit_init() + * should be used to do on-demand per-cpu allocation for each size. */ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu); +/* Initialize a non-fix-size percpu memory allocator */ +int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg); +/* The percpu allocation with a specific unit size. */ +int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size); void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma); /* kmalloc/kfree equivalent: */ -- cgit v1.2.3 From 5e5401d6612ef599ad45785b941eebda7effc90f Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 4 Jan 2024 09:47:36 +0000 Subject: net: phylink: move phylink_pcs_neg_mode() into phylink.c Move phylink_pcs_neg_mode() from the header file into the .c file since nothing should be using it. Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/phylink.h | 66 ------------------------------------------------- 1 file changed, 66 deletions(-) (limited to 'include') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 875439ab45de..d589f89c612c 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -98,72 +98,6 @@ static inline bool phylink_autoneg_inband(unsigned int mode) return mode == MLO_AN_INBAND; } -/** - * phylink_pcs_neg_mode() - helper to determine PCS inband mode - * @mode: one of %MLO_AN_FIXED, %MLO_AN_PHY, %MLO_AN_INBAND. - * @interface: interface mode to be used - * @advertising: adertisement ethtool link mode mask - * - * Determines the negotiation mode to be used by the PCS, and returns - * one of: - * - * - %PHYLINK_PCS_NEG_NONE: interface mode does not support inband - * - %PHYLINK_PCS_NEG_OUTBAND: an out of band mode (e.g. reading the PHY) - * will be used. - * - %PHYLINK_PCS_NEG_INBAND_DISABLED: inband mode selected but autoneg - * disabled - * - %PHYLINK_PCS_NEG_INBAND_ENABLED: inband mode selected and autoneg enabled - * - * Note: this is for cases where the PCS itself is involved in negotiation - * (e.g. Clause 37, SGMII and similar) not Clause 73. - */ -static inline unsigned int phylink_pcs_neg_mode(unsigned int mode, - phy_interface_t interface, - const unsigned long *advertising) -{ - unsigned int neg_mode; - - switch (interface) { - case PHY_INTERFACE_MODE_SGMII: - case PHY_INTERFACE_MODE_QSGMII: - case PHY_INTERFACE_MODE_QUSGMII: - case PHY_INTERFACE_MODE_USXGMII: - /* These protocols are designed for use with a PHY which - * communicates its negotiation result back to the MAC via - * inband communication. Note: there exist PHYs that run - * with SGMII but do not send the inband data. - */ - if (!phylink_autoneg_inband(mode)) - neg_mode = PHYLINK_PCS_NEG_OUTBAND; - else - neg_mode = PHYLINK_PCS_NEG_INBAND_ENABLED; - break; - - case PHY_INTERFACE_MODE_1000BASEX: - case PHY_INTERFACE_MODE_2500BASEX: - /* 1000base-X is designed for use media-side for Fibre - * connections, and thus the Autoneg bit needs to be - * taken into account. We also do this for 2500base-X - * as well, but drivers may not support this, so may - * need to override this. - */ - if (!phylink_autoneg_inband(mode)) - neg_mode = PHYLINK_PCS_NEG_OUTBAND; - else if (linkmode_test_bit(ETHTOOL_LINK_MODE_Autoneg_BIT, - advertising)) - neg_mode = PHYLINK_PCS_NEG_INBAND_ENABLED; - else - neg_mode = PHYLINK_PCS_NEG_INBAND_DISABLED; - break; - - default: - neg_mode = PHYLINK_PCS_NEG_NONE; - break; - } - - return neg_mode; -} - /** * struct phylink_link_state - link state structure * @advertising: ethtool bitmask containing advertised link modes -- cgit v1.2.3 From 98e20e5e13d2811898921f999288be7151a11954 Mon Sep 17 00:00:00 2001 From: Quentin Deslandes Date: Tue, 26 Dec 2023 14:07:42 +0100 Subject: bpfilter: remove bpfilter bpfilter was supposed to convert iptables filtering rules into BPF programs on the fly, from the kernel, through a usermode helper. The base code for the UMH was introduced in 2018, and couple of attempts (2, 3) tried to introduce the BPF program generate features but were abandoned. bpfilter now sits in a kernel tree unused and unusable, occasionally causing confusion amongst Linux users (4, 5). As bpfilter is now developed in a dedicated repository on GitHub (6), it was suggested a couple of times this year (LSFMM/BPF 2023, LPC 2023) to remove the deprecated kernel part of the project. This is the purpose of this patch. [1]: https://lore.kernel.org/lkml/20180522022230.2492505-1-ast@kernel.org/ [2]: https://lore.kernel.org/bpf/20210829183608.2297877-1-me@ubique.spb.ru/#t [3]: https://lore.kernel.org/lkml/20221224000402.476079-1-qde@naccy.de/ [4]: https://dxuuu.xyz/bpfilter.html [5]: https://github.com/linuxkit/linuxkit/pull/3904 [6]: https://github.com/facebook/bpfilter Signed-off-by: Quentin Deslandes Link: https://lore.kernel.org/r/20231226130745.465988-1-qde@naccy.de Signed-off-by: Alexei Starovoitov --- include/linux/bpfilter.h | 24 ------------------------ include/uapi/linux/bpfilter.h | 21 --------------------- 2 files changed, 45 deletions(-) delete mode 100644 include/linux/bpfilter.h delete mode 100644 include/uapi/linux/bpfilter.h (limited to 'include') diff --git a/include/linux/bpfilter.h b/include/linux/bpfilter.h deleted file mode 100644 index 736ded4905e0..000000000000 --- a/include/linux/bpfilter.h +++ /dev/null @@ -1,24 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_BPFILTER_H -#define _LINUX_BPFILTER_H - -#include -#include -#include - -struct sock; -int bpfilter_ip_set_sockopt(struct sock *sk, int optname, sockptr_t optval, - unsigned int optlen); -int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval, - int __user *optlen); - -struct bpfilter_umh_ops { - struct umd_info info; - /* since ip_getsockopt() can run in parallel, serialize access to umh */ - struct mutex lock; - int (*sockopt)(struct sock *sk, int optname, sockptr_t optval, - unsigned int optlen, bool is_set); - int (*start)(void); -}; -extern struct bpfilter_umh_ops bpfilter_ops; -#endif diff --git a/include/uapi/linux/bpfilter.h b/include/uapi/linux/bpfilter.h deleted file mode 100644 index cbc1f5813f50..000000000000 --- a/include/uapi/linux/bpfilter.h +++ /dev/null @@ -1,21 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _UAPI_LINUX_BPFILTER_H -#define _UAPI_LINUX_BPFILTER_H - -#include - -enum { - BPFILTER_IPT_SO_SET_REPLACE = 64, - BPFILTER_IPT_SO_SET_ADD_COUNTERS = 65, - BPFILTER_IPT_SET_MAX, -}; - -enum { - BPFILTER_IPT_SO_GET_INFO = 64, - BPFILTER_IPT_SO_GET_ENTRIES = 65, - BPFILTER_IPT_SO_GET_REVISION_MATCH = 66, - BPFILTER_IPT_SO_GET_REVISION_TARGET = 67, - BPFILTER_IPT_GET_MAX, -}; - -#endif /* _UAPI_LINUX_BPFILTER_H */ -- cgit v1.2.3 From fe1eb24bd5ade085914248c527044e942f75e06a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 4 Jan 2024 16:04:35 -0800 Subject: Revert "Introduce PHY listing and link_topology tracking" This reverts commit 32bb4515e34469975abc936deb0a116c4a445817. This reverts commit d078d480639a4f3b5fc2d56247afa38e0956483a. This reverts commit fcc4b105caa4b844bf043375bf799c20a9c99db1. This reverts commit 345237dbc1bdbb274c9fb9ec38976261ff4a40b8. This reverts commit 7db69ec9cfb8b4ab50420262631fb2d1908b25bf. This reverts commit 95132a018f00f5dad38bdcfd4180d1af955d46f6. This reverts commit 63d5eaf35ac36cad00cfb3809d794ef0078c822b. This reverts commit c29451aefcb42359905d18678de38e52eccb3bb5. This reverts commit 2ab0edb505faa9ac90dee1732571390f074e8113. This reverts commit dedd702a35793ab462fce4c737eeba0badf9718e. This reverts commit 034fcc210349b873ece7356905be5c6ca11eef2a. This reverts commit 9c5625f559ad6fe9f6f733c11475bf470e637d34. This reverts commit 02018c544ef113e980a2349eba89003d6f399d22. Looks like we need more time for reviews, and incremental changes will be hard to make sense of. So revert. Link: https://lore.kernel.org/all/ZZP6FV5sXEf+xd58@shell.armlinux.org.uk/ Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 4 +- include/linux/phy.h | 6 --- include/linux/phy_link_topology.h | 67 ---------------------------------- include/linux/phy_link_topology_core.h | 19 ---------- include/linux/sfp.h | 8 +--- include/uapi/linux/ethtool.h | 16 -------- include/uapi/linux/ethtool_netlink.h | 30 --------------- 7 files changed, 2 insertions(+), 148 deletions(-) delete mode 100644 include/linux/phy_link_topology.h delete mode 100644 include/linux/phy_link_topology_core.h (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e265aa1f2169..118c40258d07 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -40,6 +40,7 @@ #include #endif #include + #include #include #include @@ -51,7 +52,6 @@ #include #include #include -#include struct netpoll_info; struct device; @@ -2047,7 +2047,6 @@ enum netdev_stat_type { * @fcoe_ddp_xid: Max exchange id for FCoE LRO by ddp * * @priomap: XXX: need comments on this one - * @link_topo: Physical link topology tracking attached PHYs * @phydev: Physical device may attach itself * for hardware timestamping * @sfp_bus: attached &struct sfp_bus structure. @@ -2442,7 +2441,6 @@ struct net_device { #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) struct netprio_map __rcu *priomap; #endif - struct phy_link_topology link_topo; struct phy_device *phydev; struct sfp_bus *sfp_bus; struct lock_class_key *qdisc_tx_busylock; diff --git a/include/linux/phy.h b/include/linux/phy.h index 6cb9d843aee9..e9e85d347587 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -544,9 +544,6 @@ struct macsec_ops; * @drv: Pointer to the driver for this PHY instance * @devlink: Create a link between phy dev and mac dev, if the external phy * used by current mac interface is managed by another mac interface. - * @phyindex: Unique id across the phy's parent tree of phys to address the PHY - * from userspace, similar to ifindex. A zero index means the PHY - * wasn't assigned an id yet. * @phy_id: UID for this device found during discovery * @c45_ids: 802.3-c45 Device Identifiers if is_c45. * @is_c45: Set to true if this PHY uses clause 45 addressing. @@ -646,7 +643,6 @@ struct phy_device { struct device_link *devlink; - u32 phyindex; u32 phy_id; struct phy_c45_device_ids c45_ids; @@ -1726,8 +1722,6 @@ int phy_suspend(struct phy_device *phydev); int phy_resume(struct phy_device *phydev); int __phy_resume(struct phy_device *phydev); int phy_loopback(struct phy_device *phydev, bool enable); -int phy_sfp_connect_phy(void *upstream, struct phy_device *phy); -void phy_sfp_disconnect_phy(void *upstream, struct phy_device *phy); void phy_sfp_attach(void *upstream, struct sfp_bus *bus); void phy_sfp_detach(void *upstream, struct sfp_bus *bus); int phy_sfp_probe(struct phy_device *phydev, diff --git a/include/linux/phy_link_topology.h b/include/linux/phy_link_topology.h deleted file mode 100644 index 91902263ec0e..000000000000 --- a/include/linux/phy_link_topology.h +++ /dev/null @@ -1,67 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * PHY device list allow maintaining a list of PHY devices that are - * part of a netdevice's link topology. PHYs can for example be chained, - * as is the case when using a PHY that exposes an SFP module, on which an - * SFP transceiver that embeds a PHY is connected. - * - * This list can then be used by userspace to leverage individual PHY - * capabilities. - */ -#ifndef __PHY_LINK_TOPOLOGY_H -#define __PHY_LINK_TOPOLOGY_H - -#include -#include - -struct xarray; -struct phy_device; -struct net_device; -struct sfp_bus; - -struct phy_device_node { - enum phy_upstream upstream_type; - - union { - struct net_device *netdev; - struct phy_device *phydev; - } upstream; - - struct sfp_bus *parent_sfp_bus; - - struct phy_device *phy; -}; - -static inline struct phy_device * -phy_link_topo_get_phy(struct phy_link_topology *topo, u32 phyindex) -{ - struct phy_device_node *pdn = xa_load(&topo->phys, phyindex); - - if (pdn) - return pdn->phy; - - return NULL; -} - -#if IS_ENABLED(CONFIG_PHYLIB) -int phy_link_topo_add_phy(struct phy_link_topology *topo, - struct phy_device *phy, - enum phy_upstream upt, void *upstream); - -void phy_link_topo_del_phy(struct phy_link_topology *lt, struct phy_device *phy); - -#else -static inline int phy_link_topo_add_phy(struct phy_link_topology *topo, - struct phy_device *phy, - enum phy_upstream upt, void *upstream) -{ - return 0; -} - -static inline void phy_link_topo_del_phy(struct phy_link_topology *topo, - struct phy_device *phy) -{ -} -#endif - -#endif /* __PHY_LINK_TOPOLOGY_H */ diff --git a/include/linux/phy_link_topology_core.h b/include/linux/phy_link_topology_core.h deleted file mode 100644 index 78c75f909489..000000000000 --- a/include/linux/phy_link_topology_core.h +++ /dev/null @@ -1,19 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __PHY_LINK_TOPOLOGY_CORE_H -#define __PHY_LINK_TOPOLOGY_CORE_H - -struct xarray; - -struct phy_link_topology { - struct xarray phys; - - u32 next_phy_index; -}; - -static inline void phy_link_topo_init(struct phy_link_topology *topo) -{ - xa_init_flags(&topo->phys, XA_FLAGS_ALLOC1); - topo->next_phy_index = 1; -} - -#endif /* __PHY_LINK_TOPOLOGY_CORE_H */ diff --git a/include/linux/sfp.h b/include/linux/sfp.h index 55c0ab17c9e2..9346cd44814d 100644 --- a/include/linux/sfp.h +++ b/include/linux/sfp.h @@ -544,7 +544,7 @@ struct sfp_upstream_ops { void (*link_down)(void *priv); void (*link_up)(void *priv); int (*connect_phy)(void *priv, struct phy_device *); - void (*disconnect_phy)(void *priv, struct phy_device *); + void (*disconnect_phy)(void *priv); }; #if IS_ENABLED(CONFIG_SFP) @@ -570,7 +570,6 @@ struct sfp_bus *sfp_bus_find_fwnode(const struct fwnode_handle *fwnode); int sfp_bus_add_upstream(struct sfp_bus *bus, void *upstream, const struct sfp_upstream_ops *ops); void sfp_bus_del_upstream(struct sfp_bus *bus); -const char *sfp_get_name(struct sfp_bus *bus); #else static inline int sfp_parse_port(struct sfp_bus *bus, const struct sfp_eeprom_id *id, @@ -649,11 +648,6 @@ static inline int sfp_bus_add_upstream(struct sfp_bus *bus, void *upstream, static inline void sfp_bus_del_upstream(struct sfp_bus *bus) { } - -static inline const char *sfp_get_name(struct sfp_bus *bus) -{ - return NULL; -} #endif #endif diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 01ba529dbb6d..06ef6b78b7de 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -2220,20 +2220,4 @@ struct ethtool_link_settings { * __u32 map_lp_advertising[link_mode_masks_nwords]; */ }; - -/** - * enum phy_upstream - Represents the upstream component a given PHY device - * is connected to, as in what is on the other end of the MII bus. Most PHYs - * will be attached to an Ethernet MAC controller, but in some cases, there's - * an intermediate PHY used as a media-converter, which will driver another - * MII interface as its output. - * @PHY_UPSTREAM_MAC: Upstream component is a MAC (a switch port, - * or ethernet controller) - * @PHY_UPSTREAM_PHY: Upstream component is a PHY (likely a media converter) - */ -enum phy_upstream { - PHY_UPSTREAM_MAC, - PHY_UPSTREAM_PHY, -}; - #endif /* _UAPI_LINUX_ETHTOOL_H */ diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 00cd7ad16709..3f89074aa06c 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -57,7 +57,6 @@ enum { ETHTOOL_MSG_PLCA_GET_STATUS, ETHTOOL_MSG_MM_GET, ETHTOOL_MSG_MM_SET, - ETHTOOL_MSG_PHY_GET, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, @@ -110,8 +109,6 @@ enum { ETHTOOL_MSG_PLCA_NTF, ETHTOOL_MSG_MM_GET_REPLY, ETHTOOL_MSG_MM_NTF, - ETHTOOL_MSG_PHY_GET_REPLY, - ETHTOOL_MSG_PHY_NTF, /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, @@ -136,7 +133,6 @@ enum { ETHTOOL_A_HEADER_DEV_INDEX, /* u32 */ ETHTOOL_A_HEADER_DEV_NAME, /* string */ ETHTOOL_A_HEADER_FLAGS, /* u32 - ETHTOOL_FLAG_* */ - ETHTOOL_A_HEADER_PHY_INDEX, /* u32 */ /* add new constants above here */ __ETHTOOL_A_HEADER_CNT, @@ -980,32 +976,6 @@ enum { ETHTOOL_A_MM_MAX = (__ETHTOOL_A_MM_CNT - 1) }; -enum { - ETHTOOL_A_PHY_UPSTREAM_UNSPEC, - ETHTOOL_A_PHY_UPSTREAM_INDEX, /* u32 */ - ETHTOOL_A_PHY_UPSTREAM_SFP_NAME, /* string */ - - /* add new constants above here */ - __ETHTOOL_A_PHY_UPSTREAM_CNT, - ETHTOOL_A_PHY_UPSTREAM_MAX = (__ETHTOOL_A_PHY_UPSTREAM_CNT - 1) -}; - -enum { - ETHTOOL_A_PHY_UNSPEC, - ETHTOOL_A_PHY_HEADER, /* nest - _A_HEADER_* */ - ETHTOOL_A_PHY_INDEX, /* u32 */ - ETHTOOL_A_PHY_DRVNAME, /* string */ - ETHTOOL_A_PHY_NAME, /* string */ - ETHTOOL_A_PHY_UPSTREAM_TYPE, /* u8 */ - ETHTOOL_A_PHY_UPSTREAM, /* nest - _A_PHY_UPSTREAM_* */ - ETHTOOL_A_PHY_DOWNSTREAM_SFP_NAME, /* string */ - ETHTOOL_A_PHY_ID, /* u32 */ - - /* add new constants above here */ - __ETHTOOL_A_PHY_CNT, - ETHTOOL_A_PHY_MAX = (__ETHTOOL_A_PHY_CNT - 1) -}; - /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 -- cgit v1.2.3 From 19bfcdf9498aa968ea293417fbbc39e523527ca8 Mon Sep 17 00:00:00 2001 From: Dmitrii Dolgov <9erthalion6@gmail.com> Date: Wed, 3 Jan 2024 20:05:44 +0100 Subject: bpf: Relax tracing prog recursive attach rules Currently, it's not allowed to attach an fentry/fexit prog to another one fentry/fexit. At the same time it's not uncommon to see a tracing program with lots of logic in use, and the attachment limitation prevents usage of fentry/fexit for performance analysis (e.g. with "bpftool prog profile" command) in this case. An example could be falcosecurity libs project that uses tp_btf tracing programs. Following the corresponding discussion [1], the reason for that is to avoid tracing progs call cycles without introducing more complex solutions. But currently it seems impossible to load and attach tracing programs in a way that will form such a cycle. The limitation is coming from the fact that attach_prog_fd is specified at the prog load (thus making it impossible to attach to a program loaded after it in this way), as well as tracing progs not implementing link_detach. Replace "no same type" requirement with verification that no more than one level of attachment nesting is allowed. In this way only one fentry/fexit program could be attached to another fentry/fexit to cover profiling use case, and still no cycle could be formed. To implement, add a new field into bpf_prog_aux to track nested attachment for tracing programs. [1]: https://lore.kernel.org/bpf/20191108064039.2041889-16-ast@kernel.org/ Acked-by: Jiri Olsa Acked-by: Song Liu Signed-off-by: Dmitrii Dolgov <9erthalion6@gmail.com> Link: https://lore.kernel.org/r/20240103190559.14750-2-9erthalion6@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7671530d6e4e..e30100597d0a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1449,6 +1449,7 @@ struct bpf_prog_aux { bool dev_bound; /* Program is bound to the netdev. */ bool offload_requested; /* Program is bound and offloaded to the netdev. */ bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */ + bool attach_tracing_prog; /* true if tracing another tracing program */ bool func_proto_unreliable; bool sleepable; bool tail_call_reachable; -- cgit v1.2.3 From 8a6286c1804e2c7144aef3154a0357c4b496e10b Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 3 Jan 2024 14:28:36 +0100 Subject: dpll: expose fractional frequency offset value to user Add a new netlink attribute to expose fractional frequency offset value for a pin. Add an op to get the value from the driver. Signed-off-by: Jiri Pirko Acked-by: Vadim Fedorenko Acked-by: Arkadiusz Kubalewski Link: https://lore.kernel.org/r/20240103132838.1501801-2-jiri@resnulli.us Signed-off-by: Jakub Kicinski --- include/linux/dpll.h | 3 +++ include/uapi/linux/dpll.h | 1 + 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/dpll.h b/include/linux/dpll.h index b1a5f9ca8ee5..9cf896ea1d41 100644 --- a/include/linux/dpll.h +++ b/include/linux/dpll.h @@ -77,6 +77,9 @@ struct dpll_pin_ops { const struct dpll_device *dpll, void *dpll_priv, const s32 phase_adjust, struct netlink_ext_ack *extack); + int (*ffo_get)(const struct dpll_pin *pin, void *pin_priv, + const struct dpll_device *dpll, void *dpll_priv, + s64 *ffo, struct netlink_ext_ack *extack); }; struct dpll_pin_frequency { diff --git a/include/uapi/linux/dpll.h b/include/uapi/linux/dpll.h index 715a491d2727..b4e947f9bfbc 100644 --- a/include/uapi/linux/dpll.h +++ b/include/uapi/linux/dpll.h @@ -179,6 +179,7 @@ enum dpll_a_pin { DPLL_A_PIN_PHASE_ADJUST_MAX, DPLL_A_PIN_PHASE_ADJUST, DPLL_A_PIN_PHASE_OFFSET, + DPLL_A_PIN_FRACTIONAL_FREQUENCY_OFFSET, __DPLL_A_PIN_MAX, DPLL_A_PIN_MAX = (__DPLL_A_PIN_MAX - 1) -- cgit v1.2.3 From 405cd9fc6f44f7a54505019bea60de83f1c58365 Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Thu, 4 Jan 2024 21:38:10 -0300 Subject: net/sched: simplify tc_action_load_ops parameters Instead of using two bools derived from a flags passed as arguments to the parent function of tc_action_load_ops, just pass the flags itself to tc_action_load_ops to simplify its parameters. Reviewed-by: Jiri Pirko Signed-off-by: Pedro Tammela Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/act_api.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/act_api.h b/include/net/act_api.h index 447985a45ef6..e1e5e72b901e 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -208,8 +208,7 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, struct tc_action *actions[], int init_res[], size_t *attr_size, u32 flags, u32 fl_flags, struct netlink_ext_ack *extack); -struct tc_action_ops *tc_action_load_ops(struct nlattr *nla, bool police, - bool rtnl_held, +struct tc_action_ops *tc_action_load_ops(struct nlattr *nla, u32 flags, struct netlink_ext_ack *extack); struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, -- cgit v1.2.3 From 477bd4beb93bf9ace9bda71f1437b191befa9cf4 Mon Sep 17 00:00:00 2001 From: Swee Leong Ching Date: Fri, 5 Jan 2024 15:09:23 +0800 Subject: net: stmmac: Make MSI interrupt routine generic There is no support for per DMA channel interrupt for non-MSI platform, where the MAC's per channel interrupt hooks up to interrupt controller(GIC) through shared peripheral interrupt(SPI) to handle interrupt from TX/RX transmit channel. This patch generalize the existing MSI ISR to also support non-MSI platform. Signed-off-by: Teoh Ji Sheng Signed-off-by: Swee Leong Ching Signed-off-by: David S. Miller --- include/linux/stmmac.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index dee5ad6e48c5..b950e6f9761d 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -98,7 +98,7 @@ struct stmmac_dma_cfg { int mixed_burst; bool aal; bool eame; - bool multi_msi_en; + bool multi_irq_en; bool dche; }; @@ -215,7 +215,7 @@ struct dwmac4_addrs { #define STMMAC_FLAG_TSO_EN BIT(4) #define STMMAC_FLAG_SERDES_UP_AFTER_PHY_LINKUP BIT(5) #define STMMAC_FLAG_VLAN_FAIL_Q_EN BIT(6) -#define STMMAC_FLAG_MULTI_MSI_EN BIT(7) +#define STMMAC_FLAG_MULTI_IRQ_EN BIT(7) #define STMMAC_FLAG_EXT_SNAPSHOT_EN BIT(8) #define STMMAC_FLAG_INT_SNAPSHOT_EN BIT(9) #define STMMAC_FLAG_RX_CLK_RUNS_IN_LPI BIT(10) -- cgit v1.2.3 From e9ee910218ffd420454b52a052d6f1087354905b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 7 Jan 2024 17:11:38 -0800 Subject: Revert "net: stmmac: Enable Per DMA Channel interrupt" Revert "net: stmmac: Use interrupt mode INTM=1 for per channel irq" This reverts commit 36af9f25ddfd311da82628f194c794786467cb12. Revert "net: stmmac: Add support for TX/RX channel interrupt" This reverts commit 9072e03d32088137a435ddf3aa95fd6e038d69d8. Revert "net: stmmac: Make MSI interrupt routine generic" This reverts commit 477bd4beb93bf9ace9bda71f1437b191befa9cf4. Revert "dt-bindings: net: snps,dwmac: per channel irq" This reverts commit 67d47c8ada0f8795bfcdb85cc8f2ad3ce556674b. Device tree bindings need to be reviewed. Link: https://lore.kernel.org/all/2df9fe3e-7971-4aa2-89a9-0e085b3b00d7@linaro.org/ Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index b950e6f9761d..dee5ad6e48c5 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -98,7 +98,7 @@ struct stmmac_dma_cfg { int mixed_burst; bool aal; bool eame; - bool multi_irq_en; + bool multi_msi_en; bool dche; }; @@ -215,7 +215,7 @@ struct dwmac4_addrs { #define STMMAC_FLAG_TSO_EN BIT(4) #define STMMAC_FLAG_SERDES_UP_AFTER_PHY_LINKUP BIT(5) #define STMMAC_FLAG_VLAN_FAIL_Q_EN BIT(6) -#define STMMAC_FLAG_MULTI_IRQ_EN BIT(7) +#define STMMAC_FLAG_MULTI_MSI_EN BIT(7) #define STMMAC_FLAG_EXT_SNAPSHOT_EN BIT(8) #define STMMAC_FLAG_INT_SNAPSHOT_EN BIT(9) #define STMMAC_FLAG_RX_CLK_RUNS_IN_LPI BIT(10) -- cgit v1.2.3 From 3fbf61207c66ff7ac9b60ab76d4bfd239f97e973 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sun, 7 Jan 2024 17:14:51 -0800 Subject: Revert "mlx5 updates 2023-12-20" Revert "net/mlx5: Implement management PF Ethernet profile" This reverts commit 22c4640698a1d47606b5a4264a584e8046641784. Revert "net/mlx5: Enable SD feature" This reverts commit c88c49ac9c18fb7c3fa431126de1d8f8f555e912. Revert "net/mlx5e: Block TLS device offload on combined SD netdev" This reverts commit 83a59ce0057b7753d7fbece194b89622c663b2a6. Revert "net/mlx5e: Support per-mdev queue counter" This reverts commit d72baceb92539a178d2610b0e9ceb75706a75b55. Revert "net/mlx5e: Support cross-vhca RSS" This reverts commit c73a3ab8fa6e93a783bd563938d7cf00d62d5d34. Revert "net/mlx5e: Let channels be SD-aware" This reverts commit e4f9686bdee7b4dd89e0ed63cd03606e4bda4ced. Revert "net/mlx5e: Create EN core HW resources for all secondary devices" This reverts commit c4fb94aa822d6c9d05fc3c5aee35c7e339061dc1. Revert "net/mlx5e: Create single netdev per SD group" This reverts commit e2578b4f983cfcd47837bbe3bcdbf5920e50b2ad. Revert "net/mlx5: SD, Add informative prints in kernel log" This reverts commit c82d360325112ccc512fc11a3b68cdcdf04a1478. Revert "net/mlx5: SD, Implement steering for primary and secondaries" This reverts commit 605fcce33b2d1beb0139b6e5913fa0b2062116b2. Revert "net/mlx5: SD, Implement devcom communication and primary election" This reverts commit a45af9a96740873db9a4b5bb493ce2ad81ccb4d5. Revert "net/mlx5: SD, Implement basic query and instantiation" This reverts commit 63b9ce944c0e26c44c42cdd5095c2e9851c1a8ff. Revert "net/mlx5: SD, Introduce SD lib" This reverts commit 4a04a31f49320d078b8078e1da4b0e2faca5dfa3. Revert "net/mlx5: Fix query of sd_group field" This reverts commit e04984a37398b3f4f5a79c993b94c6b1224184cc. Revert "net/mlx5e: Use the correct lag ports number when creating TISes" This reverts commit a7e7b40c4bc115dbf2a2bb453d7bbb2e0ea99703. There are some unanswered questions on the list, and we don't have any docs. Given the lack of replies so far and the fact that v6.8 merge window has started - let's revert this and revisit for v6.9. Link: https://lore.kernel.org/all/20231221005721.186607-1-saeed@kernel.org/ Signed-off-by: Jakub Kicinski --- include/linux/mlx5/driver.h | 10 ---------- include/linux/mlx5/mlx5_ifc.h | 24 ++++++------------------ include/linux/mlx5/vport.h | 1 - 3 files changed, 6 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 2bba88c67f58..7ee5b79ff3d6 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -681,7 +681,6 @@ struct mlx5e_resources { struct mlx5_sq_bfreg bfreg; #define MLX5_MAX_NUM_TC 8 u32 tisn[MLX5_MAX_PORTS][MLX5_MAX_NUM_TC]; - bool tisn_valid; } hw_objs; struct net_device *uplink_netdev; struct mutex uplink_netdev_lock; @@ -822,7 +821,6 @@ struct mlx5_core_dev { struct blocking_notifier_head macsec_nh; #endif u64 num_ipsec_offloads; - struct mlx5_sd *sd; }; struct mlx5_db { @@ -1224,14 +1222,6 @@ static inline bool mlx5_core_is_ecpf(const struct mlx5_core_dev *dev) return dev->caps.embedded_cpu; } -static inline bool mlx5_core_is_mgmt_pf(const struct mlx5_core_dev *dev) -{ - if (!MLX5_CAP_GEN_2(dev, local_mng_port_valid)) - return false; - - return MLX5_CAP_GEN_2(dev, local_mng_port); -} - static inline bool mlx5_core_is_ecpf_esw_manager(const struct mlx5_core_dev *dev) { diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 586569209254..fee20fc010c2 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1954,10 +1954,8 @@ enum { struct mlx5_ifc_cmd_hca_cap_2_bits { u8 reserved_at_0[0x80]; - u8 migratable[0x1]; - u8 reserved_at_81[0x19]; - u8 local_mng_port[0x1]; - u8 reserved_at_9b[0x5]; + u8 migratable[0x1]; + u8 reserved_at_81[0x1f]; u8 max_reformat_insert_size[0x8]; u8 max_reformat_insert_offset[0x8]; @@ -1975,13 +1973,7 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 allowed_object_for_other_vhca_access[0x40]; - u8 reserved_at_140[0x20]; - - u8 reserved_at_160[0xa]; - u8 local_mng_port_valid[0x1]; - u8 reserved_at_16b[0x15]; - - u8 reserved_at_180[0x20]; + u8 reserved_at_140[0x60]; u8 flow_table_type_2_type[0x8]; u8 reserved_at_1a8[0x3]; @@ -4038,13 +4030,8 @@ struct mlx5_ifc_nic_vport_context_bits { u8 affiliation_criteria[0x4]; u8 affiliated_vhca_id[0x10]; - u8 reserved_at_60[0xa0]; + u8 reserved_at_60[0xd0]; - u8 reserved_at_100[0x1]; - u8 sd_group[0x3]; - u8 reserved_at_104[0x1c]; - - u8 reserved_at_120[0x10]; u8 mtu[0x10]; u8 system_image_guid[0x40]; @@ -10129,7 +10116,8 @@ struct mlx5_ifc_mpir_reg_bits { u8 reserved_at_20[0x20]; u8 local_port[0x8]; - u8 reserved_at_28[0x18]; + u8 reserved_at_28[0x15]; + u8 sd_group[0x3]; u8 reserved_at_60[0x20]; }; diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index c36cc6d82926..fbb9bf447889 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -72,7 +72,6 @@ int mlx5_query_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 *mtu); int mlx5_modify_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 mtu); int mlx5_query_nic_vport_system_image_guid(struct mlx5_core_dev *mdev, u64 *system_image_guid); -int mlx5_query_nic_vport_sd_group(struct mlx5_core_dev *mdev, u8 *sd_group); int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid); int mlx5_modify_nic_vport_node_guid(struct mlx5_core_dev *mdev, u16 vport, u64 node_guid); -- cgit v1.2.3