From ca34ce29fc4b0e929cc6aada40829d17ab50fee4 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Mon, 8 Aug 2022 09:47:23 -0700 Subject: bpf: Improve docstring for BPF_F_USER_BUILD_ID flag Most tools which use bpf_get_stack or bpf_get_stackid symbolicate the stack - meaning the stack of addresses in the target process' address space is transformed into meaningful symbol names. The BPF_F_USER_BUILD_ID flag eases this process by finding the build_id of the file-backed vma which the address falls in and translating the address to an offset within the backing file. To be more specific, the offset is a "file offset" from the beginning of the backing file. The symbols in ET_DYN ELF objects have a st_value which is also described as an "offset" - but an offset in the process address space, relative to the base address of the object. It's necessary to translate between the "file offset" and "virtual address offset" during symbolication before they can be directly compared. Failure to do so can lead to confusing bugs, so this patch clarifies language in the documentation in an attempt to keep this from happening. Signed-off-by: Dave Marchevsky Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220808164723.3107500-1-davemarchevsky@fb.com --- include/uapi/linux/bpf.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7bf9ba1329be..534e33fb1029 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3008,8 +3008,18 @@ union bpf_attr { * **BPF_F_USER_STACK** * Collect a user space stack instead of a kernel stack. * **BPF_F_USER_BUILD_ID** - * Collect buildid+offset instead of ips for user stack, - * only valid if **BPF_F_USER_STACK** is also specified. + * Collect (build_id, file_offset) instead of ips for user + * stack, only valid if **BPF_F_USER_STACK** is also + * specified. + * + * *file_offset* is an offset relative to the beginning + * of the executable or shared object file backing the vma + * which the *ip* falls in. It is *not* an offset relative + * to that object's base address. Accordingly, it must be + * adjusted by adding (sh_addr - sh_offset), where + * sh_{addr,offset} correspond to the executable section + * containing *file_offset* in the object, for comparisons + * to symbols' st_value to be valid. * * **bpf_get_stack**\ () can collect up to * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject -- cgit v1.2.3 From fa96b24204af42274ec13dfb2f2e6990d7510e55 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Fri, 5 Aug 2022 14:48:14 -0700 Subject: btf: Add a new kfunc flag which allows to mark a function to be sleepable This allows to declare a kfunc as sleepable and prevents its use in a non sleepable program. Signed-off-by: Benjamin Tissoires Co-developed-by: Yosry Ahmed Signed-off-by: Yosry Ahmed Signed-off-by: Hao Luo Link: https://lore.kernel.org/r/20220805214821.1058337-2-haoluo@google.com Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/btf.h b/include/linux/btf.h index cdb376d53238..976cbdd2981f 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -49,6 +49,7 @@ * for this case. */ #define KF_TRUSTED_ARGS (1 << 4) /* kfunc only takes trusted pointer arguments */ +#define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */ struct btf; struct btf_member; -- cgit v1.2.3 From 6e116280b41b0cbfd90dfe9fa66e07ff348d50d5 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 25 Jul 2022 10:51:30 +0200 Subject: net: netfilter: Remove ifdefs for code shared by BPF and ctnetlink The current ifdefry for code shared by the BPF and ctnetlink side looks ugly. As per Pablo's request, simplify this by unconditionally compiling in the code. This can be revisited when the shared code between the two grows further. Suggested-by: Pablo Neira Ayuso Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220725085130.11553-1-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/net/netfilter/nf_conntrack_core.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h index 3cd3a6e631aa..b2b9de70d9f4 100644 --- a/include/net/netfilter/nf_conntrack_core.h +++ b/include/net/netfilter/nf_conntrack_core.h @@ -86,10 +86,6 @@ extern spinlock_t nf_conntrack_expect_lock; /* ctnetlink code shared by both ctnetlink and nf_conntrack_bpf */ -#if (IS_BUILTIN(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \ - (IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES) || \ - IS_ENABLED(CONFIG_NF_CT_NETLINK)) - static inline void __nf_ct_set_timeout(struct nf_conn *ct, u64 timeout) { if (timeout > INT_MAX) @@ -101,6 +97,4 @@ int __nf_ct_change_timeout(struct nf_conn *ct, u64 cta_timeout); void __nf_ct_change_status(struct nf_conn *ct, unsigned long on, unsigned long off); int nf_ct_change_status_common(struct nf_conn *ct, unsigned int status); -#endif - #endif /* _NF_CONNTRACK_CORE_H */ -- cgit v1.2.3 From c8996c98f703b09afe77a1d247dae691c9849dc1 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 9 Aug 2022 08:08:02 +0200 Subject: bpf: Add BPF-helper for accessing CLOCK_TAI Commit 3dc6ffae2da2 ("timekeeping: Introduce fast accessor to clock tai") introduced a fast and NMI-safe accessor for CLOCK_TAI. Especially in time sensitive networks (TSN), where all nodes are synchronized by Precision Time Protocol (PTP), it's helpful to have the possibility to generate timestamps based on CLOCK_TAI instead of CLOCK_MONOTONIC. With a BPF helper for TAI in place, it becomes very convenient to correlate activity across different machines in the network. Use cases for such a BPF helper include functionalities such as Tx launch time (e.g. ETF and TAPRIO Qdiscs) and timestamping. Note: CLOCK_TAI is nothing new per se, only the NMI-safe variant of it is. Signed-off-by: Jesper Dangaard Brouer [Kurt: Wrote changelog and renamed helper] Signed-off-by: Kurt Kanzenbach Link: https://lore.kernel.org/r/20220809060803.5773-2-kurt@linutronix.de Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 13 +++++++++++++ 2 files changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 20c26aed7896..a627a02cf8ab 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2349,6 +2349,7 @@ extern const struct bpf_func_proto bpf_get_numa_node_id_proto; extern const struct bpf_func_proto bpf_tail_call_proto; extern const struct bpf_func_proto bpf_ktime_get_ns_proto; extern const struct bpf_func_proto bpf_ktime_get_boot_ns_proto; +extern const struct bpf_func_proto bpf_ktime_get_tai_ns_proto; extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto; extern const struct bpf_func_proto bpf_get_current_uid_gid_proto; extern const struct bpf_func_proto bpf_get_current_comm_proto; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 534e33fb1029..7d1e2794d83e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5341,6 +5341,18 @@ union bpf_attr { * **-EACCES** if the SYN cookie is not valid. * * **-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin. + * + * u64 bpf_ktime_get_tai_ns(void) + * Description + * A nonsettable system-wide clock derived from wall-clock time but + * ignoring leap seconds. This clock does not experience + * discontinuities and backwards jumps caused by NTP inserting leap + * seconds as CLOCK_REALTIME does. + * + * See: **clock_gettime**\ (**CLOCK_TAI**) + * Return + * Current *ktime*. + * */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5551,6 +5563,7 @@ union bpf_attr { FN(tcp_raw_gen_syncookie_ipv6), \ FN(tcp_raw_check_syncookie_ipv4), \ FN(tcp_raw_check_syncookie_ipv6), \ + FN(ktime_get_tai_ns), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 4dd48c6f1f83290d4bc61b43e61d86f8bc6c310e Mon Sep 17 00:00:00 2001 From: Artem Savkov Date: Wed, 10 Aug 2022 08:59:03 +0200 Subject: bpf: add destructive kfunc flag Add KF_DESTRUCTIVE flag for destructive functions. Functions with this flag set will require CAP_SYS_BOOT capabilities. Signed-off-by: Artem Savkov Link: https://lore.kernel.org/r/20220810065905.475418-2-asavkov@redhat.com Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/btf.h b/include/linux/btf.h index 976cbdd2981f..ad93c2d9cc1c 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -49,7 +49,8 @@ * for this case. */ #define KF_TRUSTED_ARGS (1 << 4) /* kfunc only takes trusted pointer arguments */ -#define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */ +#define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */ +#define KF_DESTRUCTIVE (1 << 6) /* kfunc performs destructive actions */ struct btf; struct btf_member; -- cgit v1.2.3 From 4961d0772578e8737afe61370743f3bc22867111 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Fri, 12 Aug 2022 16:37:27 +0100 Subject: bpf: Clear up confusion in bpf_skb_adjust_room()'s documentation Adding or removing room space _below_ layers 2 or 3, as the description mentions, is ambiguous. This was written with a mental image of the packet with layer 2 at the top, layer 3 under it, and so on. But it has led users to believe that it was on lower layers (before the beginning of the L2 and L3 headers respectively). Let's make it more explicit, and specify between which layers the room space is adjusted. Reported-by: Rumen Telbizov Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220812153727.224500-3-quentin@isovalent.com --- include/uapi/linux/bpf.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7d1e2794d83e..934a2a8beb87 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2573,10 +2573,12 @@ union bpf_attr { * There are two supported modes at this time: * * * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer - * (room space is added or removed below the layer 2 header). + * (room space is added or removed between the layer 2 and + * layer 3 headers). * * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer - * (room space is added or removed below the layer 3 header). + * (room space is added or removed between the layer 3 and + * layer 4 headers). * * The following flags are supported at this time: * -- cgit v1.2.3 From 52327d2e3996f2cac7df6de02623fd8d80fb4c85 Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Mon, 15 Aug 2022 11:04:17 +0800 Subject: net: sched: remove the unused return value of unregister_qdisc Return value of unregister_qdisc is unused, remove it. Signed-off-by: Zhengchao Shao Link: https://lore.kernel.org/r/20220815030417.271894-1-shaozhengchao@huawei.com Signed-off-by: Jakub Kicinski --- include/net/pkt_sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 3372a1f67cf4..29f65632ebc5 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -100,7 +100,7 @@ struct Qdisc *fifo_create_dflt(struct Qdisc *sch, struct Qdisc_ops *ops, struct netlink_ext_ack *extack); int register_qdisc(struct Qdisc_ops *qops); -int unregister_qdisc(struct Qdisc_ops *qops); +void unregister_qdisc(struct Qdisc_ops *qops); void qdisc_get_default(char *id, size_t len); int qdisc_set_default(const char *id); -- cgit v1.2.3 From 0630f64d25a0f0a8c6a9ce9fde8750b3b561e6f5 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 15 Aug 2022 12:07:47 -0700 Subject: net: phy: broadcom: Implement suspend/resume for AC131 and BCM5241 Implement the suspend/resume procedure for the Broadcom AC131 and BCM5241 type of PHYs (10/100 only) by entering the standard power down followed by the proprietary standby mode in the auxiliary mode 4 shadow register. On resume, the PHY software reset is enough to make it come out of standby mode so we can utilize brcm_fet_config_init() as the resume hook. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- include/linux/brcmphy.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index 6ff567ece34a..9e77165f3ef6 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -293,6 +293,7 @@ #define MII_BRCM_FET_SHDW_MC_FAME 0x4000 /* Force Auto MDIX enable */ #define MII_BRCM_FET_SHDW_AUXMODE4 0x1a /* Auxiliary mode 4 */ +#define MII_BRCM_FET_SHDW_AM4_STANDBY 0x0008 /* Standby enable */ #define MII_BRCM_FET_SHDW_AM4_LED_MASK 0x0003 #define MII_BRCM_FET_SHDW_AM4_LED_MODE1 0x0001 -- cgit v1.2.3 From e34cfee65ec891a319ce79797dda18083af33a76 Mon Sep 17 00:00:00 2001 From: Wong Vee Khee Date: Wed, 17 Aug 2022 14:43:24 +0800 Subject: stmmac: intel: remove unused 'has_crossts' flag The 'has_crossts' flag was not used anywhere in the stmmac driver, removing it from both header file and dwmac-intel driver. Signed-off-by: Wong Vee Khee Reviewed-by: Kurt Kanzenbach Link: https://lore.kernel.org/r/20220817064324.10025-1-veekhee@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 8df475db88c0..fb2e88614f5d 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -257,7 +257,6 @@ struct plat_stmmacenet_data { u8 vlan_fail_q; unsigned int eee_usecs_rate; struct pci_dev *pdev; - bool has_crossts; int int_snapshot_num; int ext_snapshot_num; bool int_snapshot_en; -- cgit v1.2.3 From 24426654ed3ae83d1127511891fb782c54f49203 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 16 Aug 2022 23:17:17 -0700 Subject: bpf: net: Avoid sk_setsockopt() taking sk lock when called from bpf Most of the code in bpf_setsockopt(SOL_SOCKET) are duplicated from the sk_setsockopt(). The number of supported optnames are increasing ever and so as the duplicated code. One issue in reusing sk_setsockopt() is that the bpf prog has already acquired the sk lock. This patch adds a has_current_bpf_ctx() to tell if the sk_setsockopt() is called from a bpf prog. The bpf prog calling bpf_setsockopt() is either running in_task() or in_serving_softirq(). Both cases have the current->bpf_ctx initialized. Thus, the has_current_bpf_ctx() only needs to test !!current->bpf_ctx. This patch also adds sockopt_{lock,release}_sock() helpers for sk_setsockopt() to use. These helpers will test has_current_bpf_ctx() before acquiring/releasing the lock. They are in EXPORT_SYMBOL for the ipv6 module to use in a latter patch. Note on the change in sock_setbindtodevice(). sockopt_lock_sock() is done in sock_setbindtodevice() instead of doing the lock_sock in sock_bindtoindex(..., lock_sk = true). Reviewed-by: Stanislav Fomichev Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220817061717.4175589-1-kafai@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 13 +++++++++++++ include/net/sock.h | 3 +++ 2 files changed, 16 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a627a02cf8ab..39bd36359c1e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1966,6 +1966,15 @@ static inline bool unprivileged_ebpf_enabled(void) return !sysctl_unprivileged_bpf_disabled; } +/* Not all bpf prog type has the bpf_ctx. + * For the bpf prog type that has initialized the bpf_ctx, + * this function can be used to decide if a kernel function + * is called by a bpf program. + */ +static inline bool has_current_bpf_ctx(void) +{ + return !!current->bpf_ctx; +} #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { @@ -2175,6 +2184,10 @@ static inline bool unprivileged_ebpf_enabled(void) return false; } +static inline bool has_current_bpf_ctx(void) +{ + return false; +} #endif /* CONFIG_BPF_SYSCALL */ void __bpf_free_used_btfs(struct bpf_prog_aux *aux, diff --git a/include/net/sock.h b/include/net/sock.h index 05a1bbdf5805..352b9458fdc6 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1749,6 +1749,9 @@ static inline void unlock_sock_fast(struct sock *sk, bool slow) } } +void sockopt_lock_sock(struct sock *sk); +void sockopt_release_sock(struct sock *sk); + /* Used by processes to "lock" a socket state, so that * interrupts and bottom half handlers won't change it * from under us. It essentially blocks any incoming -- cgit v1.2.3 From e42c7beee71d0d84a6193357e3525d0cf2a3e168 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 16 Aug 2022 23:17:23 -0700 Subject: bpf: net: Consider has_current_bpf_ctx() when testing capable() in sk_setsockopt() When bpf program calling bpf_setsockopt(SOL_SOCKET), it could be run in softirq and doesn't make sense to do the capable check. There was a similar situation in bpf_setsockopt(TCP_CONGESTION). In commit 8d650cdedaab ("tcp: fix tcp_set_congestion_control() use from bpf hook"), tcp_set_congestion_control(..., cap_net_admin) was added to skip the cap check for bpf prog. This patch adds sockopt_ns_capable() and sockopt_capable() for the sk_setsockopt() to use. They will consider the has_current_bpf_ctx() before doing the ns_capable() and capable() test. They are in EXPORT_SYMBOL for the ipv6 module to use in a latter patch. Suggested-by: Stanislav Fomichev Reviewed-by: Stanislav Fomichev Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220817061723.4175820-1-kafai@fb.com Signed-off-by: Alexei Starovoitov --- include/net/sock.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 352b9458fdc6..13089d88a2e2 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1751,6 +1751,8 @@ static inline void unlock_sock_fast(struct sock *sk, bool slow) void sockopt_lock_sock(struct sock *sk); void sockopt_release_sock(struct sock *sk); +bool sockopt_ns_capable(struct user_namespace *ns, int cap); +bool sockopt_capable(int cap); /* Used by processes to "lock" a socket state, so that * interrupts and bottom half handlers won't change it -- cgit v1.2.3 From 29003875bd5bab262a29d1c6e76a2124bd07e4c2 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 16 Aug 2022 23:18:04 -0700 Subject: bpf: Change bpf_setsockopt(SOL_SOCKET) to reuse sk_setsockopt() After the prep work in the previous patches, this patch removes most of the dup code from bpf_setsockopt(SOL_SOCKET) and reuses them from sk_setsockopt(). The sock ptr test is added to the SO_RCVLOWAT because the sk->sk_socket could be NULL in some of the bpf hooks. The existing optname white-list is refactored into a new function sol_socket_setsockopt(). Reviewed-by: Stanislav Fomichev Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220817061804.4178920-1-kafai@fb.com Signed-off-by: Alexei Starovoitov --- include/net/sock.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 13089d88a2e2..ee44b424d952 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1828,6 +1828,8 @@ void sock_pfree(struct sk_buff *skb); #define sock_edemux sock_efree #endif +int sk_setsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, unsigned int optlen); int sock_setsockopt(struct socket *sock, int level, int op, sockptr_t optval, unsigned int optlen); -- cgit v1.2.3 From 0c751f7071ef98d334ed06ca3f8f4cc1f7458cf5 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 16 Aug 2022 23:18:19 -0700 Subject: bpf: Change bpf_setsockopt(SOL_TCP) to reuse do_tcp_setsockopt() After the prep work in the previous patches, this patch removes all the dup code from bpf_setsockopt(SOL_TCP) and reuses the do_tcp_setsockopt(). The existing optname white-list is refactored into a new function sol_tcp_setsockopt(). The sol_tcp_setsockopt() also calls the bpf_sol_tcp_setsockopt() to handle the TCP_BPF_XXX specific optnames. bpf_setsockopt(TCP_SAVE_SYN) now also allows a value 2 to save the eth header also and it comes for free from do_tcp_setsockopt(). Reviewed-by: Stanislav Fomichev Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220817061819.4180146-1-kafai@fb.com Signed-off-by: Alexei Starovoitov --- include/net/tcp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index d10962b9f0d0..c03a50c72f40 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -405,6 +405,8 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); bool tcp_bpf_bypass_getsockopt(int level, int optname); +int do_tcp_setsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, unsigned int optlen); int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); void tcp_set_keepalive(struct sock *sk, int val); -- cgit v1.2.3 From ee7f1e1302f5cb29168f70827c12855f1d8c9845 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 16 Aug 2022 23:18:26 -0700 Subject: bpf: Change bpf_setsockopt(SOL_IP) to reuse do_ip_setsockopt() After the prep work in the previous patches, this patch removes the dup code from bpf_setsockopt(SOL_IP) and reuses the implementation in do_ip_setsockopt(). The existing optname white-list is refactored into a new function sol_ip_setsockopt(). NOTE, the current bpf_setsockopt(IP_TOS) is quite different from the the do_ip_setsockopt(IP_TOS). For example, it does not take the INET_ECN_MASK into the account for tcp and also does not adjust sk->sk_priority. It looks like the current bpf_setsockopt(IP_TOS) was referencing the IPV6_TCLASS implementation instead of IP_TOS. This patch tries to rectify that by using the do_ip_setsockopt(IP_TOS). While this is a behavior change, the do_ip_setsockopt(IP_TOS) behavior is arguably what the user is expecting. At least, the INET_ECN_MASK bits should be masked out for tcp. Reviewed-by: Stanislav Fomichev Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220817061826.4180990-1-kafai@fb.com Signed-off-by: Alexei Starovoitov --- include/net/ip.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/ip.h b/include/net/ip.h index 1c979fd1904c..34fa5b0f0a0e 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -743,6 +743,8 @@ void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk, int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, bool allow_ipv6); DECLARE_STATIC_KEY_FALSE(ip4_min_ttl); +int do_ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, + unsigned int optlen); int ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval, -- cgit v1.2.3 From 75b64b68ee3f9fe90ad8f21e5c5c92de58abf725 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 16 Aug 2022 23:18:34 -0700 Subject: bpf: Change bpf_setsockopt(SOL_IPV6) to reuse do_ipv6_setsockopt() After the prep work in the previous patches, this patch removes the dup code from bpf_setsockopt(SOL_IPV6) and reuses the implementation in do_ipv6_setsockopt(). ipv6 could be compiled as a module. Like how other code solved it with stubs in ipv6_stubs.h, this patch adds the do_ipv6_setsockopt to the ipv6_bpf_stub. The current bpf_setsockopt(IPV6_TCLASS) does not take the INET_ECN_MASK into the account for tcp. The do_ipv6_setsockopt(IPV6_TCLASS) will handle it correctly. The existing optname white-list is refactored into a new function sol_ipv6_setsockopt(). After this last SOL_IPV6 dup code removal, the __bpf_setsockopt() is simplified enough that the extra "{ }" around the if statement can be removed. Reviewed-by: Stanislav Fomichev Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220817061834.4181198-1-kafai@fb.com Signed-off-by: Alexei Starovoitov --- include/net/ipv6.h | 2 ++ include/net/ipv6_stubs.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index de9dcc5652c4..c110d9032083 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1156,6 +1156,8 @@ struct in6_addr *fl6_update_dst(struct flowi6 *fl6, */ DECLARE_STATIC_KEY_FALSE(ip6_min_hopcount); +int do_ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, + unsigned int optlen); int ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int ipv6_getsockopt(struct sock *sk, int level, int optname, diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h index 45e0339be6fa..8692698b01cf 100644 --- a/include/net/ipv6_stubs.h +++ b/include/net/ipv6_stubs.h @@ -81,6 +81,8 @@ struct ipv6_bpf_stub { const struct in6_addr *daddr, __be16 dport, int dif, int sdif, struct udp_table *tbl, struct sk_buff *skb); + int (*ipv6_setsockopt)(struct sock *sk, int level, int optname, + sockptr_t optval, unsigned int optlen); }; extern const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly; -- cgit v1.2.3 From 5d81757835859760a38a54a87eff856d4e578836 Mon Sep 17 00:00:00 2001 From: Emeel Hakim Date: Thu, 18 Aug 2022 18:32:30 +0300 Subject: net: macsec: Expose MACSEC_SALT_LEN definition to user space Expose MACSEC_SALT_LEN definition to user space to be used in various user space applications such as iproute. Iproute will use this as part of adding macsec extended packet number support. Reviewed-by: Raed Salem Reviewed-by: Sabrina Dubroca Signed-off-by: Emeel Hakim Link: https://lore.kernel.org/r/20220818153229.4721-1-ehakim@nvidia.com Signed-off-by: Jakub Kicinski --- include/net/macsec.h | 1 - include/uapi/linux/if_macsec.h | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/macsec.h b/include/net/macsec.h index d6fa6b97f6ef..73780aa73644 100644 --- a/include/net/macsec.h +++ b/include/net/macsec.h @@ -14,7 +14,6 @@ #define MACSEC_DEFAULT_PN_LEN 4 #define MACSEC_XPN_PN_LEN 8 -#define MACSEC_SALT_LEN 12 #define MACSEC_NUM_AN 4 /* 2 bits for the association number */ typedef u64 __bitwise sci_t; diff --git a/include/uapi/linux/if_macsec.h b/include/uapi/linux/if_macsec.h index 3af2aa069a36..d5b6d1f37353 100644 --- a/include/uapi/linux/if_macsec.h +++ b/include/uapi/linux/if_macsec.h @@ -22,6 +22,8 @@ #define MACSEC_KEYID_LEN 16 +#define MACSEC_SALT_LEN 12 + /* cipher IDs as per IEEE802.1AE-2018 (Table 14-1) */ #define MACSEC_CIPHER_ID_GCM_AES_128 0x0080C20001000001ULL #define MACSEC_CIPHER_ID_GCM_AES_256 0x0080C20001000002ULL -- cgit v1.2.3 From 5e61fe157a27afc7c0d4f7bcbceefdca536c015f Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Wed, 17 Aug 2022 14:32:52 +0200 Subject: net: phy: Introduce QUSGMII PHY mode The QUSGMII mode is a derivative of Cisco's USXGMII standard. This standard is pretty similar to SGMII, but allows for faster speeds, and has the build-in bits for Quad and Octa variants (like QSGMII). The main difference with SGMII/QSGMII is that USXGMII/QUSGMII re-uses the preamble to carry various information, named 'Extensions'. As of today, the USXGMII standard only mentions the "PCH" extension, which is used to convey timestamps, allowing in-band signaling of PTP timestamps without having to modify the frame itself. This commit adds support for that mode. When no extension is in use, it behaves exactly like QSGMII, although it's not compatible with QSGMII. Signed-off-by: Maxime Chevallier Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/phy.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 87638c55d844..9eeab9b9a74c 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -115,6 +115,7 @@ extern const int phy_10gbit_features_array[1]; * @PHY_INTERFACE_MODE_25GBASER: 25G BaseR * @PHY_INTERFACE_MODE_USXGMII: Universal Serial 10GE MII * @PHY_INTERFACE_MODE_10GKR: 10GBASE-KR - with Clause 73 AN + * @PHY_INTERFACE_MODE_QUSGMII: Quad Universal SGMII * @PHY_INTERFACE_MODE_MAX: Book keeping * * Describes the interface between the MAC and PHY. @@ -152,6 +153,7 @@ typedef enum { PHY_INTERFACE_MODE_USXGMII, /* 10GBASE-KR - with Clause 73 AN */ PHY_INTERFACE_MODE_10GKR, + PHY_INTERFACE_MODE_QUSGMII, PHY_INTERFACE_MODE_MAX, } phy_interface_t; @@ -267,6 +269,8 @@ static inline const char *phy_modes(phy_interface_t interface) return "10gbase-kr"; case PHY_INTERFACE_MODE_100BASEX: return "100base-x"; + case PHY_INTERFACE_MODE_QUSGMII: + return "qusgmii"; default: return "unknown"; } -- cgit v1.2.3 From c04ade27cb7b952b6b9b9a0efa0a6129cc63f2ae Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Wed, 17 Aug 2022 14:32:54 +0200 Subject: net: phy: Add helper to derive the number of ports from a phy mode Some phy modes such as QSGMII multiplex several MAC<->PHY links on one single physical interface. QSGMII used to be the only one supported, but other modes such as QUSGMII also carry multiple links. This helper allows getting the number of links that are multiplexed on a given interface. Signed-off-by: Maxime Chevallier Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/linux/phy.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 9eeab9b9a74c..7c49ab95441b 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -968,6 +968,8 @@ struct phy_fixup { const char *phy_speed_to_str(int speed); const char *phy_duplex_to_str(unsigned int duplex); +int phy_interface_num_ports(phy_interface_t interface); + /* A structure for mapping a particular speed and duplex * combination to a particular SUPPORTED and ADVERTISED value */ -- cgit v1.2.3 From 1202cdd665315c525b5237e96e0bedc76d7e754f Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 17 Aug 2022 17:43:21 -0700 Subject: Remove DECnet support from kernel DECnet is an obsolete network protocol that receives more attention from kernel janitors than users. It belongs in computer protocol history museum not in Linux kernel. It has been "Orphaned" in kernel since 2010. The iproute2 support for DECnet was dropped in 5.0 release. The documentation link on Sourceforge says it is abandoned there as well. Leave the UAPI alone to keep userspace programs compiling. This means that there is still an empty neighbour table for AF_DECNET. The table of /proc/sys/net entries was updated to match current directories and reformatted to be alphabetical. Signed-off-by: Stephen Hemminger Acked-by: David Ahern Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 - include/linux/netfilter.h | 5 - include/linux/netfilter_defs.h | 8 -- include/net/dn.h | 231 ---------------------------------- include/net/dn_dev.h | 200 ----------------------------- include/net/dn_fib.h | 169 ------------------------- include/net/dn_neigh.h | 32 ----- include/net/dn_nsp.h | 201 ----------------------------- include/net/dn_route.h | 118 ----------------- include/net/netns/netfilter.h | 3 - include/uapi/linux/dn.h | 149 ---------------------- include/uapi/linux/netfilter_decnet.h | 72 ----------- include/uapi/linux/netlink.h | 2 +- 13 files changed, 1 insertion(+), 1193 deletions(-) delete mode 100644 include/net/dn.h delete mode 100644 include/net/dn_dev.h delete mode 100644 include/net/dn_fib.h delete mode 100644 include/net/dn_neigh.h delete mode 100644 include/net/dn_nsp.h delete mode 100644 include/net/dn_route.h delete mode 100644 include/uapi/linux/dn.h delete mode 100644 include/uapi/linux/netfilter_decnet.h (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1a3cb93c3dcc..64e8662632f8 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1837,7 +1837,6 @@ enum netdev_ml_priv_type { * @tipc_ptr: TIPC specific data * @atalk_ptr: AppleTalk link * @ip_ptr: IPv4 specific data - * @dn_ptr: DECnet specific data * @ip6_ptr: IPv6 specific data * @ax25_ptr: AX.25 specific data * @ieee80211_ptr: IEEE 802.11 specific data, assign before registering @@ -2133,9 +2132,6 @@ struct net_device { #if IS_ENABLED(CONFIG_ATALK) void *atalk_ptr; #endif -#if IS_ENABLED(CONFIG_DECNET) - struct dn_dev __rcu *dn_ptr; -#endif #if IS_ENABLED(CONFIG_AX25) void *ax25_ptr; #endif diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index c2c6f332fb90..d8817d381c14 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -243,11 +243,6 @@ static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net, hook_head = rcu_dereference(net->nf.hooks_bridge[hook]); #endif break; -#if IS_ENABLED(CONFIG_DECNET) - case NFPROTO_DECNET: - hook_head = rcu_dereference(net->nf.hooks_decnet[hook]); - break; -#endif default: WARN_ON_ONCE(1); break; diff --git a/include/linux/netfilter_defs.h b/include/linux/netfilter_defs.h index 8dddfb151f00..a5f7bef1b3a4 100644 --- a/include/linux/netfilter_defs.h +++ b/include/linux/netfilter_defs.h @@ -7,14 +7,6 @@ /* in/out/forward only */ #define NF_ARP_NUMHOOKS 3 -/* max hook is NF_DN_ROUTE (6), also see uapi/linux/netfilter_decnet.h */ -#define NF_DN_NUMHOOKS 7 - -#if IS_ENABLED(CONFIG_DECNET) -/* Largest hook number + 1, see uapi/linux/netfilter_decnet.h */ -#define NF_MAX_HOOKS NF_DN_NUMHOOKS -#else #define NF_MAX_HOOKS NF_INET_NUMHOOKS -#endif #endif diff --git a/include/net/dn.h b/include/net/dn.h deleted file mode 100644 index ba9655b0098a..000000000000 --- a/include/net/dn.h +++ /dev/null @@ -1,231 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _NET_DN_H -#define _NET_DN_H - -#include -#include -#include -#include -#include - -struct dn_scp /* Session Control Port */ -{ - unsigned char state; -#define DN_O 1 /* Open */ -#define DN_CR 2 /* Connect Receive */ -#define DN_DR 3 /* Disconnect Reject */ -#define DN_DRC 4 /* Discon. Rej. Complete*/ -#define DN_CC 5 /* Connect Confirm */ -#define DN_CI 6 /* Connect Initiate */ -#define DN_NR 7 /* No resources */ -#define DN_NC 8 /* No communication */ -#define DN_CD 9 /* Connect Delivery */ -#define DN_RJ 10 /* Rejected */ -#define DN_RUN 11 /* Running */ -#define DN_DI 12 /* Disconnect Initiate */ -#define DN_DIC 13 /* Disconnect Complete */ -#define DN_DN 14 /* Disconnect Notificat */ -#define DN_CL 15 /* Closed */ -#define DN_CN 16 /* Closed Notification */ - - __le16 addrloc; - __le16 addrrem; - __u16 numdat; - __u16 numoth; - __u16 numoth_rcv; - __u16 numdat_rcv; - __u16 ackxmt_dat; - __u16 ackxmt_oth; - __u16 ackrcv_dat; - __u16 ackrcv_oth; - __u8 flowrem_sw; - __u8 flowloc_sw; -#define DN_SEND 2 -#define DN_DONTSEND 1 -#define DN_NOCHANGE 0 - __u16 flowrem_dat; - __u16 flowrem_oth; - __u16 flowloc_dat; - __u16 flowloc_oth; - __u8 services_rem; - __u8 services_loc; - __u8 info_rem; - __u8 info_loc; - - __u16 segsize_rem; - __u16 segsize_loc; - - __u8 nonagle; - __u8 multi_ireq; - __u8 accept_mode; - unsigned long seg_total; /* Running total of current segment */ - - struct optdata_dn conndata_in; - struct optdata_dn conndata_out; - struct optdata_dn discdata_in; - struct optdata_dn discdata_out; - struct accessdata_dn accessdata; - - struct sockaddr_dn addr; /* Local address */ - struct sockaddr_dn peer; /* Remote address */ - - /* - * In this case the RTT estimation is not specified in the - * docs, nor is any back off algorithm. Here we follow well - * known tcp algorithms with a few small variations. - * - * snd_window: Max number of packets we send before we wait for - * an ack to come back. This will become part of a - * more complicated scheme when we support flow - * control. - * - * nsp_srtt: Round-Trip-Time (x8) in jiffies. This is a rolling - * average. - * nsp_rttvar: Round-Trip-Time-Varience (x4) in jiffies. This is the - * varience of the smoothed average (but calculated in - * a simpler way than for normal statistical varience - * calculations). - * - * nsp_rxtshift: Backoff counter. Value is zero normally, each time - * a packet is lost is increases by one until an ack - * is received. Its used to index an array of backoff - * multipliers. - */ -#define NSP_MIN_WINDOW 1 -#define NSP_MAX_WINDOW (0x07fe) - unsigned long max_window; - unsigned long snd_window; -#define NSP_INITIAL_SRTT (HZ) - unsigned long nsp_srtt; -#define NSP_INITIAL_RTTVAR (HZ*3) - unsigned long nsp_rttvar; -#define NSP_MAXRXTSHIFT 12 - unsigned long nsp_rxtshift; - - /* - * Output queues, one for data, one for otherdata/linkservice - */ - struct sk_buff_head data_xmit_queue; - struct sk_buff_head other_xmit_queue; - - /* - * Input queue for other data - */ - struct sk_buff_head other_receive_queue; - int other_report; - - /* - * Stuff to do with the slow timer - */ - unsigned long stamp; /* time of last transmit */ - unsigned long persist; - int (*persist_fxn)(struct sock *sk); - unsigned long keepalive; - void (*keepalive_fxn)(struct sock *sk); - -}; - -static inline struct dn_scp *DN_SK(struct sock *sk) -{ - return (struct dn_scp *)(sk + 1); -} - -/* - * src,dst : Source and Destination DECnet addresses - * hops : Number of hops through the network - * dst_port, src_port : NSP port numbers - * services, info : Useful data extracted from conninit messages - * rt_flags : Routing flags byte - * nsp_flags : NSP layer flags byte - * segsize : Size of segment - * segnum : Number, for data, otherdata and linkservice - * xmit_count : Number of times we've transmitted this skb - * stamp : Time stamp of most recent transmission, used in RTT calculations - * iif: Input interface number - * - * As a general policy, this structure keeps all addresses in network - * byte order, and all else in host byte order. Thus dst, src, dst_port - * and src_port are in network order. All else is in host order. - * - */ -#define DN_SKB_CB(skb) ((struct dn_skb_cb *)(skb)->cb) -struct dn_skb_cb { - __le16 dst; - __le16 src; - __u16 hops; - __le16 dst_port; - __le16 src_port; - __u8 services; - __u8 info; - __u8 rt_flags; - __u8 nsp_flags; - __u16 segsize; - __u16 segnum; - __u16 xmit_count; - unsigned long stamp; - int iif; -}; - -static inline __le16 dn_eth2dn(const unsigned char *ethaddr) -{ - return get_unaligned((__le16 *)(ethaddr + 4)); -} - -static inline __le16 dn_saddr2dn(struct sockaddr_dn *saddr) -{ - return *(__le16 *)saddr->sdn_nodeaddr; -} - -static inline void dn_dn2eth(unsigned char *ethaddr, __le16 addr) -{ - __u16 a = le16_to_cpu(addr); - ethaddr[0] = 0xAA; - ethaddr[1] = 0x00; - ethaddr[2] = 0x04; - ethaddr[3] = 0x00; - ethaddr[4] = (__u8)(a & 0xff); - ethaddr[5] = (__u8)(a >> 8); -} - -static inline void dn_sk_ports_copy(struct flowidn *fld, struct dn_scp *scp) -{ - fld->fld_sport = scp->addrloc; - fld->fld_dport = scp->addrrem; -} - -unsigned int dn_mss_from_pmtu(struct net_device *dev, int mtu); -void dn_register_sysctl(void); -void dn_unregister_sysctl(void); - -#define DN_MENUVER_ACC 0x01 -#define DN_MENUVER_USR 0x02 -#define DN_MENUVER_PRX 0x04 -#define DN_MENUVER_UIC 0x08 - -struct sock *dn_sklist_find_listener(struct sockaddr_dn *addr); -struct sock *dn_find_by_skb(struct sk_buff *skb); -#define DN_ASCBUF_LEN 9 -char *dn_addr2asc(__u16, char *); -int dn_destroy_timer(struct sock *sk); - -int dn_sockaddr2username(struct sockaddr_dn *addr, unsigned char *buf, - unsigned char type); -int dn_username2sockaddr(unsigned char *data, int len, struct sockaddr_dn *addr, - unsigned char *type); - -void dn_start_slow_timer(struct sock *sk); -void dn_stop_slow_timer(struct sock *sk); - -extern __le16 decnet_address; -extern int decnet_debug_level; -extern int decnet_time_wait; -extern int decnet_dn_count; -extern int decnet_di_count; -extern int decnet_dr_count; -extern int decnet_no_fc_max_cwnd; - -extern long sysctl_decnet_mem[3]; -extern int sysctl_decnet_wmem[3]; -extern int sysctl_decnet_rmem[3]; - -#endif /* _NET_DN_H */ diff --git a/include/net/dn_dev.h b/include/net/dn_dev.h deleted file mode 100644 index bec303ea8367..000000000000 --- a/include/net/dn_dev.h +++ /dev/null @@ -1,200 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _NET_DN_DEV_H -#define _NET_DN_DEV_H - -#include - -struct dn_dev; - -struct dn_ifaddr { - struct dn_ifaddr __rcu *ifa_next; - struct dn_dev *ifa_dev; - __le16 ifa_local; - __le16 ifa_address; - __u32 ifa_flags; - __u8 ifa_scope; - char ifa_label[IFNAMSIZ]; - struct rcu_head rcu; -}; - -#define DN_DEV_S_RU 0 /* Run - working normally */ -#define DN_DEV_S_CR 1 /* Circuit Rejected */ -#define DN_DEV_S_DS 2 /* Data Link Start */ -#define DN_DEV_S_RI 3 /* Routing Layer Initialize */ -#define DN_DEV_S_RV 4 /* Routing Layer Verify */ -#define DN_DEV_S_RC 5 /* Routing Layer Complete */ -#define DN_DEV_S_OF 6 /* Off */ -#define DN_DEV_S_HA 7 /* Halt */ - - -/* - * The dn_dev_parms structure contains the set of parameters - * for each device (hence inclusion in the dn_dev structure) - * and an array is used to store the default types of supported - * device (in dn_dev.c). - * - * The type field matches the ARPHRD_ constants and is used in - * searching the list for supported devices when new devices - * come up. - * - * The mode field is used to find out if a device is broadcast, - * multipoint, or pointopoint. Please note that DECnet thinks - * different ways about devices to the rest of the kernel - * so the normal IFF_xxx flags are invalid here. For devices - * which can be any combination of the previously mentioned - * attributes, you can set this on a per device basis by - * installing an up() routine. - * - * The device state field, defines the initial state in which the - * device will come up. In the dn_dev structure, it is the actual - * state. - * - * Things have changed here. I've killed timer1 since it's a user space - * issue for a user space routing deamon to sort out. The kernel does - * not need to be bothered with it. - * - * Timers: - * t2 - Rate limit timer, min time between routing and hello messages - * t3 - Hello timer, send hello messages when it expires - * - * Callbacks: - * up() - Called to initialize device, return value can veto use of - * device with DECnet. - * down() - Called to turn device off when it goes down - * timer3() - Called once for each ifaddr when timer 3 goes off - * - * sysctl - Hook for sysctl things - * - */ -struct dn_dev_parms { - int type; /* ARPHRD_xxx */ - int mode; /* Broadcast, Unicast, Mulitpoint */ -#define DN_DEV_BCAST 1 -#define DN_DEV_UCAST 2 -#define DN_DEV_MPOINT 4 - int state; /* Initial state */ - int forwarding; /* 0=EndNode, 1=L1Router, 2=L2Router */ - unsigned long t2; /* Default value of t2 */ - unsigned long t3; /* Default value of t3 */ - int priority; /* Priority to be a router */ - char *name; /* Name for sysctl */ - int (*up)(struct net_device *); - void (*down)(struct net_device *); - void (*timer3)(struct net_device *, struct dn_ifaddr *ifa); - void *sysctl; -}; - - -struct dn_dev { - struct dn_ifaddr __rcu *ifa_list; - struct net_device *dev; - struct dn_dev_parms parms; - char use_long; - struct timer_list timer; - unsigned long t3; - struct neigh_parms *neigh_parms; - __u8 addr[ETH_ALEN]; - struct neighbour *router; /* Default router on circuit */ - struct neighbour *peer; /* Peer on pointopoint links */ - unsigned long uptime; /* Time device went up in jiffies */ -}; - -struct dn_short_packet { - __u8 msgflg; - __le16 dstnode; - __le16 srcnode; - __u8 forward; -} __packed; - -struct dn_long_packet { - __u8 msgflg; - __u8 d_area; - __u8 d_subarea; - __u8 d_id[6]; - __u8 s_area; - __u8 s_subarea; - __u8 s_id[6]; - __u8 nl2; - __u8 visit_ct; - __u8 s_class; - __u8 pt; -} __packed; - -/*------------------------- DRP - Routing messages ---------------------*/ - -struct endnode_hello_message { - __u8 msgflg; - __u8 tiver[3]; - __u8 id[6]; - __u8 iinfo; - __le16 blksize; - __u8 area; - __u8 seed[8]; - __u8 neighbor[6]; - __le16 timer; - __u8 mpd; - __u8 datalen; - __u8 data[2]; -} __packed; - -struct rtnode_hello_message { - __u8 msgflg; - __u8 tiver[3]; - __u8 id[6]; - __u8 iinfo; - __le16 blksize; - __u8 priority; - __u8 area; - __le16 timer; - __u8 mpd; -} __packed; - - -void dn_dev_init(void); -void dn_dev_cleanup(void); - -int dn_dev_ioctl(unsigned int cmd, void __user *arg); - -void dn_dev_devices_off(void); -void dn_dev_devices_on(void); - -void dn_dev_init_pkt(struct sk_buff *skb); -void dn_dev_veri_pkt(struct sk_buff *skb); -void dn_dev_hello(struct sk_buff *skb); - -void dn_dev_up(struct net_device *); -void dn_dev_down(struct net_device *); - -int dn_dev_set_default(struct net_device *dev, int force); -struct net_device *dn_dev_get_default(void); -int dn_dev_bind_default(__le16 *addr); - -int register_dnaddr_notifier(struct notifier_block *nb); -int unregister_dnaddr_notifier(struct notifier_block *nb); - -static inline int dn_dev_islocal(struct net_device *dev, __le16 addr) -{ - struct dn_dev *dn_db; - struct dn_ifaddr *ifa; - int res = 0; - - rcu_read_lock(); - dn_db = rcu_dereference(dev->dn_ptr); - if (dn_db == NULL) { - printk(KERN_DEBUG "dn_dev_islocal: Called for non DECnet device\n"); - goto out; - } - - for (ifa = rcu_dereference(dn_db->ifa_list); - ifa != NULL; - ifa = rcu_dereference(ifa->ifa_next)) - if ((addr ^ ifa->ifa_local) == 0) { - res = 1; - break; - } -out: - rcu_read_unlock(); - return res; -} - -#endif /* _NET_DN_DEV_H */ diff --git a/include/net/dn_fib.h b/include/net/dn_fib.h deleted file mode 100644 index 1929a3cd5ebe..000000000000 --- a/include/net/dn_fib.h +++ /dev/null @@ -1,169 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _NET_DN_FIB_H -#define _NET_DN_FIB_H - -#include -#include -#include -#include - -extern const struct nla_policy rtm_dn_policy[]; - -struct dn_fib_res { - struct fib_rule *r; - struct dn_fib_info *fi; - unsigned char prefixlen; - unsigned char nh_sel; - unsigned char type; - unsigned char scope; -}; - -struct dn_fib_nh { - struct net_device *nh_dev; - unsigned int nh_flags; - unsigned char nh_scope; - int nh_weight; - int nh_power; - int nh_oif; - __le16 nh_gw; -}; - -struct dn_fib_info { - struct dn_fib_info *fib_next; - struct dn_fib_info *fib_prev; - refcount_t fib_treeref; - refcount_t fib_clntref; - int fib_dead; - unsigned int fib_flags; - int fib_protocol; - __le16 fib_prefsrc; - __u32 fib_priority; - __u32 fib_metrics[RTAX_MAX]; - int fib_nhs; - int fib_power; - struct dn_fib_nh fib_nh[0]; -#define dn_fib_dev fib_nh[0].nh_dev -}; - - -#define DN_FIB_RES_RESET(res) ((res).nh_sel = 0) -#define DN_FIB_RES_NH(res) ((res).fi->fib_nh[(res).nh_sel]) - -#define DN_FIB_RES_PREFSRC(res) ((res).fi->fib_prefsrc ? : __dn_fib_res_prefsrc(&res)) -#define DN_FIB_RES_GW(res) (DN_FIB_RES_NH(res).nh_gw) -#define DN_FIB_RES_DEV(res) (DN_FIB_RES_NH(res).nh_dev) -#define DN_FIB_RES_OIF(res) (DN_FIB_RES_NH(res).nh_oif) - -typedef struct { - __le16 datum; -} dn_fib_key_t; - -typedef struct { - __le16 datum; -} dn_fib_hash_t; - -typedef struct { - __u16 datum; -} dn_fib_idx_t; - -struct dn_fib_node { - struct dn_fib_node *fn_next; - struct dn_fib_info *fn_info; -#define DN_FIB_INFO(f) ((f)->fn_info) - dn_fib_key_t fn_key; - u8 fn_type; - u8 fn_scope; - u8 fn_state; -}; - - -struct dn_fib_table { - struct hlist_node hlist; - u32 n; - - int (*insert)(struct dn_fib_table *t, struct rtmsg *r, - struct nlattr *attrs[], struct nlmsghdr *n, - struct netlink_skb_parms *req); - int (*delete)(struct dn_fib_table *t, struct rtmsg *r, - struct nlattr *attrs[], struct nlmsghdr *n, - struct netlink_skb_parms *req); - int (*lookup)(struct dn_fib_table *t, const struct flowidn *fld, - struct dn_fib_res *res); - int (*flush)(struct dn_fib_table *t); - int (*dump)(struct dn_fib_table *t, struct sk_buff *skb, struct netlink_callback *cb); - - unsigned char data[]; -}; - -#ifdef CONFIG_DECNET_ROUTER -/* - * dn_fib.c - */ -void dn_fib_init(void); -void dn_fib_cleanup(void); - -int dn_fib_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); -struct dn_fib_info *dn_fib_create_info(const struct rtmsg *r, - struct nlattr *attrs[], - const struct nlmsghdr *nlh, int *errp); -int dn_fib_semantic_match(int type, struct dn_fib_info *fi, - const struct flowidn *fld, struct dn_fib_res *res); -void dn_fib_release_info(struct dn_fib_info *fi); -void dn_fib_flush(void); -void dn_fib_select_multipath(const struct flowidn *fld, struct dn_fib_res *res); - -/* - * dn_tables.c - */ -struct dn_fib_table *dn_fib_get_table(u32 n, int creat); -struct dn_fib_table *dn_fib_empty_table(void); -void dn_fib_table_init(void); -void dn_fib_table_cleanup(void); - -/* - * dn_rules.c - */ -void dn_fib_rules_init(void); -void dn_fib_rules_cleanup(void); -unsigned int dnet_addr_type(__le16 addr); -int dn_fib_lookup(struct flowidn *fld, struct dn_fib_res *res); - -int dn_fib_dump(struct sk_buff *skb, struct netlink_callback *cb); - -void dn_fib_free_info(struct dn_fib_info *fi); - -static inline void dn_fib_info_put(struct dn_fib_info *fi) -{ - if (refcount_dec_and_test(&fi->fib_clntref)) - dn_fib_free_info(fi); -} - -static inline void dn_fib_res_put(struct dn_fib_res *res) -{ - if (res->fi) - dn_fib_info_put(res->fi); - if (res->r) - fib_rule_put(res->r); -} - -#else /* Endnode */ - -#define dn_fib_init() do { } while(0) -#define dn_fib_cleanup() do { } while(0) - -#define dn_fib_lookup(fl, res) (-ESRCH) -#define dn_fib_info_put(fi) do { } while(0) -#define dn_fib_select_multipath(fl, res) do { } while(0) -#define dn_fib_rules_policy(saddr,res,flags) (0) -#define dn_fib_res_put(res) do { } while(0) - -#endif /* CONFIG_DECNET_ROUTER */ - -static inline __le16 dnet_make_mask(int n) -{ - if (n) - return cpu_to_le16(~((1 << (16 - n)) - 1)); - return cpu_to_le16(0); -} - -#endif /* _NET_DN_FIB_H */ diff --git a/include/net/dn_neigh.h b/include/net/dn_neigh.h deleted file mode 100644 index 1f7df98bfc33..000000000000 --- a/include/net/dn_neigh.h +++ /dev/null @@ -1,32 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _NET_DN_NEIGH_H -#define _NET_DN_NEIGH_H - -#include - -/* - * The position of the first two fields of - * this structure are critical - SJW - */ -struct dn_neigh { - struct neighbour n; - __le16 addr; - unsigned long flags; -#define DN_NDFLAG_R1 0x0001 /* Router L1 */ -#define DN_NDFLAG_R2 0x0002 /* Router L2 */ -#define DN_NDFLAG_P3 0x0004 /* Phase III Node */ - unsigned long blksize; - __u8 priority; -}; - -void dn_neigh_init(void); -void dn_neigh_cleanup(void); -int dn_neigh_router_hello(struct net *net, struct sock *sk, struct sk_buff *skb); -int dn_neigh_endnode_hello(struct net *net, struct sock *sk, struct sk_buff *skb); -void dn_neigh_pointopoint_hello(struct sk_buff *skb); -int dn_neigh_elist(struct net_device *dev, unsigned char *ptr, int n); -int dn_to_neigh_output(struct net *net, struct sock *sk, struct sk_buff *skb); - -extern struct neigh_table dn_neigh_table; - -#endif /* _NET_DN_NEIGH_H */ diff --git a/include/net/dn_nsp.h b/include/net/dn_nsp.h deleted file mode 100644 index a4a18fee0b7c..000000000000 --- a/include/net/dn_nsp.h +++ /dev/null @@ -1,201 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -#ifndef _NET_DN_NSP_H -#define _NET_DN_NSP_H -/****************************************************************************** - (c) 1995-1998 E.M. Serrat emserrat@geocities.com - -*******************************************************************************/ -/* dn_nsp.c functions prototyping */ -#include -#include -#include - -struct sk_buff; -struct sk_buff_head; - -void dn_nsp_send_data_ack(struct sock *sk); -void dn_nsp_send_oth_ack(struct sock *sk); -void dn_send_conn_ack(struct sock *sk); -void dn_send_conn_conf(struct sock *sk, gfp_t gfp); -void dn_nsp_send_disc(struct sock *sk, unsigned char type, - unsigned short reason, gfp_t gfp); -void dn_nsp_return_disc(struct sk_buff *skb, unsigned char type, - unsigned short reason); -void dn_nsp_send_link(struct sock *sk, unsigned char lsflags, char fcval); -void dn_nsp_send_conninit(struct sock *sk, unsigned char flags); - -void dn_nsp_output(struct sock *sk); -int dn_nsp_check_xmit_queue(struct sock *sk, struct sk_buff *skb, - struct sk_buff_head *q, unsigned short acknum); -void dn_nsp_queue_xmit(struct sock *sk, struct sk_buff *skb, gfp_t gfp, - int oob); -unsigned long dn_nsp_persist(struct sock *sk); -int dn_nsp_xmit_timeout(struct sock *sk); - -int dn_nsp_rx(struct sk_buff *); -int dn_nsp_backlog_rcv(struct sock *sk, struct sk_buff *skb); - -struct sk_buff *dn_alloc_skb(struct sock *sk, int size, gfp_t pri); -struct sk_buff *dn_alloc_send_skb(struct sock *sk, size_t *size, int noblock, - long timeo, int *err); - -#define NSP_REASON_OK 0 /* No error */ -#define NSP_REASON_NR 1 /* No resources */ -#define NSP_REASON_UN 2 /* Unrecognised node name */ -#define NSP_REASON_SD 3 /* Node shutting down */ -#define NSP_REASON_ID 4 /* Invalid destination end user */ -#define NSP_REASON_ER 5 /* End user lacks resources */ -#define NSP_REASON_OB 6 /* Object too busy */ -#define NSP_REASON_US 7 /* Unspecified error */ -#define NSP_REASON_TP 8 /* Third-Party abort */ -#define NSP_REASON_EA 9 /* End user has aborted the link */ -#define NSP_REASON_IF 10 /* Invalid node name format */ -#define NSP_REASON_LS 11 /* Local node shutdown */ -#define NSP_REASON_LL 32 /* Node lacks logical-link resources */ -#define NSP_REASON_LE 33 /* End user lacks logical-link resources */ -#define NSP_REASON_UR 34 /* Unacceptable RQSTRID or PASSWORD field */ -#define NSP_REASON_UA 36 /* Unacceptable ACCOUNT field */ -#define NSP_REASON_TM 38 /* End user timed out logical link */ -#define NSP_REASON_NU 39 /* Node unreachable */ -#define NSP_REASON_NL 41 /* No-link message */ -#define NSP_REASON_DC 42 /* Disconnect confirm */ -#define NSP_REASON_IO 43 /* Image data field overflow */ - -#define NSP_DISCINIT 0x38 -#define NSP_DISCCONF 0x48 - -/*------------------------- NSP - messages ------------------------------*/ -/* Data Messages */ -/*---------------*/ - -/* Data Messages (data segment/interrupt/link service) */ - -struct nsp_data_seg_msg { - __u8 msgflg; - __le16 dstaddr; - __le16 srcaddr; -} __packed; - -struct nsp_data_opt_msg { - __le16 acknum; - __le16 segnum; - __le16 lsflgs; -} __packed; - -struct nsp_data_opt_msg1 { - __le16 acknum; - __le16 segnum; -} __packed; - - -/* Acknowledgment Message (data/other data) */ -struct nsp_data_ack_msg { - __u8 msgflg; - __le16 dstaddr; - __le16 srcaddr; - __le16 acknum; -} __packed; - -/* Connect Acknowledgment Message */ -struct nsp_conn_ack_msg { - __u8 msgflg; - __le16 dstaddr; -} __packed; - - -/* Connect Initiate/Retransmit Initiate/Connect Confirm */ -struct nsp_conn_init_msg { - __u8 msgflg; -#define NSP_CI 0x18 /* Connect Initiate */ -#define NSP_RCI 0x68 /* Retrans. Conn Init */ - __le16 dstaddr; - __le16 srcaddr; - __u8 services; -#define NSP_FC_NONE 0x00 /* Flow Control None */ -#define NSP_FC_SRC 0x04 /* Seg Req. Count */ -#define NSP_FC_SCMC 0x08 /* Sess. Control Mess */ -#define NSP_FC_MASK 0x0c /* FC type mask */ - __u8 info; - __le16 segsize; -} __packed; - -/* Disconnect Initiate/Disconnect Confirm */ -struct nsp_disconn_init_msg { - __u8 msgflg; - __le16 dstaddr; - __le16 srcaddr; - __le16 reason; -} __packed; - - - -struct srcobj_fmt { - __u8 format; - __u8 task; - __le16 grpcode; - __le16 usrcode; - __u8 dlen; -} __packed; - -/* - * A collection of functions for manipulating the sequence - * numbers used in NSP. Similar in operation to the functions - * of the same name in TCP. - */ -static __inline__ int dn_before(__u16 seq1, __u16 seq2) -{ - seq1 &= 0x0fff; - seq2 &= 0x0fff; - - return (int)((seq1 - seq2) & 0x0fff) > 2048; -} - - -static __inline__ int dn_after(__u16 seq1, __u16 seq2) -{ - seq1 &= 0x0fff; - seq2 &= 0x0fff; - - return (int)((seq2 - seq1) & 0x0fff) > 2048; -} - -static __inline__ int dn_equal(__u16 seq1, __u16 seq2) -{ - return ((seq1 ^ seq2) & 0x0fff) == 0; -} - -static __inline__ int dn_before_or_equal(__u16 seq1, __u16 seq2) -{ - return (dn_before(seq1, seq2) || dn_equal(seq1, seq2)); -} - -static __inline__ void seq_add(__u16 *seq, __u16 off) -{ - (*seq) += off; - (*seq) &= 0x0fff; -} - -static __inline__ int seq_next(__u16 seq1, __u16 seq2) -{ - return dn_equal(seq1 + 1, seq2); -} - -/* - * Can we delay the ack ? - */ -static __inline__ int sendack(__u16 seq) -{ - return (int)((seq & 0x1000) ? 0 : 1); -} - -/* - * Is socket congested ? - */ -static __inline__ int dn_congested(struct sock *sk) -{ - return atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1); -} - -#define DN_MAX_NSP_DATA_HEADER (11) - -#endif /* _NET_DN_NSP_H */ diff --git a/include/net/dn_route.h b/include/net/dn_route.h deleted file mode 100644 index 88c0300236cc..000000000000 --- a/include/net/dn_route.h +++ /dev/null @@ -1,118 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -#ifndef _NET_DN_ROUTE_H -#define _NET_DN_ROUTE_H - -/****************************************************************************** - (c) 1995-1998 E.M. Serrat emserrat@geocities.com - -*******************************************************************************/ - -#include -#include - -struct sk_buff *dn_alloc_skb(struct sock *sk, int size, gfp_t pri); -int dn_route_output_sock(struct dst_entry __rcu **pprt, struct flowidn *, - struct sock *sk, int flags); -int dn_cache_dump(struct sk_buff *skb, struct netlink_callback *cb); -void dn_rt_cache_flush(int delay); -int dn_route_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, struct net_device *orig_dev); - -/* Masks for flags field */ -#define DN_RT_F_PID 0x07 /* Mask for packet type */ -#define DN_RT_F_PF 0x80 /* Padding Follows */ -#define DN_RT_F_VER 0x40 /* Version =0 discard packet if ==1 */ -#define DN_RT_F_IE 0x20 /* Intra Ethernet, Reserved in short pkt */ -#define DN_RT_F_RTS 0x10 /* Packet is being returned to sender */ -#define DN_RT_F_RQR 0x08 /* Return packet to sender upon non-delivery */ - -/* Mask for types of routing packets */ -#define DN_RT_PKT_MSK 0x06 -/* Types of routing packets */ -#define DN_RT_PKT_SHORT 0x02 /* Short routing packet */ -#define DN_RT_PKT_LONG 0x06 /* Long routing packet */ - -/* Mask for control/routing selection */ -#define DN_RT_PKT_CNTL 0x01 /* Set to 1 if a control packet */ -/* Types of control packets */ -#define DN_RT_CNTL_MSK 0x0f /* Mask for control packets */ -#define DN_RT_PKT_INIT 0x01 /* Initialisation packet */ -#define DN_RT_PKT_VERI 0x03 /* Verification Message */ -#define DN_RT_PKT_HELO 0x05 /* Hello and Test Message */ -#define DN_RT_PKT_L1RT 0x07 /* Level 1 Routing Message */ -#define DN_RT_PKT_L2RT 0x09 /* Level 2 Routing Message */ -#define DN_RT_PKT_ERTH 0x0b /* Ethernet Router Hello */ -#define DN_RT_PKT_EEDH 0x0d /* Ethernet EndNode Hello */ - -/* Values for info field in hello message */ -#define DN_RT_INFO_TYPE 0x03 /* Type mask */ -#define DN_RT_INFO_L1RT 0x02 /* L1 Router */ -#define DN_RT_INFO_L2RT 0x01 /* L2 Router */ -#define DN_RT_INFO_ENDN 0x03 /* EndNode */ -#define DN_RT_INFO_VERI 0x04 /* Verification Reqd. */ -#define DN_RT_INFO_RJCT 0x08 /* Reject Flag, Reserved */ -#define DN_RT_INFO_VFLD 0x10 /* Verification Failed, Reserved */ -#define DN_RT_INFO_NOML 0x20 /* No Multicast traffic accepted */ -#define DN_RT_INFO_BLKR 0x40 /* Blocking Requested */ - -/* - * The fl structure is what we used to look up the route. - * The rt_saddr & rt_daddr entries are the same as key.saddr & key.daddr - * except for local input routes, where the rt_saddr = fl.fld_dst and - * rt_daddr = fl.fld_src to allow the route to be used for returning - * packets to the originating host. - */ -struct dn_route { - struct dst_entry dst; - struct dn_route __rcu *dn_next; - - struct neighbour *n; - - struct flowidn fld; - - __le16 rt_saddr; - __le16 rt_daddr; - __le16 rt_gateway; - __le16 rt_local_src; /* Source used for forwarding packets */ - __le16 rt_src_map; - __le16 rt_dst_map; - - unsigned int rt_flags; - unsigned int rt_type; -}; - -static inline bool dn_is_input_route(struct dn_route *rt) -{ - return rt->fld.flowidn_iif != 0; -} - -static inline bool dn_is_output_route(struct dn_route *rt) -{ - return rt->fld.flowidn_iif == 0; -} - -void dn_route_init(void); -void dn_route_cleanup(void); - -#include -#include - -static inline void dn_rt_send(struct sk_buff *skb) -{ - dev_queue_xmit(skb); -} - -static inline void dn_rt_finish_output(struct sk_buff *skb, char *dst, char *src) -{ - struct net_device *dev = skb->dev; - - if ((dev->type != ARPHRD_ETHER) && (dev->type != ARPHRD_LOOPBACK)) - dst = NULL; - - if (dev_hard_header(skb, dev, ETH_P_DNA_RT, dst, src, skb->len) >= 0) - dn_rt_send(skb); - else - kfree_skb(skb); -} - -#endif /* _NET_DN_ROUTE_H */ diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h index b593f95e9991..02bbdc577f8e 100644 --- a/include/net/netns/netfilter.h +++ b/include/net/netns/netfilter.h @@ -24,9 +24,6 @@ struct netns_nf { #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE struct nf_hook_entries __rcu *hooks_bridge[NF_INET_NUMHOOKS]; #endif -#if IS_ENABLED(CONFIG_DECNET) - struct nf_hook_entries __rcu *hooks_decnet[NF_DN_NUMHOOKS]; -#endif #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) unsigned int defrag_ipv4_users; #endif diff --git a/include/uapi/linux/dn.h b/include/uapi/linux/dn.h deleted file mode 100644 index 36ca71bd8bbe..000000000000 --- a/include/uapi/linux/dn.h +++ /dev/null @@ -1,149 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _LINUX_DN_H -#define _LINUX_DN_H - -#include -#include -#include - -/* - - DECnet Data Structures and Constants - -*/ - -/* - * DNPROTO_NSP can't be the same as SOL_SOCKET, - * so increment each by one (compared to ULTRIX) - */ -#define DNPROTO_NSP 2 /* NSP protocol number */ -#define DNPROTO_ROU 3 /* Routing protocol number */ -#define DNPROTO_NML 4 /* Net mgt protocol number */ -#define DNPROTO_EVL 5 /* Evl protocol number (usr) */ -#define DNPROTO_EVR 6 /* Evl protocol number (evl) */ -#define DNPROTO_NSPT 7 /* NSP trace protocol number */ - - -#define DN_ADDL 2 -#define DN_MAXADDL 2 /* ULTRIX headers have 20 here, but pathworks has 2 */ -#define DN_MAXOPTL 16 -#define DN_MAXOBJL 16 -#define DN_MAXACCL 40 -#define DN_MAXALIASL 128 -#define DN_MAXNODEL 256 -#define DNBUFSIZE 65023 - -/* - * SET/GET Socket options - must match the DSO_ numbers below - */ -#define SO_CONDATA 1 -#define SO_CONACCESS 2 -#define SO_PROXYUSR 3 -#define SO_LINKINFO 7 - -#define DSO_CONDATA 1 /* Set/Get connect data */ -#define DSO_DISDATA 10 /* Set/Get disconnect data */ -#define DSO_CONACCESS 2 /* Set/Get connect access data */ -#define DSO_ACCEPTMODE 4 /* Set/Get accept mode */ -#define DSO_CONACCEPT 5 /* Accept deferred connection */ -#define DSO_CONREJECT 6 /* Reject deferred connection */ -#define DSO_LINKINFO 7 /* Set/Get link information */ -#define DSO_STREAM 8 /* Set socket type to stream */ -#define DSO_SEQPACKET 9 /* Set socket type to sequenced packet */ -#define DSO_MAXWINDOW 11 /* Maximum window size allowed */ -#define DSO_NODELAY 12 /* Turn off nagle */ -#define DSO_CORK 13 /* Wait for more data! */ -#define DSO_SERVICES 14 /* NSP Services field */ -#define DSO_INFO 15 /* NSP Info field */ -#define DSO_MAX 15 /* Maximum option number */ - - -/* LINK States */ -#define LL_INACTIVE 0 -#define LL_CONNECTING 1 -#define LL_RUNNING 2 -#define LL_DISCONNECTING 3 - -#define ACC_IMMED 0 -#define ACC_DEFER 1 - -#define SDF_WILD 1 /* Wild card object */ -#define SDF_PROXY 2 /* Addr eligible for proxy */ -#define SDF_UICPROXY 4 /* Use uic-based proxy */ - -/* Structures */ - - -struct dn_naddr { - __le16 a_len; - __u8 a_addr[DN_MAXADDL]; /* Two bytes little endian */ -}; - -struct sockaddr_dn { - __u16 sdn_family; - __u8 sdn_flags; - __u8 sdn_objnum; - __le16 sdn_objnamel; - __u8 sdn_objname[DN_MAXOBJL]; - struct dn_naddr sdn_add; -}; -#define sdn_nodeaddrl sdn_add.a_len /* Node address length */ -#define sdn_nodeaddr sdn_add.a_addr /* Node address */ - - - -/* - * DECnet set/get DSO_CONDATA, DSO_DISDATA (optional data) structure - */ -struct optdata_dn { - __le16 opt_status; /* Extended status return */ -#define opt_sts opt_status - __le16 opt_optl; /* Length of user data */ - __u8 opt_data[16]; /* User data */ -}; - -struct accessdata_dn { - __u8 acc_accl; - __u8 acc_acc[DN_MAXACCL]; - __u8 acc_passl; - __u8 acc_pass[DN_MAXACCL]; - __u8 acc_userl; - __u8 acc_user[DN_MAXACCL]; -}; - -/* - * DECnet logical link information structure - */ -struct linkinfo_dn { - __u16 idn_segsize; /* Segment size for link */ - __u8 idn_linkstate; /* Logical link state */ -}; - -/* - * Ethernet address format (for DECnet) - */ -union etheraddress { - __u8 dne_addr[ETH_ALEN]; /* Full ethernet address */ - struct { - __u8 dne_hiord[4]; /* DECnet HIORD prefix */ - __u8 dne_nodeaddr[2]; /* DECnet node address */ - } dne_remote; -}; - - -/* - * DECnet physical socket address format - */ -struct dn_addr { - __le16 dna_family; /* AF_DECnet */ - union etheraddress dna_netaddr; /* DECnet ethernet address */ -}; - -#define DECNET_IOCTL_BASE 0x89 /* PROTOPRIVATE range */ - -#define SIOCSNETADDR _IOW(DECNET_IOCTL_BASE, 0xe0, struct dn_naddr) -#define SIOCGNETADDR _IOR(DECNET_IOCTL_BASE, 0xe1, struct dn_naddr) -#define OSIOCSNETADDR _IOW(DECNET_IOCTL_BASE, 0xe0, int) -#define OSIOCGNETADDR _IOR(DECNET_IOCTL_BASE, 0xe1, int) - -#endif /* _LINUX_DN_H */ diff --git a/include/uapi/linux/netfilter_decnet.h b/include/uapi/linux/netfilter_decnet.h deleted file mode 100644 index 3c77f54560f2..000000000000 --- a/include/uapi/linux/netfilter_decnet.h +++ /dev/null @@ -1,72 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef __LINUX_DECNET_NETFILTER_H -#define __LINUX_DECNET_NETFILTER_H - -/* DECnet-specific defines for netfilter. - * This file (C) Steve Whitehouse 1999 derived from the - * ipv4 netfilter header file which is - * (C)1998 Rusty Russell -- This code is GPL. - */ - -#include - -/* only for userspace compatibility */ -#ifndef __KERNEL__ - -#include /* for INT_MIN, INT_MAX */ - -/* kernel define is in netfilter_defs.h */ -#define NF_DN_NUMHOOKS 7 -#endif /* ! __KERNEL__ */ - -/* DECnet Hooks */ -/* After promisc drops, checksum checks. */ -#define NF_DN_PRE_ROUTING 0 -/* If the packet is destined for this box. */ -#define NF_DN_LOCAL_IN 1 -/* If the packet is destined for another interface. */ -#define NF_DN_FORWARD 2 -/* Packets coming from a local process. */ -#define NF_DN_LOCAL_OUT 3 -/* Packets about to hit the wire. */ -#define NF_DN_POST_ROUTING 4 -/* Input Hello Packets */ -#define NF_DN_HELLO 5 -/* Input Routing Packets */ -#define NF_DN_ROUTE 6 - -enum nf_dn_hook_priorities { - NF_DN_PRI_FIRST = INT_MIN, - NF_DN_PRI_CONNTRACK = -200, - NF_DN_PRI_MANGLE = -150, - NF_DN_PRI_NAT_DST = -100, - NF_DN_PRI_FILTER = 0, - NF_DN_PRI_NAT_SRC = 100, - NF_DN_PRI_DNRTMSG = 200, - NF_DN_PRI_LAST = INT_MAX, -}; - -struct nf_dn_rtmsg { - int nfdn_ifindex; -}; - -#define NFDN_RTMSG(r) ((unsigned char *)(r) + NLMSG_ALIGN(sizeof(struct nf_dn_rtmsg))) - -#ifndef __KERNEL__ -/* backwards compatibility for userspace */ -#define DNRMG_L1_GROUP 0x01 -#define DNRMG_L2_GROUP 0x02 -#endif - -enum { - DNRNG_NLGRP_NONE, -#define DNRNG_NLGRP_NONE DNRNG_NLGRP_NONE - DNRNG_NLGRP_L1, -#define DNRNG_NLGRP_L1 DNRNG_NLGRP_L1 - DNRNG_NLGRP_L2, -#define DNRNG_NLGRP_L2 DNRNG_NLGRP_L2 - __DNRNG_NLGRP_MAX -}; -#define DNRNG_NLGRP_MAX (__DNRNG_NLGRP_MAX - 1) - -#endif /*__LINUX_DECNET_NETFILTER_H*/ diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index 855dffb4c1c3..1e543cf0568c 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -20,7 +20,7 @@ #define NETLINK_CONNECTOR 11 #define NETLINK_NETFILTER 12 /* netfilter subsystem */ #define NETLINK_IP6_FW 13 -#define NETLINK_DNRTMSG 14 /* DECnet routing messages */ +#define NETLINK_DNRTMSG 14 /* DECnet routing messages (obsolete) */ #define NETLINK_KOBJECT_UEVENT 15 /* Kernel messages to userspace */ #define NETLINK_GENERIC 16 /* leave room for NETLINK_DM (DM Events) */ -- cgit v1.2.3 From e38f22c860edb7804b4722ac2332f7c51b9c6b72 Mon Sep 17 00:00:00 2001 From: Arseniy Krasnov Date: Fri, 19 Aug 2022 05:25:19 +0000 Subject: vsock: SO_RCVLOWAT transport set callback This adds transport specific callback for SO_RCVLOWAT, because in some transports it may be difficult to know current available number of bytes ready to read. Thus, when SO_RCVLOWAT is set, transport may reject it. Signed-off-by: Arseniy Krasnov Signed-off-by: Paolo Abeni --- include/net/af_vsock.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index 1c53c4c4d88f..d609a088cb27 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -135,6 +135,7 @@ struct vsock_transport { u64 (*stream_rcvhiwat)(struct vsock_sock *); bool (*stream_is_active)(struct vsock_sock *); bool (*stream_allow)(u32 cid, u32 port); + int (*set_rcvlowat)(struct vsock_sock *vsk, int val); /* SEQ_PACKET. */ ssize_t (*seqpacket_dequeue)(struct vsock_sock *vsk, struct msghdr *msg, -- cgit v1.2.3 From f2fdcf67aceb1a7d5e0661cb7ca95cda68d3014a Mon Sep 17 00:00:00 2001 From: Arseniy Krasnov Date: Fri, 19 Aug 2022 05:36:52 +0000 Subject: vsock: add API call for data ready This adds 'vsock_data_ready()' which must be called by transport to kick sleeping data readers. It checks for SO_RCVLOWAT value before waking user, thus preventing spurious wake ups. Based on 'tcp_data_ready()' logic. Signed-off-by: Arseniy Krasnov Reviewed-by: Stefano Garzarella Signed-off-by: Paolo Abeni --- include/net/af_vsock.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index d609a088cb27..568a87c5e0d0 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -78,6 +78,7 @@ struct vsock_sock { s64 vsock_stream_has_data(struct vsock_sock *vsk); s64 vsock_stream_has_space(struct vsock_sock *vsk); struct sock *vsock_create_connected(struct sock *parent); +void vsock_data_ready(struct sock *sk); /**** TRANSPORT ****/ -- cgit v1.2.3 From 5dc760d1208200daaf49a2ff56a0e7dfa2d80bb2 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 19 Aug 2022 20:48:18 +0300 Subject: net: dsa: use dsa_tree_for_each_cpu_port in dsa_tree_{setup,teardown}_master More logic will be added to dsa_tree_setup_master() and dsa_tree_teardown_master() in upcoming changes. Reduce the indentation by one level in these functions by introducing and using a dedicated iterator for CPU ports of a tree. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Paolo Abeni --- include/net/dsa.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index b902b31bebce..f2ce12860546 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -559,6 +559,10 @@ static inline bool dsa_is_user_port(struct dsa_switch *ds, int p) list_for_each_entry((_dp), &(_dst)->ports, list) \ if (dsa_port_is_user((_dp))) +#define dsa_tree_for_each_cpu_port(_dp, _dst) \ + list_for_each_entry((_dp), &(_dst)->ports, list) \ + if (dsa_port_is_cpu((_dp))) + #define dsa_switch_for_each_port(_dp, _ds) \ list_for_each_entry((_dp), &(_ds)->dst->ports, list) \ if ((_dp)->ds == (_ds)) -- cgit v1.2.3 From 36a0bf44358597dee6947938e8643c61442cab87 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 19 Aug 2022 20:48:19 +0300 Subject: net: mscc: ocelot: set up tag_8021q CPU ports independent of user port affinity This is a partial revert of commit c295f9831f1d ("net: mscc: ocelot: switch from {,un}set to {,un}assign for tag_8021q CPU ports"), because as it turns out, this isn't how tag_8021q CPU ports under a LAG are supposed to work. Under that scenario, all user ports are "assigned" to the single tag_8021q CPU port represented by the logical port corresponding to the bonding interface. So one CPU port in a LAG would have is_dsa_8021q_cpu set to true (the one whose physical port ID is equal to the logical port ID), and the other one to false. In turn, this makes 2 undesirable things happen: (1) PGID_CPU contains only the first physical CPU port, rather than both (2) only the first CPU port will be added to the private VLANs used by ocelot for VLAN-unaware bridging To make the driver behave in the same way for both bonded CPU ports, we need to bring back the old concept of setting up a port as a tag_8021q CPU port, and this is what deals with VLAN membership and PGID_CPU updating. But we also need the CPU port "assignment" (the user to CPU port affinity), and this is what updates the PGID_SRC forwarding rules. All DSA CPU ports are statically configured for tag_8021q mode when the tagging protocol is changed to ocelot-8021q. User ports are "assigned" to one CPU port or the other dynamically (this will be handled by a future change). Signed-off-by: Vladimir Oltean Signed-off-by: Paolo Abeni --- include/soc/mscc/ocelot.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 2edea901bbd5..2a7e18ee5577 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -1024,6 +1024,8 @@ void ocelot_deinit(struct ocelot *ocelot); void ocelot_init_port(struct ocelot *ocelot, int port); void ocelot_deinit_port(struct ocelot *ocelot, int port); +void ocelot_port_setup_dsa_8021q_cpu(struct ocelot *ocelot, int cpu); +void ocelot_port_teardown_dsa_8021q_cpu(struct ocelot *ocelot, int cpu); void ocelot_port_assign_dsa_8021q_cpu(struct ocelot *ocelot, int port, int cpu); void ocelot_port_unassign_dsa_8021q_cpu(struct ocelot *ocelot, int port); u32 ocelot_port_assigned_dsa_8021q_cpu_mask(struct ocelot *ocelot, int port); -- cgit v1.2.3 From 0ba985024ae7db226776725d9aa436b5c1c9fca2 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Sun, 21 Aug 2022 14:35:16 +0300 Subject: flow_dissector: Make 'bpf_flow_dissect' return the bpf program retcode Let 'bpf_flow_dissect' callers know the BPF program's retcode and act accordingly. Signed-off-by: Shmulik Ladkani Signed-off-by: Daniel Borkmann Reviewed-by: Stanislav Fomichev Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20220821113519.116765-2-shmulik.ladkani@gmail.com --- include/linux/skbuff.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ca8afa382bf2..87921996175c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1460,8 +1460,8 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, unsigned int key_count); struct bpf_flow_dissector; -bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, - __be16 proto, int nhoff, int hlen, unsigned int flags); +u32 bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, + __be16 proto, int nhoff, int hlen, unsigned int flags); bool __skb_flow_dissect(const struct net *net, const struct sk_buff *skb, -- cgit v1.2.3 From 91350fe152930c0d61a362af68272526490efea5 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Sun, 21 Aug 2022 14:35:17 +0300 Subject: bpf, flow_dissector: Introduce BPF_FLOW_DISSECTOR_CONTINUE retcode for bpf progs Currently, attaching BPF_PROG_TYPE_FLOW_DISSECTOR programs completely replaces the flow-dissector logic with custom dissection logic. This forces implementors to write programs that handle dissection for any flows expected in the namespace. It makes sense for flow-dissector BPF programs to just augment the dissector with custom logic (e.g. dissecting certain flows or custom protocols), while enjoying the broad capabilities of the standard dissector for any other traffic. Introduce BPF_FLOW_DISSECTOR_CONTINUE retcode. Flow-dissector BPF programs may return this to indicate no dissection was made, and fallback to the standard dissector is requested. Signed-off-by: Shmulik Ladkani Signed-off-by: Daniel Borkmann Reviewed-by: Stanislav Fomichev Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20220821113519.116765-3-shmulik.ladkani@gmail.com --- include/uapi/linux/bpf.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 934a2a8beb87..7f87012b012e 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5861,6 +5861,11 @@ enum bpf_ret_code { * represented by BPF_REDIRECT above). */ BPF_LWT_REROUTE = 128, + /* BPF_FLOW_DISSECTOR_CONTINUE: used by BPF_PROG_TYPE_FLOW_DISSECTOR + * to indicate that no custom dissection was performed, and + * fallback to standard dissector is requested. + */ + BPF_FLOW_DISSECTOR_CONTINUE = 129, }; struct bpf_sock { -- cgit v1.2.3 From dea6a4e17013382b20717664ebf3d7cc405e0952 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 23 Aug 2022 15:25:51 -0700 Subject: bpf: Introduce cgroup_{common,current}_func_proto Split cgroup_base_func_proto into the following: * cgroup_common_func_proto - common helpers for all cgroup hooks * cgroup_current_func_proto - common helpers for all cgroup hooks running in the process context (== have meaningful 'current'). Move bpf_{g,s}et_retval and other cgroup-related helpers into kernel/bpf/cgroup.c so they closer to where they are being used. Signed-off-by: Stanislav Fomichev Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220823222555.523590-2-sdf@google.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf-cgroup.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 2bd1b5f8de9b..57e9e109257e 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -414,6 +414,11 @@ int cgroup_bpf_prog_detach(const union bpf_attr *attr, int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); int cgroup_bpf_prog_query(const union bpf_attr *attr, union bpf_attr __user *uattr); + +const struct bpf_func_proto * +cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); +const struct bpf_func_proto * +cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); #else static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } @@ -444,6 +449,18 @@ static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, return -EINVAL; } +static inline const struct bpf_func_proto * +cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + return NULL; +} + +static inline const struct bpf_func_proto * +cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + return NULL; +} + static inline int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *map) { return 0; } static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc( -- cgit v1.2.3 From bed89185af0de0d417e29ca1798df50f161b0231 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 23 Aug 2022 15:25:52 -0700 Subject: bpf: Use cgroup_{common,current}_func_proto in more hooks The following hooks are per-cgroup hooks but they are not using cgroup_{common,current}_func_proto, fix it: * BPF_PROG_TYPE_CGROUP_SKB (cg_skb) * BPF_PROG_TYPE_CGROUP_SOCK_ADDR (cg_sock_addr) * BPF_PROG_TYPE_CGROUP_SOCK (cg_sock) * BPF_PROG_TYPE_LSM+BPF_LSM_CGROUP Also: * move common func_proto's into cgroup func_proto handlers * make sure bpf_{g,s}et_retval are not accessible from recvmsg, getpeername and getsockname (return/errno is ignored in these places) * as a side effect, expose get_current_pid_tgid, get_current_comm_proto, get_current_ancestor_cgroup_id, get_cgroup_classid to more cgroup hooks Acked-by: Martin KaFai Lau Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220823222555.523590-3-sdf@google.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 39bd36359c1e..99fc7a64564f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2375,6 +2375,7 @@ extern const struct bpf_func_proto bpf_sock_map_update_proto; extern const struct bpf_func_proto bpf_sock_hash_update_proto; extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto; extern const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto; +extern const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto; extern const struct bpf_func_proto bpf_msg_redirect_hash_proto; extern const struct bpf_func_proto bpf_msg_redirect_map_proto; extern const struct bpf_func_proto bpf_sk_redirect_hash_proto; -- cgit v1.2.3 From 2172fb8007eaafbef18563afb6c1ae5a976bf787 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 23 Aug 2022 15:25:54 -0700 Subject: bpf: update bpf_{g,s}et_retval documentation * replace 'syscall' with 'upper layers', still mention that it's being exported via syscall errno * describe what happens in set_retval(-EPERM) + return 1 * describe what happens with bind's 'return 3' Acked-by: Martin KaFai Lau Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220823222555.523590-5-sdf@google.com Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7f87012b012e..644600dbb114 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5085,17 +5085,29 @@ union bpf_attr { * * int bpf_get_retval(void) * Description - * Get the syscall's return value that will be returned to userspace. + * Get the BPF program's return value that will be returned to the upper layers. * - * This helper is currently supported by cgroup programs only. + * This helper is currently supported by cgroup programs and only by the hooks + * where BPF program's return value is returned to the userspace via errno. * Return - * The syscall's return value. + * The BPF program's return value. * * int bpf_set_retval(int retval) * Description - * Set the syscall's return value that will be returned to userspace. + * Set the BPF program's return value that will be returned to the upper layers. + * + * This helper is currently supported by cgroup programs and only by the hooks + * where BPF program's return value is returned to the userspace via errno. + * + * Note that there is the following corner case where the program exports an error + * via bpf_set_retval but signals success via 'return 1': + * + * bpf_set_retval(-EPERM); + * return 1; + * + * In this case, the BPF program's return value will use helper's -EPERM. This + * still holds true for cgroup/bind{4,6} which supports extra 'return 3' success case. * - * This helper is currently supported by cgroup programs only. * Return * 0 on success, or a negative error in case of failure. * -- cgit v1.2.3 From 30b6055428a90cc52d4add164df12b94ab07c3fd Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 19 Aug 2022 13:02:20 -0700 Subject: net: improve and fix netlink kdoc Subsequent patch will render the kdoc from include/uapi/linux/netlink.h into Documentation. We need to fix the warnings. While at it move the comments on struct nlmsghdr to a proper kdoc comment. Link: https://lore.kernel.org/r/20220819200221.422801-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/netlink.h | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index 1e543cf0568c..e0ab261ceca2 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -41,12 +41,20 @@ struct sockaddr_nl { __u32 nl_groups; /* multicast groups mask */ }; +/** + * struct nlmsghdr - fixed format metadata header of Netlink messages + * @nlmsg_len: Length of message including header + * @nlmsg_type: Message content type + * @nlmsg_flags: Additional flags + * @nlmsg_seq: Sequence number + * @nlmsg_pid: Sending process port ID + */ struct nlmsghdr { - __u32 nlmsg_len; /* Length of message including header */ - __u16 nlmsg_type; /* Message content */ - __u16 nlmsg_flags; /* Additional flags */ - __u32 nlmsg_seq; /* Sequence number */ - __u32 nlmsg_pid; /* Sending process port ID */ + __u32 nlmsg_len; + __u16 nlmsg_type; + __u16 nlmsg_flags; + __u32 nlmsg_seq; + __u32 nlmsg_pid; }; /* Flags values */ @@ -337,6 +345,9 @@ enum netlink_attribute_type { * bitfield32 type (U32) * @NL_POLICY_TYPE_ATTR_MASK: mask of valid bits for unsigned integers (U64) * @NL_POLICY_TYPE_ATTR_PAD: pad attribute for 64-bit alignment + * + * @__NL_POLICY_TYPE_ATTR_MAX: number of attributes + * @NL_POLICY_TYPE_ATTR_MAX: highest attribute number */ enum netlink_policy_type_attr { NL_POLICY_TYPE_ATTR_UNSPEC, -- cgit v1.2.3 From c205cc7534a97f2d6fbd2a23a94ed7c036c6e2aa Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sun, 21 Aug 2022 13:18:58 +0800 Subject: net: skb: prevent the split of kfree_skb_reason() by gcc Sometimes, gcc will optimize the function by spliting it to two or more functions. In this case, kfree_skb_reason() is splited to kfree_skb_reason and kfree_skb_reason.part.0. However, the function/tracepoint trace_kfree_skb() in it needs the return address of kfree_skb_reason(). This split makes the call chains becomes: kfree_skb_reason() -> kfree_skb_reason.part.0 -> trace_kfree_skb() which makes the return address that passed to trace_kfree_skb() be kfree_skb(). Therefore, introduce '__fix_address', which is the combination of '__noclone' and 'noinline', and apply it to kfree_skb_reason() to prevent to from being splited or made inline. (Is it better to simply apply '__noclone oninline' to kfree_skb_reason? I'm thinking maybe other functions have the same problems) Meanwhile, wrap 'skb_unref()' with 'unlikely()', as the compiler thinks it is likely return true and splits kfree_skb_reason(). Signed-off-by: Menglong Dong Signed-off-by: David S. Miller --- include/linux/compiler_attributes.h | 7 +++++++ include/linux/skbuff.h | 3 ++- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h index 445e80517cab..fc93c9488c76 100644 --- a/include/linux/compiler_attributes.h +++ b/include/linux/compiler_attributes.h @@ -371,4 +371,11 @@ */ #define __weak __attribute__((__weak__)) +/* + * Used by functions that use '__builtin_return_address'. These function + * don't want to be splited or made inline, which can make + * the '__builtin_return_address' get unexpected address. + */ +#define __fix_address noinline __noclone + #endif /* __LINUX_COMPILER_ATTRIBUTES_H */ diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ca8afa382bf2..b51d07a727c9 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1195,7 +1195,8 @@ static inline bool skb_unref(struct sk_buff *skb) return true; } -void kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason); +void __fix_address +kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason); /** * kfree_skb - free an sk_buff with 'NOT_SPECIFIED' reason -- cgit v1.2.3 From 9d9d00ac29d0ef7ce426964de46fa6b380357d0a Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 23 Aug 2022 03:31:25 +0200 Subject: bpf: Fix reference state management for synchronous callbacks Currently, verifier verifies callback functions (sync and async) as if they will be executed once, (i.e. it explores execution state as if the function was being called once). The next insn to explore is set to start of subprog and the exit from nested frame is handled using curframe > 0 and prepare_func_exit. In case of async callback it uses a customized variant of push_stack simulating a kind of branch to set up custom state and execution context for the async callback. While this approach is simple and works when callback really will be executed only once, it is unsafe for all of our current helpers which are for_each style, i.e. they execute the callback multiple times. A callback releasing acquired references of the caller may do so multiple times, but currently verifier sees it as one call inside the frame, which then returns to caller. Hence, it thinks it released some reference that the cb e.g. got access through callback_ctx (register filled inside cb from spilled typed register on stack). Similarly, it may see that an acquire call is unpaired inside the callback, so the caller will copy the reference state of callback and then will have to release the register with new ref_obj_ids. But again, the callback may execute multiple times, but the verifier will only account for acquired references for a single symbolic execution of the callback, which will cause leaks. Note that for async callback case, things are different. While currently we have bpf_timer_set_callback which only executes it once, even for multiple executions it would be safe, as reference state is NULL and check_reference_leak would force program to release state before BPF_EXIT. The state is also unaffected by analysis for the caller frame. Hence async callback is safe. Since we want the reference state to be accessible, e.g. for pointers loaded from stack through callback_ctx's PTR_TO_STACK, we still have to copy caller's reference_state to callback's bpf_func_state, but we enforce that whatever references it adds to that reference_state has been released before it hits BPF_EXIT. This requires introducing a new callback_ref member in the reference state to distinguish between caller vs callee references. Hence, check_reference_leak now errors out if it sees we are in callback_fn and we have not released callback_ref refs. Since there can be multiple nested callbacks, like frame 0 -> cb1 -> cb2 etc. we need to also distinguish between whether this particular ref belongs to this callback frame or parent, and only error for our own, so we store state->frameno (which is always non-zero for callbacks). In short, callbacks can read parent reference_state, but cannot mutate it, to be able to use pointers acquired by the caller. They must only undo their changes (by releasing their own acquired_refs before BPF_EXIT) on top of caller reference_state before returning (at which point the caller and callback state will match anyway, so no need to copy it back to caller). Fixes: 69c087ba6225 ("bpf: Add bpf_for_each_map_elem() helper") Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220823013125.24938-1-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 2e3bad8640dc..1fdddbf3546b 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -212,6 +212,17 @@ struct bpf_reference_state { * is used purely to inform the user of a reference leak. */ int insn_idx; + /* There can be a case like: + * main (frame 0) + * cb (frame 1) + * func (frame 3) + * cb (frame 4) + * Hence for frame 4, if callback_ref just stored boolean, it would be + * impossible to distinguish nested callback refs. Hence store the + * frameno and compare that to callback_ref in check_reference_leak when + * exiting a callback function. + */ + int callback_ref; }; /* state of the program: -- cgit v1.2.3 From 0bf73255d3a3cf3b0416e95f2c9f7c53095c2e1a Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Wed, 24 Aug 2022 09:36:21 +0800 Subject: netlink: fix some kernel-doc comments Modify the comment of input parameter of nlmsg_ and nla_ function. Signed-off-by: Zhengchao Shao Link: https://lore.kernel.org/r/20220824013621.365103-1-shaozhengchao@huawei.com Signed-off-by: Jakub Kicinski --- include/net/netlink.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netlink.h b/include/net/netlink.h index 7a2a9d3144ba..e658d18afa67 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -741,6 +741,7 @@ static inline int __nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen, * @hdrlen: length of family specific header * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected + * @policy: validation policy * @extack: extended ACK report struct * * See nla_parse() @@ -760,6 +761,7 @@ static inline int nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen, * @hdrlen: length of family specific header * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected + * @policy: validation policy * @extack: extended ACK report struct * * See nla_parse_deprecated() @@ -779,6 +781,7 @@ static inline int nlmsg_parse_deprecated(const struct nlmsghdr *nlh, int hdrlen, * @hdrlen: length of family specific header * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected + * @policy: validation policy * @extack: extended ACK report struct * * See nla_parse_deprecated_strict() @@ -814,7 +817,6 @@ static inline struct nlattr *nlmsg_find_attr(const struct nlmsghdr *nlh, * @len: length of attribute stream * @maxtype: maximum attribute type to be expected * @policy: validation policy - * @validate: validation strictness * @extack: extended ACK report struct * * Validates all attributes in the specified attribute stream against the -- cgit v1.2.3 From 28044fc1d4953b07acec0da4d2fc4784c57ea6fb Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Mon, 22 Aug 2022 11:10:21 -0700 Subject: net: Add a bhash2 table hashed by port and address The current bind hashtable (bhash) is hashed by port only. In the socket bind path, we have to check for bind conflicts by traversing the specified port's inet_bind_bucket while holding the hashbucket's spinlock (see inet_csk_get_port() and inet_csk_bind_conflict()). In instances where there are tons of sockets hashed to the same port at different addresses, the bind conflict check is time-intensive and can cause softirq cpu lockups, as well as stops new tcp connections since __inet_inherit_port() also contests for the spinlock. This patch adds a second bind table, bhash2, that hashes by port and sk->sk_rcv_saddr (ipv4) and sk->sk_v6_rcv_saddr (ipv6). Searching the bhash2 table leads to significantly faster conflict resolution and less time holding the hashbucket spinlock. Please note a few things: * There can be the case where the a socket's address changes after it has been bound. There are two cases where this happens: 1) The case where there is a bind() call on INADDR_ANY (ipv4) or IPV6_ADDR_ANY (ipv6) and then a connect() call. The kernel will assign the socket an address when it handles the connect() 2) In inet_sk_reselect_saddr(), which is called when rebuilding the sk header and a few pre-conditions are met (eg rerouting fails). In these two cases, we need to update the bhash2 table by removing the entry for the old address, and add a new entry reflecting the updated address. * The bhash2 table must have its own lock, even though concurrent accesses on the same port are protected by the bhash lock. Bhash2 must have its own lock to protect against cases where sockets on different ports hash to different bhash hashbuckets but to the same bhash2 hashbucket. This brings up a few stipulations: 1) When acquiring both the bhash and the bhash2 lock, the bhash2 lock will always be acquired after the bhash lock and released before the bhash lock is released. 2) There are no nested bhash2 hashbucket locks. A bhash2 lock is always acquired+released before another bhash2 lock is acquired+released. * The bhash table cannot be superseded by the bhash2 table because for bind requests on INADDR_ANY (ipv4) or IPV6_ADDR_ANY (ipv6), every socket bound to that port must be checked for a potential conflict. The bhash table is the only source of port->socket associations. Signed-off-by: Joanne Koong Signed-off-by: Jakub Kicinski --- include/net/inet_connection_sock.h | 3 ++ include/net/inet_hashtables.h | 80 +++++++++++++++++++++++++++++++++++++- include/net/sock.h | 14 +++++++ 3 files changed, 95 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index ee88f0f1350f..c2b15f7e5516 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -25,6 +25,7 @@ #undef INET_CSK_CLEAR_TIMERS struct inet_bind_bucket; +struct inet_bind2_bucket; struct tcp_congestion_ops; /* @@ -57,6 +58,7 @@ struct inet_connection_sock_af_ops { * * @icsk_accept_queue: FIFO of established children * @icsk_bind_hash: Bind node + * @icsk_bind2_hash: Bind node in the bhash2 table * @icsk_timeout: Timeout * @icsk_retransmit_timer: Resend (no ack) * @icsk_rto: Retransmit timeout @@ -83,6 +85,7 @@ struct inet_connection_sock { struct inet_sock icsk_inet; struct request_sock_queue icsk_accept_queue; struct inet_bind_bucket *icsk_bind_hash; + struct inet_bind2_bucket *icsk_bind2_hash; unsigned long icsk_timeout; struct timer_list icsk_retransmit_timer; struct timer_list icsk_delack_timer; diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index e9cf2157ed8a..44a419b9e3d5 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -23,6 +23,7 @@ #include #include +#include #include #include #include @@ -90,7 +91,28 @@ struct inet_bind_bucket { struct hlist_head owners; }; -static inline struct net *ib_net(struct inet_bind_bucket *ib) +struct inet_bind2_bucket { + possible_net_t ib_net; + int l3mdev; + unsigned short port; + union { +#if IS_ENABLED(CONFIG_IPV6) + struct in6_addr v6_rcv_saddr; +#endif + __be32 rcv_saddr; + }; + /* Node in the bhash2 inet_bind_hashbucket chain */ + struct hlist_node node; + /* List of sockets hashed to this bucket */ + struct hlist_head owners; +}; + +static inline struct net *ib_net(const struct inet_bind_bucket *ib) +{ + return read_pnet(&ib->ib_net); +} + +static inline struct net *ib2_net(const struct inet_bind2_bucket *ib) { return read_pnet(&ib->ib_net); } @@ -133,7 +155,14 @@ struct inet_hashinfo { * TCP hash as well as the others for fast bind/connect. */ struct kmem_cache *bind_bucket_cachep; + /* This bind table is hashed by local port */ struct inet_bind_hashbucket *bhash; + struct kmem_cache *bind2_bucket_cachep; + /* This bind table is hashed by local port and sk->sk_rcv_saddr (ipv4) + * or sk->sk_v6_rcv_saddr (ipv6). This 2nd bind table is used + * primarily for expediting bind conflict resolution. + */ + struct inet_bind_hashbucket *bhash2; unsigned int bhash_size; /* The 2nd listener table hashed by local port and address */ @@ -182,14 +211,61 @@ inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net, void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb); +bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, + const struct net *net, unsigned short port, + int l3mdev); + +struct inet_bind2_bucket * +inet_bind2_bucket_create(struct kmem_cache *cachep, struct net *net, + struct inet_bind_hashbucket *head, + unsigned short port, int l3mdev, + const struct sock *sk); + +void inet_bind2_bucket_destroy(struct kmem_cache *cachep, + struct inet_bind2_bucket *tb); + +struct inet_bind2_bucket * +inet_bind2_bucket_find(const struct inet_bind_hashbucket *head, + const struct net *net, + unsigned short port, int l3mdev, + const struct sock *sk); + +bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, + const struct net *net, unsigned short port, + int l3mdev, const struct sock *sk); + static inline u32 inet_bhashfn(const struct net *net, const __u16 lport, const u32 bhash_size) { return (lport + net_hash_mix(net)) & (bhash_size - 1); } +static inline struct inet_bind_hashbucket * +inet_bhashfn_portaddr(const struct inet_hashinfo *hinfo, const struct sock *sk, + const struct net *net, unsigned short port) +{ + u32 hash; + +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) + hash = ipv6_portaddr_hash(net, &sk->sk_v6_rcv_saddr, port); + else +#endif + hash = ipv4_portaddr_hash(net, sk->sk_rcv_saddr, port); + return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)]; +} + +struct inet_bind_hashbucket * +inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port); + +/* This should be called whenever a socket's sk_rcv_saddr (ipv4) or + * sk_v6_rcv_saddr (ipv6) changes after it has been binded. The socket's + * rcv_saddr field should already have been updated when this is called. + */ +int inet_bhash2_update_saddr(struct inet_bind_hashbucket *prev_saddr, struct sock *sk); + void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, - const unsigned short snum); + struct inet_bind2_bucket *tb2, unsigned short port); /* Caller must disable local BH processing. */ int __inet_inherit_port(const struct sock *sk, struct sock *child); diff --git a/include/net/sock.h b/include/net/sock.h index d08cfe190a78..ca469980e006 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -348,6 +348,7 @@ struct sk_filter; * @sk_txtime_report_errors: set report errors mode for SO_TXTIME * @sk_txtime_unused: unused txtime flags * @ns_tracker: tracker for netns reference + * @sk_bind2_node: bind node in the bhash2 table */ struct sock { /* @@ -537,6 +538,7 @@ struct sock { #endif struct rcu_head sk_rcu; netns_tracker ns_tracker; + struct hlist_node sk_bind2_node; }; enum sk_pacing { @@ -870,6 +872,16 @@ static inline void sk_add_bind_node(struct sock *sk, hlist_add_head(&sk->sk_bind_node, list); } +static inline void __sk_del_bind2_node(struct sock *sk) +{ + __hlist_del(&sk->sk_bind2_node); +} + +static inline void sk_add_bind2_node(struct sock *sk, struct hlist_head *list) +{ + hlist_add_head(&sk->sk_bind2_node, list); +} + #define sk_for_each(__sk, list) \ hlist_for_each_entry(__sk, list, sk_node) #define sk_for_each_rcu(__sk, list) \ @@ -887,6 +899,8 @@ static inline void sk_add_bind_node(struct sock *sk, hlist_for_each_entry_safe(__sk, tmp, list, sk_node) #define sk_for_each_bound(__sk, list) \ hlist_for_each_entry(__sk, list, sk_bind_node) +#define sk_for_each_bound_bhash2(__sk, list) \ + hlist_for_each_entry(__sk, list, sk_bind2_node) /** * sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset -- cgit v1.2.3 From 35ffb66547295c72650978f9c28e670e014d0957 Mon Sep 17 00:00:00 2001 From: Richard Gobert Date: Tue, 23 Aug 2022 09:10:49 +0200 Subject: net: gro: skb_gro_header helper function Introduce a simple helper function to replace a common pattern. When accessing the GRO header, we fetch the pointer from frag0, then test its validity and fetch it from the skb when necessary. This leads to the pattern skb_gro_header_fast -> skb_gro_header_hard -> skb_gro_header_slow recurring many times throughout GRO code. This patch replaces these patterns with a single inlined function call, improving code readability. Signed-off-by: Richard Gobert Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20220823071034.GA56142@debian Signed-off-by: Paolo Abeni --- include/net/gro.h | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/net/gro.h b/include/net/gro.h index 867656b0739c..5bf15c212434 100644 --- a/include/net/gro.h +++ b/include/net/gro.h @@ -160,6 +160,17 @@ static inline void *skb_gro_header_slow(struct sk_buff *skb, unsigned int hlen, return skb->data + offset; } +static inline void *skb_gro_header(struct sk_buff *skb, + unsigned int hlen, unsigned int offset) +{ + void *ptr; + + ptr = skb_gro_header_fast(skb, offset); + if (skb_gro_header_hard(skb, hlen)) + ptr = skb_gro_header_slow(skb, hlen, offset); + return ptr; +} + static inline void *skb_gro_network_header(struct sk_buff *skb) { return (NAPI_GRO_CB(skb)->frag0 ?: skb->data) + @@ -301,12 +312,9 @@ static inline void *skb_gro_remcsum_process(struct sk_buff *skb, void *ptr, return ptr; } - ptr = skb_gro_header_fast(skb, off); - if (skb_gro_header_hard(skb, off + plen)) { - ptr = skb_gro_header_slow(skb, off + plen, off); - if (!ptr) - return NULL; - } + ptr = skb_gro_header(skb, off + plen, off); + if (!ptr) + return NULL; delta = remcsum_adjust(ptr + hdrlen, NAPI_GRO_CB(skb)->csum, start, offset); @@ -329,12 +337,9 @@ static inline void skb_gro_remcsum_cleanup(struct sk_buff *skb, if (!grc->delta) return; - ptr = skb_gro_header_fast(skb, grc->offset); - if (skb_gro_header_hard(skb, grc->offset + sizeof(u16))) { - ptr = skb_gro_header_slow(skb, plen, grc->offset); - if (!ptr) - return; - } + ptr = skb_gro_header(skb, plen, grc->offset); + if (!ptr) + return; remcsum_unadjust((__sum16 *)ptr, grc->delta); } @@ -405,9 +410,7 @@ static inline struct udphdr *udp_gro_udphdr(struct sk_buff *skb) off = skb_gro_offset(skb); hlen = off + sizeof(*uh); - uh = skb_gro_header_fast(skb, off); - if (skb_gro_header_hard(skb, hlen)) - uh = skb_gro_header_slow(skb, hlen, off); + uh = skb_gro_header(skb, hlen, off); return uh; } -- cgit v1.2.3 From 9d2bb84d54a40361c7008b33a60dc24f78724746 Mon Sep 17 00:00:00 2001 From: Shaul Triebitz Date: Tue, 2 Aug 2022 15:22:42 +0300 Subject: wifi: cfg80211: add link id to txq params The Tx queue parameters are per link, so add the link ID from nl80211 parameters to the API. While at it, lock the wdev when calling into the driver so it (and we) can check the link ID appropriately. Signed-off-by: Shaul Triebitz Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 908d58393484..1d393e09a632 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2316,6 +2316,7 @@ struct ocb_setup { * @cwmax: Maximum contention window [a value of the form 2^n-1 in the range * 1..32767] * @aifs: Arbitration interframe space [0..255] + * @link_id: link_id or -1 for non-MLD */ struct ieee80211_txq_params { enum nl80211_ac ac; @@ -2323,6 +2324,7 @@ struct ieee80211_txq_params { u16 cwmin; u16 cwmax; u8 aifs; + int link_id; }; /** -- cgit v1.2.3 From e7a7b84e33178db4a839c5e1773247be17597c1f Mon Sep 17 00:00:00 2001 From: Veerendranath Jakkam Date: Sat, 30 Jul 2022 10:56:43 +0530 Subject: wifi: cfg80211: Add link_id parameter to various key operations for MLO Add support for various key operations on MLD by adding new parameter link_id. Pass the link_id received from userspace to driver for add_key, get_key, del_key, set_default_key, set_default_mgmt_key and set_default_beacon_key to support configuring keys specific to each MLO link. Userspace must not specify link ID for MLO pairwise key since it is common for all the MLO links. Signed-off-by: Veerendranath Jakkam Link: https://lore.kernel.org/r/20220730052643.1959111-4-quic_vjakkam@quicinc.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 37 +++++++++++++++++++++++++------------ include/uapi/linux/nl80211.h | 14 +++++++++++--- 2 files changed, 36 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 1d393e09a632..ffbc65a9b00b 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -3931,22 +3931,33 @@ struct mgmt_frame_regs { * @del_intf_link: Remove an MLO link from the given interface. * * @add_key: add a key with the given parameters. @mac_addr will be %NULL - * when adding a group key. + * when adding a group key. @link_id will be -1 for non-MLO connection. + * For MLO connection, @link_id will be >= 0 for group key and -1 for + * pairwise key, @mac_addr will be peer's MLD address for MLO pairwise key. * * @get_key: get information about the key with the given parameters. * @mac_addr will be %NULL when requesting information for a group * key. All pointers given to the @callback function need not be valid * after it returns. This function should return an error if it is * not possible to retrieve the key, -ENOENT if it doesn't exist. + * @link_id will be -1 for non-MLO connection. For MLO connection, + * @link_id will be >= 0 for group key and -1 for pairwise key, @mac_addr + * will be peer's MLD address for MLO pairwise key. * * @del_key: remove a key given the @mac_addr (%NULL for a group key) - * and @key_index, return -ENOENT if the key doesn't exist. + * and @key_index, return -ENOENT if the key doesn't exist. @link_id will + * be -1 for non-MLO connection. For MLO connection, @link_id will be >= 0 + * for group key and -1 for pairwise key, @mac_addr will be peer's MLD + * address for MLO pairwise key. * - * @set_default_key: set the default key on an interface + * @set_default_key: set the default key on an interface. @link_id will be >= 0 + * for MLO connection and -1 for non-MLO connection. * - * @set_default_mgmt_key: set the default management frame key on an interface + * @set_default_mgmt_key: set the default management frame key on an interface. + * @link_id will be >= 0 for MLO connection and -1 for non-MLO connection. * - * @set_default_beacon_key: set the default Beacon frame key on an interface + * @set_default_beacon_key: set the default Beacon frame key on an interface. + * @link_id will be >= 0 for MLO connection and -1 for non-MLO connection. * * @set_rekey_data: give the data necessary for GTK rekeying to the driver * @@ -4295,22 +4306,24 @@ struct cfg80211_ops { unsigned int link_id); int (*add_key)(struct wiphy *wiphy, struct net_device *netdev, - u8 key_index, bool pairwise, const u8 *mac_addr, - struct key_params *params); + int link_id, u8 key_index, bool pairwise, + const u8 *mac_addr, struct key_params *params); int (*get_key)(struct wiphy *wiphy, struct net_device *netdev, - u8 key_index, bool pairwise, const u8 *mac_addr, - void *cookie, + int link_id, u8 key_index, bool pairwise, + const u8 *mac_addr, void *cookie, void (*callback)(void *cookie, struct key_params*)); int (*del_key)(struct wiphy *wiphy, struct net_device *netdev, - u8 key_index, bool pairwise, const u8 *mac_addr); + int link_id, u8 key_index, bool pairwise, + const u8 *mac_addr); int (*set_default_key)(struct wiphy *wiphy, - struct net_device *netdev, + struct net_device *netdev, int link_id, u8 key_index, bool unicast, bool multicast); int (*set_default_mgmt_key)(struct wiphy *wiphy, - struct net_device *netdev, + struct net_device *netdev, int link_id, u8 key_index); int (*set_default_beacon_key)(struct wiphy *wiphy, struct net_device *netdev, + int link_id, u8 key_index); int (*start_ap)(struct wiphy *wiphy, struct net_device *dev, diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index ffb7c573e299..573db20403dc 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -377,14 +377,22 @@ * the non-transmitting interfaces are deleted as well. * * @NL80211_CMD_GET_KEY: Get sequence counter information for a key specified - * by %NL80211_ATTR_KEY_IDX and/or %NL80211_ATTR_MAC. + * by %NL80211_ATTR_KEY_IDX and/or %NL80211_ATTR_MAC. %NL80211_ATTR_MAC + * represents peer's MLD address for MLO pairwise key. For MLO group key, + * the link is identified by %NL80211_ATTR_MLO_LINK_ID. * @NL80211_CMD_SET_KEY: Set key attributes %NL80211_ATTR_KEY_DEFAULT, * %NL80211_ATTR_KEY_DEFAULT_MGMT, or %NL80211_ATTR_KEY_THRESHOLD. + * For MLO connection, the link to set default key is identified by + * %NL80211_ATTR_MLO_LINK_ID. * @NL80211_CMD_NEW_KEY: add a key with given %NL80211_ATTR_KEY_DATA, * %NL80211_ATTR_KEY_IDX, %NL80211_ATTR_MAC, %NL80211_ATTR_KEY_CIPHER, - * and %NL80211_ATTR_KEY_SEQ attributes. + * and %NL80211_ATTR_KEY_SEQ attributes. %NL80211_ATTR_MAC represents + * peer's MLD address for MLO pairwise key. The link to add MLO + * group key is identified by %NL80211_ATTR_MLO_LINK_ID. * @NL80211_CMD_DEL_KEY: delete a key identified by %NL80211_ATTR_KEY_IDX - * or %NL80211_ATTR_MAC. + * or %NL80211_ATTR_MAC. %NL80211_ATTR_MAC represents peer's MLD address + * for MLO pairwise key. The link to delete group key is identified by + * %NL80211_ATTR_MLO_LINK_ID. * * @NL80211_CMD_GET_BEACON: (not used) * @NL80211_CMD_SET_BEACON: change the beacon on an access point interface -- cgit v1.2.3 From ccdde7c74ffd7e8bdd3cf685bbfa41231c8e3131 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 17 Aug 2022 11:17:01 +0200 Subject: wifi: mac80211: properly implement MLO key handling Implement key installation and lookup (on TX and RX) for MLO, so we can use multiple GTKs/IGTKs/BIGTKs. Co-authored-by: Ilan Peer Signed-off-by: Ilan Peer Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index f198af600b5e..7951400e509b 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1975,6 +1975,7 @@ enum ieee80211_key_flags { * - Temporal Authenticator Rx MIC Key (64 bits) * @icv_len: The ICV length for this key type * @iv_len: The IV length for this key type + * @link_id: the link ID for MLO, or -1 for non-MLO or pairwise keys */ struct ieee80211_key_conf { atomic64_t tx_pn; @@ -1984,6 +1985,7 @@ struct ieee80211_key_conf { u8 hw_key_idx; s8 keyidx; u16 flags; + s8 link_id; u8 keylen; u8 key[]; }; -- cgit v1.2.3 From ea9d807b56428d65cf43030cbd7ae5a580077147 Mon Sep 17 00:00:00 2001 From: Vasanthakumar Thiagarajan Date: Wed, 17 Aug 2022 16:12:12 +0530 Subject: wifi: mac80211: add link information in ieee80211_rx_status In MLO, when the address translation from link to MLD is done in fw/hw, it is necessary to be able to have some information on the link on which the frame has been received. Extend the rx API to include link_id and a valid flag in ieee80211_rx_status. Also make chanes to mac80211 rx APIs to make use of the reported link_id after sanity checks. Signed-off-by: Vasanthakumar Thiagarajan Link: https://lore.kernel.org/r/20220817104213.2531-2-quic_vthiagar@quicinc.com Signed-off-by: Johannes Berg --- include/net/mac80211.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 7951400e509b..b38927e33d10 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1480,6 +1480,10 @@ enum mac80211_rx_encoding { * each A-MPDU but the same for each subframe within one A-MPDU * @ampdu_delimiter_crc: A-MPDU delimiter CRC * @zero_length_psdu_type: radiotap type of the 0-length PSDU + * @link_valid: if the link which is identified by @link_id is valid. This flag + * is set only when connection is MLO. + * @link_id: id of the link used to receive the packet. This is used along with + * @link_valid. */ struct ieee80211_rx_status { u64 mactime; @@ -1504,6 +1508,7 @@ struct ieee80211_rx_status { s8 chain_signal[IEEE80211_MAX_CHAINS]; u8 ampdu_delimiter_crc; u8 zero_length_psdu_type; + u8 link_valid:1, link_id:4; }; static inline u32 -- cgit v1.2.3 From ea5cba269fb1fe22b84f4d01bb3d56320e6ffa3e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 16 Aug 2022 11:26:23 +0200 Subject: wifi: cfg80211/mac80211: check EHT capability size correctly For AP/non-AP the EHT MCS/NSS subfield size differs, the 4-octet subfield is only used for 20 MHz-only non-AP STA. Pass an argument around everywhere to be able to parse it properly. Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 55e6f4ad0ca6..6f70394417ac 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2886,7 +2886,8 @@ ieee80211_he_spr_size(const u8 *he_spr_ie) /* Calculate 802.11be EHT capabilities IE Tx/Rx EHT MCS NSS Support Field size */ static inline u8 ieee80211_eht_mcs_nss_size(const struct ieee80211_he_cap_elem *he_cap, - const struct ieee80211_eht_cap_elem_fixed *eht_cap) + const struct ieee80211_eht_cap_elem_fixed *eht_cap, + bool from_ap) { u8 count = 0; @@ -2907,7 +2908,10 @@ ieee80211_eht_mcs_nss_size(const struct ieee80211_he_cap_elem *he_cap, if (eht_cap->phy_cap_info[0] & IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ) count += 3; - return count ? count : 4; + if (count) + return count; + + return from_ap ? 3 : 4; } /* 802.11be EHT PPE Thresholds */ @@ -2943,7 +2947,8 @@ ieee80211_eht_ppe_size(u16 ppe_thres_hdr, const u8 *phy_cap_info) } static inline bool -ieee80211_eht_capa_size_ok(const u8 *he_capa, const u8 *data, u8 len) +ieee80211_eht_capa_size_ok(const u8 *he_capa, const u8 *data, u8 len, + bool from_ap) { const struct ieee80211_eht_cap_elem_fixed *elem = (const void *)data; u8 needed = sizeof(struct ieee80211_eht_cap_elem_fixed); @@ -2952,7 +2957,8 @@ ieee80211_eht_capa_size_ok(const u8 *he_capa, const u8 *data, u8 len) return false; needed += ieee80211_eht_mcs_nss_size((const void *)he_capa, - (const void *)data); + (const void *)data, + from_ap); if (len < needed) return false; -- cgit v1.2.3 From c73993b865bf0b2ea1cbb3e9616f18a6508fc49c Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 19 Aug 2022 13:12:38 +0200 Subject: wifi: mac80211: maintain link_id in link_sta To helper drivers if they e.g. have a lookup of the link_sta pointer, add the link ID to the link_sta structure. Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index b38927e33d10..ffd0ebbff294 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2135,6 +2135,7 @@ struct ieee80211_sta_txpwr { * @addr: MAC address of the Link STA. For non-MLO STA this is same as the addr * in ieee80211_sta. For MLO Link STA this addr can be same or different * from addr in ieee80211_sta (representing MLD STA addr) + * @link_id: the link ID for this link STA (0 for deflink) * @supp_rates: Bitmap of supported rates * @ht_cap: HT capabilities of this STA; restricted to our own capabilities * @vht_cap: VHT capabilities of this STA; restricted to our own capabilities @@ -2151,6 +2152,7 @@ struct ieee80211_sta_txpwr { */ struct ieee80211_link_sta { u8 addr[ETH_ALEN]; + u8 link_id; u32 supp_rates[NUM_NL80211_BANDS]; struct ieee80211_sta_ht_cap ht_cap; -- cgit v1.2.3 From b8c9024e0ed03c5feb29bd1086a1eb799f3fff44 Mon Sep 17 00:00:00 2001 From: Veerendranath Jakkam Date: Fri, 22 Jul 2022 18:41:42 +0530 Subject: wifi: cfg80211: Add link_id to cfg80211_ch_switch_started_notify() Add link_id parameter to cfg80211_ch_switch_started_notify() to allow driver to indicate on which link channel switch started on MLD. Send the data to userspace so it knows as well. Signed-off-by: Veerendranath Jakkam Link: https://lore.kernel.org/r/20220722131143.3438042-1-quic_vjakkam@quicinc.com Link: https://lore.kernel.org/r/20220722131143.3438042-2-quic_vjakkam@quicinc.com [squash two patches] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index ffbc65a9b00b..e09ff87146c1 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -8281,6 +8281,7 @@ void cfg80211_ch_switch_notify(struct net_device *dev, * cfg80211_ch_switch_started_notify - notify channel switch start * @dev: the device on which the channel switch started * @chandef: the future channel definition + * @link_id: the link ID for MLO, must be 0 for non-MLO * @count: the number of TBTTs until the channel switch happens * @quiet: whether or not immediate quiet was requested by the AP * @@ -8290,7 +8291,8 @@ void cfg80211_ch_switch_notify(struct net_device *dev, */ void cfg80211_ch_switch_started_notify(struct net_device *dev, struct cfg80211_chan_def *chandef, - u8 count, bool quiet); + unsigned int link_id, u8 count, + bool quiet); /** * ieee80211_operating_class_to_band - convert operating class to band -- cgit v1.2.3 From c19d893fbf3f2f8fa864ae39652c7fee939edde2 Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Wed, 24 Aug 2022 08:52:31 +0800 Subject: net: sched: delete duplicate cleanup of backlog and qlen qdisc_reset() is clearing qdisc->q.qlen and qdisc->qstats.backlog _after_ calling qdisc->ops->reset. There is no need to clear them again in the specific reset function. Signed-off-by: Zhengchao Shao Link: https://lore.kernel.org/r/20220824005231.345727-1-shaozhengchao@huawei.com Signed-off-by: Paolo Abeni --- include/net/sch_generic.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index ec693fe7c553..f2958fb5ae08 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -1137,7 +1137,6 @@ static inline void __qdisc_reset_queue(struct qdisc_skb_head *qh) static inline void qdisc_reset_queue(struct Qdisc *sch) { __qdisc_reset_queue(&sch->q); - sch->qstats.backlog = 0; } static inline struct Qdisc *qdisc_replace(struct Qdisc *sch, struct Qdisc *new, -- cgit v1.2.3 From d4ccaf58a8472123ac97e6db03932c375b5c45ba Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Wed, 24 Aug 2022 16:31:13 -0700 Subject: bpf: Introduce cgroup iter Cgroup_iter is a type of bpf_iter. It walks over cgroups in four modes: - walking a cgroup's descendants in pre-order. - walking a cgroup's descendants in post-order. - walking a cgroup's ancestors. - process only the given cgroup. When attaching cgroup_iter, one can set a cgroup to the iter_link created from attaching. This cgroup is passed as a file descriptor or cgroup id and serves as the starting point of the walk. If no cgroup is specified, the starting point will be the root cgroup v2. For walking descendants, one can specify the order: either pre-order or post-order. For walking ancestors, the walk starts at the specified cgroup and ends at the root. One can also terminate the walk early by returning 1 from the iter program. Note that because walking cgroup hierarchy holds cgroup_mutex, the iter program is called with cgroup_mutex held. Currently only one session is supported, which means, depending on the volume of data bpf program intends to send to user space, the number of cgroups that can be walked is limited. For example, given the current buffer size is 8 * PAGE_SIZE, if the program sends 64B data for each cgroup, assuming PAGE_SIZE is 4kb, the total number of cgroups that can be walked is 512. This is a limitation of cgroup_iter. If the output data is larger than the kernel buffer size, after all data in the kernel buffer is consumed by user space, the subsequent read() syscall will signal EOPNOTSUPP. In order to work around, the user may have to update their program to reduce the volume of data sent to output. For example, skip some uninteresting cgroups. In future, we may extend bpf_iter flags to allow customizing buffer size. Acked-by: Yonghong Song Acked-by: Tejun Heo Signed-off-by: Hao Luo Link: https://lore.kernel.org/r/20220824233117.1312810-2-haoluo@google.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 8 ++++++++ include/uapi/linux/bpf.h | 30 ++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 99fc7a64564f..9c1674973e03 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -48,6 +48,7 @@ struct mem_cgroup; struct module; struct bpf_func_state; struct ftrace_ops; +struct cgroup; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; @@ -1730,7 +1731,14 @@ int bpf_obj_get_user(const char __user *pathname, int flags); int __init bpf_iter_ ## target(args) { return 0; } struct bpf_iter_aux_info { + /* for map_elem iter */ struct bpf_map *map; + + /* for cgroup iter */ + struct { + struct cgroup *start; /* starting cgroup */ + enum bpf_cgroup_iter_order order; + } cgroup; }; typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 644600dbb114..0f61f09f467a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -87,10 +87,29 @@ struct bpf_cgroup_storage_key { __u32 attach_type; /* program attach type (enum bpf_attach_type) */ }; +enum bpf_cgroup_iter_order { + BPF_ITER_ORDER_UNSPEC = 0, + BPF_ITER_SELF_ONLY, /* process only a single object. */ + BPF_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */ + BPF_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */ + BPF_ITER_ANCESTORS_UP, /* walk ancestors upward. */ +}; + union bpf_iter_link_info { struct { __u32 map_fd; } map; + struct { + enum bpf_cgroup_iter_order order; + + /* At most one of cgroup_fd and cgroup_id can be non-zero. If + * both are zero, the walk starts from the default cgroup v2 + * root. For walking v1 hierarchy, one should always explicitly + * specify cgroup_fd. + */ + __u32 cgroup_fd; + __u64 cgroup_id; + } cgroup; }; /* BPF syscall commands, see bpf(2) man-page for more details. */ @@ -6176,11 +6195,22 @@ struct bpf_link_info { struct { __aligned_u64 target_name; /* in/out: target_name buffer ptr */ __u32 target_name_len; /* in/out: target_name buffer len */ + + /* If the iter specific field is 32 bits, it can be put + * in the first or second union. Otherwise it should be + * put in the second union. + */ union { struct { __u32 map_id; } map; }; + union { + struct { + __u64 cgroup_id; + __u32 order; + } cgroup; + }; } iter; struct { __u32 netns_ino; -- cgit v1.2.3 From bb67012331f7f07ff325877fbbb430bc515d371e Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 24 Aug 2022 14:20:09 +0200 Subject: net: devlink: extend info_get() version put to indicate a flash component Whenever the driver is called by his info_get() op, it may put multiple version names and values to the netlink message. Extend by additional helper devlink_info_version_running/stored_put_ext() that allows to specify a version type that indicates when particular version name represents a flash component. This is going to be used in follow-up patch calling info_get() during flash update command checking if version with this the version type exists. Signed-off-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 119ed1ffb988..f50a002e5023 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1714,15 +1714,31 @@ int devlink_info_driver_name_put(struct devlink_info_req *req, const char *name); int devlink_info_board_serial_number_put(struct devlink_info_req *req, const char *bsn); + +enum devlink_info_version_type { + DEVLINK_INFO_VERSION_TYPE_NONE, + DEVLINK_INFO_VERSION_TYPE_COMPONENT, /* May be used as flash update + * component by name. + */ +}; + int devlink_info_version_fixed_put(struct devlink_info_req *req, const char *version_name, const char *version_value); int devlink_info_version_stored_put(struct devlink_info_req *req, const char *version_name, const char *version_value); +int devlink_info_version_stored_put_ext(struct devlink_info_req *req, + const char *version_name, + const char *version_value, + enum devlink_info_version_type version_type); int devlink_info_version_running_put(struct devlink_info_req *req, const char *version_name, const char *version_value); +int devlink_info_version_running_put_ext(struct devlink_info_req *req, + const char *version_name, + const char *version_value, + enum devlink_info_version_type version_type); int devlink_fmsg_obj_nest_start(struct devlink_fmsg *fmsg); int devlink_fmsg_obj_nest_end(struct devlink_fmsg *fmsg); -- cgit v1.2.3 From f94b606325c194adadaee2265b4db990802c4850 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 24 Aug 2022 14:20:11 +0200 Subject: net: devlink: limit flash component name to match version returned by info_get() Limit the acceptance of component name passed to cmd_flash_update() to match one of the versions returned by info_get(), marked by version type. This makes things clearer and enforces 1:1 mapping between exposed version and accepted flash component. Check VERSION_TYPE_COMPONENT version type during cmd_flash_update() execution by calling info_get() with different "req" context. That causes info_get() to lookup the component name instead of filling-up the netlink message. Remove "UPDATE_COMPONENT" flag which becomes used. Signed-off-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index f50a002e5023..1f7026011856 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -624,8 +624,7 @@ struct devlink_flash_update_params { u32 overwrite_mask; }; -#define DEVLINK_SUPPORT_FLASH_UPDATE_COMPONENT BIT(0) -#define DEVLINK_SUPPORT_FLASH_UPDATE_OVERWRITE_MASK BIT(1) +#define DEVLINK_SUPPORT_FLASH_UPDATE_OVERWRITE_MASK BIT(0) struct devlink_region; struct devlink_info_req; -- cgit v1.2.3 From b9030780971b56c0c455c3b66244efd96608846d Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 22 Aug 2022 16:32:43 +0200 Subject: netdev: Use try_cmpxchg in napi_if_scheduled_mark_missed Use try_cmpxchg instead of cmpxchg (*ptr, old, new) == old in napi_if_scheduled_mark_missed. x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg (and related move instruction in front of cmpxchg). Also, try_cmpxchg implicitly assigns old *ptr value to "old" when cmpxchg fails, enabling further code simplifications. Cc: Eric Dumazet Cc: Paolo Abeni Signed-off-by: Uros Bizjak Link: https://lore.kernel.org/r/20220822143243.2798-1-ubizjak@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 64e8662632f8..8839abc1f941 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -546,8 +546,8 @@ static inline bool napi_if_scheduled_mark_missed(struct napi_struct *n) { unsigned long val, new; + val = READ_ONCE(n->state); do { - val = READ_ONCE(n->state); if (val & NAPIF_STATE_DISABLE) return true; @@ -555,7 +555,7 @@ static inline bool napi_if_scheduled_mark_missed(struct napi_struct *n) return false; new = val | NAPIF_STATE_MISSED; - } while (cmpxchg(&n->state, val, new) != val); + } while (!try_cmpxchg(&n->state, &val, new)); return true; } -- cgit v1.2.3 From c249ea9b4309cf3250c5bbb42a05d38d0ed9071c Mon Sep 17 00:00:00 2001 From: Brian Gix Date: Fri, 5 Aug 2022 16:42:32 -0700 Subject: Bluetooth: Move Adv Instance timer to hci_sync The Advertising Instance expiration timer adv_instance_expire was handled with the deprecated hci_request mechanism, rather than it's replacement: hci_sync. Signed-off-by: Brian Gix Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_sync.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h index 3843f5060c73..aea950440b9d 100644 --- a/include/net/bluetooth/hci_sync.h +++ b/include/net/bluetooth/hci_sync.h @@ -72,7 +72,8 @@ int hci_start_per_adv_sync(struct hci_dev *hdev, u8 instance, u8 data_len, int hci_remove_advertising_sync(struct hci_dev *hdev, struct sock *sk, u8 instance, bool force); int hci_disable_advertising_sync(struct hci_dev *hdev); - +int hci_clear_adv_instance_sync(struct hci_dev *hdev, struct sock *sk, + u8 instance, bool force); int hci_update_passive_scan_sync(struct hci_dev *hdev); int hci_update_passive_scan(struct hci_dev *hdev); int hci_read_rssi_sync(struct hci_dev *hdev, __le16 handle); -- cgit v1.2.3 From 3fe318ee72c54506534f51b4b4dfb19e0e0df2db Mon Sep 17 00:00:00 2001 From: Brian Gix Date: Fri, 5 Aug 2022 16:42:34 -0700 Subject: Bluetooth: move hci_get_random_address() to hci_sync This function has no dependencies on the deprecated hci_request mechanism, so has been moved unchanged to hci_sync.c Signed-off-by: Brian Gix Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_sync.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h index aea950440b9d..b6b975c2ed3e 100644 --- a/include/net/bluetooth/hci_sync.h +++ b/include/net/bluetooth/hci_sync.h @@ -16,6 +16,7 @@ struct hci_cmd_sync_work_entry { hci_cmd_sync_work_destroy_t destroy; }; +struct adv_info; /* Function with sync suffix shall not be called with hdev->lock held as they * wait the command to complete and in the meantime an event could be received * which could attempt to acquire hdev->lock causing a deadlock. @@ -51,6 +52,10 @@ int hci_update_class_sync(struct hci_dev *hdev); int hci_update_name_sync(struct hci_dev *hdev); int hci_write_ssp_mode_sync(struct hci_dev *hdev, u8 mode); +int hci_get_random_address(struct hci_dev *hdev, bool require_privacy, + bool use_rpa, struct adv_info *adv_instance, + u8 *own_addr_type, bdaddr_t *rand_addr); + int hci_update_random_address_sync(struct hci_dev *hdev, bool require_privacy, bool rpa, u8 *own_addr_type); -- cgit v1.2.3 From 651cd3d65b0f76a2198fcf3a80ce5d53dd267717 Mon Sep 17 00:00:00 2001 From: Brian Gix Date: Fri, 5 Aug 2022 16:42:35 -0700 Subject: Bluetooth: convert hci_update_adv_data to hci_sync hci_update_adv_data() is called from hci_event and hci_core due to events from the controller. The prior function used the deprecated hci_request method, and the new one uses hci_sync.c Signed-off-by: Brian Gix Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_sync.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h index b6b975c2ed3e..17f5a4c32f36 100644 --- a/include/net/bluetooth/hci_sync.h +++ b/include/net/bluetooth/hci_sync.h @@ -61,6 +61,7 @@ int hci_update_random_address_sync(struct hci_dev *hdev, bool require_privacy, int hci_update_scan_rsp_data_sync(struct hci_dev *hdev, u8 instance); int hci_update_adv_data_sync(struct hci_dev *hdev, u8 instance); +int hci_update_adv_data(struct hci_dev *hdev, u8 instance); int hci_schedule_adv_instance_sync(struct hci_dev *hdev, u8 instance, bool force); -- cgit v1.2.3 From d4ffb6f39f1a1b260966b43a4ffdb64779c650dd Mon Sep 17 00:00:00 2001 From: Hao Luo Date: Thu, 25 Aug 2022 15:39:36 -0700 Subject: bpf: Add CGROUP prefix to cgroup_iter_order bpf_cgroup_iter_order is globally visible but the entries do not have CGROUP prefix. As requested by Andrii, put a CGROUP in the names in bpf_cgroup_iter_order. This patch fixes two previous commits: one introduced the API and the other uses the API in bpf selftest (that is, the selftest cgroup_hierarchical_stats). I tested this patch via the following command: test_progs -t cgroup,iter,btf_dump Fixes: d4ccaf58a847 ("bpf: Introduce cgroup iter") Fixes: 88886309d2e8 ("selftests/bpf: add a selftest for cgroup hierarchical stats collection") Suggested-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Signed-off-by: Hao Luo Link: https://lore.kernel.org/r/20220825223936.1865810-1-haoluo@google.com Signed-off-by: Martin KaFai Lau --- include/uapi/linux/bpf.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0f61f09f467a..bdf4bc6d8d6b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -88,11 +88,11 @@ struct bpf_cgroup_storage_key { }; enum bpf_cgroup_iter_order { - BPF_ITER_ORDER_UNSPEC = 0, - BPF_ITER_SELF_ONLY, /* process only a single object. */ - BPF_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */ - BPF_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */ - BPF_ITER_ANCESTORS_UP, /* walk ancestors upward. */ + BPF_CGROUP_ITER_ORDER_UNSPEC = 0, + BPF_CGROUP_ITER_SELF_ONLY, /* process only a single object. */ + BPF_CGROUP_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */ + BPF_CGROUP_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */ + BPF_CGROUP_ITER_ANCESTORS_UP, /* walk ancestors upward. */ }; union bpf_iter_link_info { -- cgit v1.2.3 From 44387d1736c40a74085be354e2b5f37ca0689608 Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Wed, 24 Aug 2022 17:10:03 +0800 Subject: net: sched: remove unnecessary init of qdisc skb head The memory allocated by using kzallloc_node and kcalloc has been cleared. Therefore, the structure members of the new qdisc are 0. So there's no need to explicitly assign a value of 0. Signed-off-by: Zhengchao Shao Signed-off-by: David S. Miller --- include/net/sch_generic.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index f2958fb5ae08..7dc83400bde4 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -940,13 +940,6 @@ static inline void qdisc_purge_queue(struct Qdisc *sch) qdisc_tree_reduce_backlog(sch, qlen, backlog); } -static inline void qdisc_skb_head_init(struct qdisc_skb_head *qh) -{ - qh->head = NULL; - qh->tail = NULL; - qh->qlen = 0; -} - static inline void __qdisc_enqueue_tail(struct sk_buff *skb, struct qdisc_skb_head *qh) { -- cgit v1.2.3 From 54c4ef34c4b6f9720fded620e2893894f9f2c554 Mon Sep 17 00:00:00 2001 From: Andrey Zhadchenko Date: Thu, 25 Aug 2022 05:04:49 +0300 Subject: openvswitch: allow specifying ifindex of new interfaces CRIU is preserving ifindexes of net devices after restoration. However, current Open vSwitch API does not allow to target ifindex, so we cannot correctly restore OVS configuration. Add new OVS_DP_ATTR_IFINDEX for OVS_DP_CMD_NEW and use it as desired ifindex. Use OVS_VPORT_ATTR_IFINDEX during OVS_VPORT_CMD_NEW to specify new netdev ifindex. Signed-off-by: Andrey Zhadchenko Acked-by: Christian Brauner (Microsoft) Signed-off-by: Jakub Kicinski --- include/uapi/linux/openvswitch.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index ce3e1738d427..94066f87e9ee 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -76,6 +76,8 @@ enum ovs_datapath_cmd { * datapath. Always present in notifications. * @OVS_DP_ATTR_MEGAFLOW_STATS: Statistics about mega flow masks usage for the * datapath. Always present in notifications. + * @OVS_DP_ATTR_IFINDEX: Interface index for a new datapath netdev. Only + * valid for %OVS_DP_CMD_NEW requests. * * These attributes follow the &struct ovs_header within the Generic Netlink * payload for %OVS_DP_* commands. @@ -92,6 +94,7 @@ enum ovs_datapath_attr { OVS_DP_ATTR_PER_CPU_PIDS, /* Netlink PIDS to receive upcalls in * per-cpu dispatch mode */ + OVS_DP_ATTR_IFINDEX, __OVS_DP_ATTR_MAX }; -- cgit v1.2.3 From aa75622c3be4d5819ce69c714acbcbd67bba5d65 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Thu, 25 Aug 2022 23:08:06 +0100 Subject: bpf: Fix a few typos in BPF helpers documentation Address a few typos in the documentation for the BPF helper functions. They were reported by Jakub [0], who ran spell checkers on the generated man page [1]. [0] https://lore.kernel.org/linux-man/d22dcd47-023c-8f52-d369-7b5308e6c842@gmail.com/T/#mb02e7d4b7fb61d98fa914c77b581184e9a9537af [1] https://lore.kernel.org/linux-man/eb6a1e41-c48e-ac45-5154-ac57a2c76108@gmail.com/T/#m4a8d1b003616928013ffcd1450437309ab652f9f v3: Do not copy unrelated (and breaking) elements to tools/ header v2: Turn a ',' into a ';' Reported-by: Jakub Wilk Signed-off-by: Quentin Monnet Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220825220806.107143-1-quentin@isovalent.com --- include/uapi/linux/bpf.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bdf4bc6d8d6b..962960a98835 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4456,7 +4456,7 @@ union bpf_attr { * * **-EEXIST** if the option already exists. * - * **-EFAULT** on failrue to parse the existing header options. + * **-EFAULT** on failure to parse the existing header options. * * **-EPERM** if the helper cannot be used under the current * *skops*\ **->op**. @@ -4665,7 +4665,7 @@ union bpf_attr { * a *map* with *task* as the **key**. From this * perspective, the usage is not much different from * **bpf_map_lookup_elem**\ (*map*, **&**\ *task*) except this - * helper enforces the key must be an task_struct and the map must also + * helper enforces the key must be a task_struct and the map must also * be a **BPF_MAP_TYPE_TASK_STORAGE**. * * Underneath, the value is stored locally at *task* instead of @@ -4723,7 +4723,7 @@ union bpf_attr { * * long bpf_ima_inode_hash(struct inode *inode, void *dst, u32 size) * Description - * Returns the stored IMA hash of the *inode* (if it's avaialable). + * Returns the stored IMA hash of the *inode* (if it's available). * If the hash is larger than *size*, then only *size* * bytes will be copied to *dst* * Return @@ -4747,12 +4747,12 @@ union bpf_attr { * * The argument *len_diff* can be used for querying with a planned * size change. This allows to check MTU prior to changing packet - * ctx. Providing an *len_diff* adjustment that is larger than the + * ctx. Providing a *len_diff* adjustment that is larger than the * actual packet size (resulting in negative packet size) will in - * principle not exceed the MTU, why it is not considered a - * failure. Other BPF-helpers are needed for performing the - * planned size change, why the responsability for catch a negative - * packet size belong in those helpers. + * principle not exceed the MTU, which is why it is not considered + * a failure. Other BPF helpers are needed for performing the + * planned size change; therefore the responsibility for catching + * a negative packet size belongs in those helpers. * * Specifying *ifindex* zero means the MTU check is performed * against the current net device. This is practical if this isn't -- cgit v1.2.3 From 5182a5d48c3d1992b2db8748f96914e07eee0956 Mon Sep 17 00:00:00 2001 From: Eyal Birger Date: Fri, 26 Aug 2022 14:46:58 +0300 Subject: net: allow storing xfrm interface metadata in metadata_dst XFRM interfaces provide the association of various XFRM transformations to a netdevice using an 'if_id' identifier common to both the XFRM data structures (polcies, states) and the interface. The if_id is configured by the controlling entity (usually the IKE daemon) and can be used by the administrator to define logical relations between different connections. For example, different connections can share the if_id identifier so that they pass through the same interface, . However, currently it is not possible for connections using a different if_id to use the same interface while retaining the logical separation between them, without using additional criteria such as skb marks or different traffic selectors. When having a large number of connections, it is useful to have a the logical separation offered by the if_id identifier but use a single network interface. Similar to the way collect_md mode is used in IP tunnels. This patch attempts to enable different configuration mechanisms - such as ebpf programs, LWT encapsulations, and TC - to attach metadata to skbs which would carry the if_id. This way a single xfrm interface in collect_md mode can demux traffic based on this configuration on tx and provide this metadata on rx. The XFRM metadata is somewhat similar to ip tunnel metadata in that it has an "id", and shares similar configuration entities (bpf, tc, ...), however, it does not necessarily represent an IP tunnel or use other ip tunnel information, and also has an optional "link" property which can be used for affecting underlying routing decisions. Additional xfrm related criteria may also be added in the future. Therefore, a new metadata type is introduced, to be used in subsequent patches in the xfrm interface and configuration entities. Reviewed-by: Nikolay Aleksandrov Reviewed-by: Nicolas Dichtel Signed-off-by: Eyal Birger Signed-off-by: Steffen Klassert --- include/net/dst_metadata.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include') diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index adab27ba1ecb..e4b059908cc7 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -9,6 +9,7 @@ enum metadata_type { METADATA_IP_TUNNEL, METADATA_HW_PORT_MUX, + METADATA_XFRM, }; struct hw_port_info { @@ -16,12 +17,18 @@ struct hw_port_info { u32 port_id; }; +struct xfrm_md_info { + u32 if_id; + int link; +}; + struct metadata_dst { struct dst_entry dst; enum metadata_type type; union { struct ip_tunnel_info tun_info; struct hw_port_info port_info; + struct xfrm_md_info xfrm_info; } u; }; @@ -53,6 +60,16 @@ skb_tunnel_info(const struct sk_buff *skb) return NULL; } +static inline struct xfrm_md_info *skb_xfrm_md_info(const struct sk_buff *skb) +{ + struct metadata_dst *md_dst = skb_metadata_dst(skb); + + if (md_dst && md_dst->type == METADATA_XFRM) + return &md_dst->u.xfrm_info; + + return NULL; +} + static inline bool skb_valid_dst(const struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); @@ -82,6 +99,9 @@ static inline int skb_metadata_dst_cmp(const struct sk_buff *skb_a, return memcmp(&a->u.tun_info, &b->u.tun_info, sizeof(a->u.tun_info) + a->u.tun_info.options_len); + case METADATA_XFRM: + return memcmp(&a->u.xfrm_info, &b->u.xfrm_info, + sizeof(a->u.xfrm_info)); default: return 1; } -- cgit v1.2.3 From abc340b38ba25cd6c7aa2c0bd9150d30738c82d0 Mon Sep 17 00:00:00 2001 From: Eyal Birger Date: Fri, 26 Aug 2022 14:46:59 +0300 Subject: xfrm: interface: support collect metadata mode This commit adds support for 'collect_md' mode on xfrm interfaces. Each net can have one collect_md device, created by providing the IFLA_XFRM_COLLECT_METADATA flag at creation. This device cannot be altered and has no if_id or link device attributes. On transmit to this device, the if_id is fetched from the attached dst metadata on the skb. If exists, the link property is also fetched from the metadata. The dst metadata type used is METADATA_XFRM which holds these properties. On the receive side, xfrmi_rcv_cb() populates a dst metadata for each packet received and attaches it to the skb. The if_id used in this case is fetched from the xfrm state, and the link is fetched from the incoming device. This information can later be used by upper layers such as tc, ebpf, and ip rules. Because the skb is scrubed in xfrmi_rcv_cb(), the attachment of the dst metadata is postponed until after scrubing. Similarly, xfrm_input() is adapted to avoid dropping metadata dsts by only dropping 'valid' (skb_valid_dst(skb) == true) dsts. Policy matching on packets arriving from collect_md xfrmi devices is done by using the xfrm state existing in the skb's sec_path. The xfrm_if_cb.decode_cb() interface implemented by xfrmi_decode_session() is changed to keep the details of the if_id extraction tucked away in xfrm_interface.c. Reviewed-by: Nicolas Dichtel Reviewed-by: Nikolay Aleksandrov Signed-off-by: Eyal Birger Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 11 +++++++++-- include/uapi/linux/if_link.h | 1 + 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 6e8fa98f786f..28b988577ed2 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -312,9 +312,15 @@ struct km_event { struct net *net; }; +struct xfrm_if_decode_session_result { + struct net *net; + u32 if_id; +}; + struct xfrm_if_cb { - struct xfrm_if *(*decode_session)(struct sk_buff *skb, - unsigned short family); + bool (*decode_session)(struct sk_buff *skb, + unsigned short family, + struct xfrm_if_decode_session_result *res); }; void xfrm_if_register_cb(const struct xfrm_if_cb *ifcb); @@ -985,6 +991,7 @@ void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev); struct xfrm_if_parms { int link; /* ifindex of underlying L2 interface */ u32 if_id; /* interface identifyer */ + bool collect_md; }; struct xfrm_if { diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index e36d9d2c65a7..d96f13a42589 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -694,6 +694,7 @@ enum { IFLA_XFRM_UNSPEC, IFLA_XFRM_LINK, IFLA_XFRM_IF_ID, + IFLA_XFRM_COLLECT_METADATA, __IFLA_XFRM_MAX }; -- cgit v1.2.3 From 2c2493b9da9166478fe072e3054f8a5741dadb02 Mon Sep 17 00:00:00 2001 From: Eyal Birger Date: Fri, 26 Aug 2022 14:47:00 +0300 Subject: xfrm: lwtunnel: add lwtunnel support for xfrm interfaces in collect_md mode Allow specifying the xfrm interface if_id and link as part of a route metadata using the lwtunnel infrastructure. This allows for example using a single xfrm interface in collect_md mode as the target of multiple routes each specifying a different if_id. With the appropriate changes to iproute2, considering an xfrm device ipsec1 in collect_md mode one can for example add a route specifying an if_id like so: ip route add dev ipsec1 encap xfrm if_id 1 In which case traffic routed to the device via this route would use if_id in the xfrm interface policy lookup. Or in the context of vrf, one can also specify the "link" property: ip route add dev ipsec1 encap xfrm if_id 1 link_dev eth15 Note: LWT_XFRM_LINK uses NLA_U32 similar to IFLA_XFRM_LINK even though internally "link" is signed. This is consistent with other _LINK attributes in other devices as well as in bpf and should not have an effect as device indexes can't be negative. Reviewed-by: Nicolas Dichtel Reviewed-by: Nikolay Aleksandrov Signed-off-by: Eyal Birger Signed-off-by: Steffen Klassert --- include/net/dst_metadata.h | 11 +++++++++++ include/uapi/linux/lwtunnel.h | 10 ++++++++++ 2 files changed, 21 insertions(+) (limited to 'include') diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index e4b059908cc7..57f75960fa28 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -60,13 +60,24 @@ skb_tunnel_info(const struct sk_buff *skb) return NULL; } +static inline struct xfrm_md_info *lwt_xfrm_info(struct lwtunnel_state *lwt) +{ + return (struct xfrm_md_info *)lwt->data; +} + static inline struct xfrm_md_info *skb_xfrm_md_info(const struct sk_buff *skb) { struct metadata_dst *md_dst = skb_metadata_dst(skb); + struct dst_entry *dst; if (md_dst && md_dst->type == METADATA_XFRM) return &md_dst->u.xfrm_info; + dst = skb_dst(skb); + if (dst && dst->lwtstate && + dst->lwtstate->type == LWTUNNEL_ENCAP_XFRM) + return lwt_xfrm_info(dst->lwtstate); + return NULL; } diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h index 2e206919125c..229655ef792f 100644 --- a/include/uapi/linux/lwtunnel.h +++ b/include/uapi/linux/lwtunnel.h @@ -15,6 +15,7 @@ enum lwtunnel_encap_types { LWTUNNEL_ENCAP_SEG6_LOCAL, LWTUNNEL_ENCAP_RPL, LWTUNNEL_ENCAP_IOAM6, + LWTUNNEL_ENCAP_XFRM, __LWTUNNEL_ENCAP_MAX, }; @@ -111,4 +112,13 @@ enum { #define LWT_BPF_MAX_HEADROOM 256 +enum { + LWT_XFRM_UNSPEC, + LWT_XFRM_IF_ID, + LWT_XFRM_LINK, + __LWT_XFRM_MAX, +}; + +#define LWT_XFRM_MAX (__LWT_XFRM_MAX - 1) + #endif /* _UAPI_LWTUNNEL_H_ */ -- cgit v1.2.3 From 9c5d03d362519f36cd551aec596388f895c93d2d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 24 Aug 2022 17:18:30 -0700 Subject: genetlink: start to validate reserved header bytes We had historically not checked that genlmsghdr.reserved is 0 on input which prevents us from using those precious bytes in the future. One use case would be to extend the cmd field, which is currently just 8 bits wide and 256 is not a lot of commands for some core families. To make sure that new families do the right thing by default put the onus of opting out of validation on existing families. Signed-off-by: Jakub Kicinski Acked-by: Paul Moore (NetLabel) Signed-off-by: David S. Miller --- include/linux/genl_magic_func.h | 1 + include/net/genetlink.h | 3 +++ 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/genl_magic_func.h b/include/linux/genl_magic_func.h index 939b1a8f571b..4a4b387181ad 100644 --- a/include/linux/genl_magic_func.h +++ b/include/linux/genl_magic_func.h @@ -294,6 +294,7 @@ static struct genl_family ZZZ_genl_family __ro_after_init = { .ops = ZZZ_genl_ops, .n_ops = ARRAY_SIZE(ZZZ_genl_ops), .mcgrps = ZZZ_genl_mcgrps, + .resv_start_op = 42, /* drbd is currently the only user */ .n_mcgrps = ARRAY_SIZE(ZZZ_genl_mcgrps), .module = THIS_MODULE, }; diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 56a50e1c51b9..a4827b5e1e07 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -39,6 +39,8 @@ struct genl_info; * undo operations done by pre_doit, for example release locks * @mcgrps: multicast groups used by this family * @n_mcgrps: number of multicast groups + * @resv_start_op: first operation for which reserved fields of the header + * can be validated, new families should leave this field at zero * @mcgrp_offset: starting number of multicast group IDs in this family * (private) * @ops: the operations supported by this family @@ -58,6 +60,7 @@ struct genl_family { u8 n_ops; u8 n_small_ops; u8 n_mcgrps; + u8 resv_start_op; const struct nla_policy *policy; int (*pre_doit)(const struct genl_ops *ops, struct sk_buff *skb, -- cgit v1.2.3 From e8013f8edaa30e17fd583a326daff1fa86b75c82 Mon Sep 17 00:00:00 2001 From: Casper Andersson Date: Thu, 25 Aug 2022 11:28:35 +0200 Subject: ethernet: Add helpers to recognize addresses mapped to IP multicast IP multicast must sometimes be discriminated from non-IP multicast, e.g. when determining the forwarding behavior of a given group in the presence of multicast router ports on an offloaded bridge. Therefore, provide helpers to identify these groups. Signed-off-by: Casper Andersson Signed-off-by: David S. Miller --- include/linux/etherdevice.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include') diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 92b10e67d5f8..a541f0c4f146 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -428,6 +428,28 @@ static inline bool ether_addr_equal_masked(const u8 *addr1, const u8 *addr2, return true; } +static inline bool ether_addr_is_ipv4_mcast(const u8 *addr) +{ + u8 base[ETH_ALEN] = { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x00 }; + u8 mask[ETH_ALEN] = { 0xff, 0xff, 0xff, 0x80, 0x00, 0x00 }; + + return ether_addr_equal_masked(addr, base, mask); +} + +static inline bool ether_addr_is_ipv6_mcast(const u8 *addr) +{ + u8 base[ETH_ALEN] = { 0x33, 0x33, 0x00, 0x00, 0x00, 0x00 }; + u8 mask[ETH_ALEN] = { 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 }; + + return ether_addr_equal_masked(addr, base, mask); +} + +static inline bool ether_addr_is_ip_mcast(const u8 *addr) +{ + return ether_addr_is_ipv4_mcast(addr) || + ether_addr_is_ipv6_mcast(addr); +} + /** * ether_addr_to_u64 - Convert an Ethernet address into a u64 value. * @addr: Pointer to a six-byte array containing the Ethernet address -- cgit v1.2.3 From 690252f19f0e486abb8590b3a7a03d4e065d93d4 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 25 Aug 2022 20:09:31 -0700 Subject: netlink: add support for ext_ack missing attributes There is currently no way to report via extack in a structured way that an attribute is missing. This leads to families resorting to string messages. Add a pair of attributes - @offset and @type for machine-readable way of reporting missing attributes. The @offset points to the nest which should have contained the attribute, @type is the expected nla_type. The offset will be skipped if the attribute is missing at the message level rather than inside a nest. User space should be able to figure out which attribute enum (AKA attribute space AKA attribute set) the nest pointed to by @offset is using. Reviewed-by: Johannes Berg Signed-off-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- include/linux/netlink.h | 13 +++++++++++++ include/uapi/linux/netlink.h | 6 ++++++ 2 files changed, 19 insertions(+) (limited to 'include') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index bda1c385cffb..1619221c415c 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -71,6 +71,8 @@ netlink_kernel_create(struct net *net, int unit, struct netlink_kernel_cfg *cfg) * %NL_SET_ERR_MSG * @bad_attr: attribute with error * @policy: policy for a bad attribute + * @miss_type: attribute type which was missing + * @miss_nest: nest missing an attribute (%NULL if missing top level attr) * @cookie: cookie data to return to userspace (for success) * @cookie_len: actual cookie data length */ @@ -78,6 +80,8 @@ struct netlink_ext_ack { const char *_msg; const struct nlattr *bad_attr; const struct nla_policy *policy; + const struct nlattr *miss_nest; + u16 miss_type; u8 cookie[NETLINK_MAX_COOKIE_LEN]; u8 cookie_len; }; @@ -126,6 +130,15 @@ struct netlink_ext_ack { #define NL_SET_ERR_MSG_ATTR(extack, attr, msg) \ NL_SET_ERR_MSG_ATTR_POL(extack, attr, NULL, msg) +#define NL_SET_ERR_ATTR_MISS(extack, nest, type) do { \ + struct netlink_ext_ack *__extack = (extack); \ + \ + if (__extack) { \ + __extack->miss_nest = (nest); \ + __extack->miss_type = (type); \ + } \ +} while (0) + static inline void nl_set_extack_cookie_u64(struct netlink_ext_ack *extack, u64 cookie) { diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index e0ab261ceca2..e0689dbd2cde 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -140,6 +140,10 @@ struct nlmsgerr { * be used - in the success case - to identify a created * object or operation or similar (binary) * @NLMSGERR_ATTR_POLICY: policy for a rejected attribute + * @NLMSGERR_ATTR_MISS_TYPE: type of a missing required attribute, + * %NLMSGERR_ATTR_MISS_NEST will not be present if the attribute was + * missing at the message level + * @NLMSGERR_ATTR_MISS_NEST: offset of the nest where attribute was missing * @__NLMSGERR_ATTR_MAX: number of attributes * @NLMSGERR_ATTR_MAX: highest attribute number */ @@ -149,6 +153,8 @@ enum nlmsgerr_attrs { NLMSGERR_ATTR_OFFS, NLMSGERR_ATTR_COOKIE, NLMSGERR_ATTR_POLICY, + NLMSGERR_ATTR_MISS_TYPE, + NLMSGERR_ATTR_MISS_NEST, __NLMSGERR_ATTR_MAX, NLMSGERR_ATTR_MAX = __NLMSGERR_ATTR_MAX - 1 -- cgit v1.2.3 From 45dca15759643806660e9285e6af8a1ba3c76c82 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 25 Aug 2022 20:09:32 -0700 Subject: netlink: add helpers for extack attr presence checking Being able to check attribute presence and set extack if not on one line is handy, add helpers. Reviewed-by: Johannes Berg Signed-off-by: Jakub Kicinski Signed-off-by: Paolo Abeni --- include/linux/netlink.h | 11 +++++++++++ include/net/genetlink.h | 7 +++++++ 2 files changed, 18 insertions(+) (limited to 'include') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 1619221c415c..d51e041d2242 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -139,6 +139,17 @@ struct netlink_ext_ack { } \ } while (0) +#define NL_REQ_ATTR_CHECK(extack, nest, tb, type) ({ \ + struct nlattr **__tb = (tb); \ + u32 __attr = (type); \ + int __retval; \ + \ + __retval = !__tb[__attr]; \ + if (__retval) \ + NL_SET_ERR_ATTR_MISS((extack), (nest), __attr); \ + __retval; \ +}) + static inline void nl_set_extack_cookie_u64(struct netlink_ext_ack *extack, u64 cookie) { diff --git a/include/net/genetlink.h b/include/net/genetlink.h index a4827b5e1e07..8f780170e2f8 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -110,6 +110,13 @@ static inline void genl_info_net_set(struct genl_info *info, struct net *net) #define GENL_SET_ERR_MSG(info, msg) NL_SET_ERR_MSG((info)->extack, msg) +/* Report that a root attribute is missing */ +#define GENL_REQ_ATTR_CHECK(info, attr) ({ \ + struct genl_info *__info = (info); \ + \ + NL_REQ_ATTR_CHECK(__info->extack, NULL, __info->attrs, (attr)); \ +}) + enum genl_validate_flags { GENL_DONT_VALIDATE_STRICT = BIT(0), GENL_DONT_VALIDATE_DUMP = BIT(1), -- cgit v1.2.3 From 146ecbac1d327e7ed2153cfb3ef880166dc2b312 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 26 Aug 2022 10:27:30 +0200 Subject: net: devlink: stub port params cmds for they are unused internally Follow-up the removal of unused internal api of port params made by commit 42ded61aa75e ("devlink: Delete not used port parameters APIs") and stub the commands and add extack message to tell the user what is going on. If later on port params are needed, could be easily re-introduced, but until then it is a dead code. Signed-off-by: Jiri Pirko Reviewed-by: Jakub Kicinski Link: https://lore.kernel.org/r/20220826082730.1399735-1-jiri@resnulli.us Signed-off-by: Paolo Abeni --- include/net/devlink.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 1f7026011856..264aa98e6da6 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -118,7 +118,6 @@ struct devlink_rate { struct devlink_port { struct list_head list; - struct list_head param_list; struct list_head region_list; struct devlink *devlink; unsigned int index; -- cgit v1.2.3 From f9cad07b840ec8a8eb54928908d694b6e262631c Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Tue, 30 Aug 2022 18:32:47 +0300 Subject: thunderbolt: Show link type for XDomain connections too Following what we do for routers already, extend this to XDomain connections as well. This will show in sysfs whether the link is in USB4 or Thunderbolt mode. Signed-off-by: Mika Westerberg Signed-off-by: David S. Miller --- include/linux/thunderbolt.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/thunderbolt.h b/include/linux/thunderbolt.h index 9f442d73f3df..90cd08ab2f5d 100644 --- a/include/linux/thunderbolt.h +++ b/include/linux/thunderbolt.h @@ -187,6 +187,7 @@ void tb_unregister_property_dir(const char *key, struct tb_property_dir *dir); * @device_name: Name of the device (or %NULL if not known) * @link_speed: Speed of the link in Gb/s * @link_width: Width of the link (1 or 2) + * @link_usb4: Downstream link is USB4 * @is_unplugged: The XDomain is unplugged * @needs_uuid: If the XDomain does not have @remote_uuid it will be * queried first @@ -234,6 +235,7 @@ struct tb_xdomain { const char *device_name; unsigned int link_speed; unsigned int link_width; + bool link_usb4; bool is_unplugged; bool needs_uuid; struct ida service_ids; -- cgit v1.2.3 From 1a942de092c0b96216864fedcb4d8822ce3fc12e Mon Sep 17 00:00:00 2001 From: Brian Gix Date: Tue, 16 Aug 2022 09:41:20 -0700 Subject: Bluetooth: Move hci_abort_conn to hci_conn.c hci_abort_conn() is a wrapper around a number of DISCONNECT and CREATE_CONN_CANCEL commands that was being invoked from hci_request request queues, which are now deprecated. There are two versions: hci_abort_conn() which can be invoked from the hci_event thread, and hci_abort_conn_sync() which can be invoked within a hci_sync cmd chain. Signed-off-by: Brian Gix Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index e7862903187d..932153e68864 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -2075,6 +2075,7 @@ int mgmt_phy_configuration_changed(struct hci_dev *hdev, struct sock *skip); void mgmt_adv_monitor_device_lost(struct hci_dev *hdev, u16 handle, bdaddr_t *bdaddr, u8 addr_type); +int hci_abort_conn(struct hci_conn *conn, u8 reason); u8 hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, u16 latency, u16 to_multiplier); void hci_le_start_enc(struct hci_conn *conn, __le16 ediv, __le64 rand, -- cgit v1.2.3 From 4516c873e3b55856012ddd6db9d4366ce3c60c5d Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Tue, 30 Aug 2022 17:22:55 +0800 Subject: net: sched: gred/red: remove unused variables in struct red_stats The variable "other" in the struct red_stats is not used. Remove it. Signed-off-by: Zhengchao Shao Signed-off-by: Jakub Kicinski --- include/net/red.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/red.h b/include/net/red.h index be11dbd26492..454ac2b65d8c 100644 --- a/include/net/red.h +++ b/include/net/red.h @@ -122,7 +122,6 @@ struct red_stats { u32 forced_drop; /* Forced drops, qavg > max_thresh */ u32 forced_mark; /* Forced marks, qavg > max_thresh */ u32 pdrop; /* Drops due to queue limits */ - u32 other; /* Drops due to drop() calls */ }; struct red_parms { -- cgit v1.2.3 From 977f1aa5e4d1942bc8012bf3f0e695008979ff4e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 31 Aug 2022 18:44:27 +0000 Subject: net: bql: add more documentation Add some documentation for netdev_tx_sent_queue() and netdev_tx_completed_queue() Stating that netdev_tx_completed_queue() must be called once per TX completion round is apparently not obvious for everybody. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index eec35f9b6616..cfe41c053e11 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3353,6 +3353,16 @@ static inline void netdev_txq_bql_complete_prefetchw(struct netdev_queue *dev_qu #endif } +/** + * netdev_tx_sent_queue - report the number of bytes queued to a given tx queue + * @dev_queue: network device queue + * @bytes: number of bytes queued to the device queue + * + * Report the number of bytes queued for sending/completion to the network + * device hardware queue. @bytes should be a good approximation and should + * exactly match netdev_completed_queue() @bytes. + * This is typically called once per packet, from ndo_start_xmit(). + */ static inline void netdev_tx_sent_queue(struct netdev_queue *dev_queue, unsigned int bytes) { @@ -3398,13 +3408,14 @@ static inline bool __netdev_tx_sent_queue(struct netdev_queue *dev_queue, } /** - * netdev_sent_queue - report the number of bytes queued to hardware - * @dev: network device - * @bytes: number of bytes queued to the hardware device queue + * netdev_sent_queue - report the number of bytes queued to hardware + * @dev: network device + * @bytes: number of bytes queued to the hardware device queue * - * Report the number of bytes queued for sending/completion to the network - * device hardware queue. @bytes should be a good approximation and should - * exactly match netdev_completed_queue() @bytes + * Report the number of bytes queued for sending/completion to the network + * device hardware queue#0. @bytes should be a good approximation and should + * exactly match netdev_completed_queue() @bytes. + * This is typically called once per packet, from ndo_start_xmit(). */ static inline void netdev_sent_queue(struct net_device *dev, unsigned int bytes) { @@ -3419,6 +3430,15 @@ static inline bool __netdev_sent_queue(struct net_device *dev, xmit_more); } +/** + * netdev_tx_completed_queue - report number of packets/bytes at TX completion. + * @dev_queue: network device queue + * @pkts: number of packets (currently ignored) + * @bytes: number of bytes dequeued from the device queue + * + * Must be called at most once per TX completion round (and not per + * individual packet), so that BQL can adjust its limits appropriately. + */ static inline void netdev_tx_completed_queue(struct netdev_queue *dev_queue, unsigned int pkts, unsigned int bytes) { -- cgit v1.2.3 From c3f760ef128789252e7c4f10d3c1721422dceba9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 31 Aug 2022 17:00:58 -0700 Subject: net: remove netif_tx_napi_add() All callers are now gone. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cfe41c053e11..f0068c1ff1df 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2569,8 +2569,6 @@ netif_napi_add_tx_weight(struct net_device *dev, netif_napi_add_weight(dev, napi, poll, weight); } -#define netif_tx_napi_add netif_napi_add_tx_weight - /** * netif_napi_add_tx() - initialize a NAPI context to be used for Tx only * @dev: network device -- cgit v1.2.3 From dc84dbbcc97bfb47e0f2b175d816e601b2890c91 Mon Sep 17 00:00:00 2001 From: Shung-Hsi Yu Date: Wed, 31 Aug 2022 11:19:06 +0800 Subject: bpf, tnums: Warn against the usage of tnum_in(tnum_range(), ...) Commit a657182a5c51 ("bpf: Don't use tnum_range on array range checking for poke descriptors") has shown that using tnum_range() as argument to tnum_in() can lead to misleading code that looks like tight bound check when in fact the actual allowed range is much wider. Document such behavior to warn against its usage in general, and suggest some scenario where result can be trusted. Signed-off-by: Shung-Hsi Yu Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/984b37f9fdf7ac36831d2137415a4a915744c1b6.1661462653.git.daniel@iogearbox.net Link: https://www.openwall.com/lists/oss-security/2022/08/26/1 Link: https://lore.kernel.org/bpf/20220831031907.16133-3-shung-hsi.yu@suse.com Link: https://lore.kernel.org/bpf/20220831031907.16133-2-shung-hsi.yu@suse.com --- include/linux/tnum.h | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/tnum.h b/include/linux/tnum.h index 498dbcedb451..1c3948a1d6ad 100644 --- a/include/linux/tnum.h +++ b/include/linux/tnum.h @@ -21,7 +21,12 @@ struct tnum { struct tnum tnum_const(u64 value); /* A completely unknown value */ extern const struct tnum tnum_unknown; -/* A value that's unknown except that @min <= value <= @max */ +/* An unknown value that is a superset of @min <= value <= @max. + * + * Could include values outside the range of [@min, @max]. + * For example tnum_range(0, 2) is represented by {0, 1, 2, *3*}, + * rather than the intended set of {0, 1, 2}. + */ struct tnum tnum_range(u64 min, u64 max); /* Arithmetic and logical ops */ @@ -73,7 +78,18 @@ static inline bool tnum_is_unknown(struct tnum a) */ bool tnum_is_aligned(struct tnum a, u64 size); -/* Returns true if @b represents a subset of @a. */ +/* Returns true if @b represents a subset of @a. + * + * Note that using tnum_range() as @a requires extra cautions as tnum_in() may + * return true unexpectedly due to tnum limited ability to represent tight + * range, e.g. + * + * tnum_in(tnum_range(0, 2), tnum_const(3)) == true + * + * As a rule of thumb, if @a is explicitly coded rather than coming from + * reg->var_off, it should be in form of tnum_const(), tnum_range(0, 2**n - 1), + * or tnum_range(2**n, 2**(n+1) - 1). + */ bool tnum_in(struct tnum a, struct tnum b); /* Formatting functions. These have snprintf-like semantics: they will write -- cgit v1.2.3 From 44c51472bef83bb70b43e2f4b7a592096f32a855 Mon Sep 17 00:00:00 2001 From: Shmulik Ladkani Date: Wed, 31 Aug 2022 17:40:09 +0300 Subject: bpf: Support getting tunnel flags Existing 'bpf_skb_get_tunnel_key' extracts various tunnel parameters (id, ttl, tos, local and remote) but does not expose ip_tunnel_info's tun_flags to the BPF program. It makes sense to expose tun_flags to the BPF program. Assume for example multiple GRE tunnels maintained on a single GRE interface in collect_md mode. The program expects origins to initiate over GRE, however different origins use different GRE characteristics (e.g. some prefer to use GRE checksum, some do not; some pass a GRE key, some do not, etc..). A BPF program getting tun_flags can therefore remember the relevant flags (e.g. TUNNEL_CSUM, TUNNEL_SEQ...) for each initiating remote. In the reply path, the program can use 'bpf_skb_set_tunnel_key' in order to correctly reply to the remote, using similar characteristics, based on the stored tunnel flags. Introduce BPF_F_TUNINFO_FLAGS flag for bpf_skb_get_tunnel_key. If specified, 'bpf_tunnel_key->tunnel_flags' is set with the tun_flags. Decided to use the existing unused 'tunnel_ext' as the storage for the 'tunnel_flags' in order to avoid changing bpf_tunnel_key's layout. Also, the following has been considered during the design: 1. Convert the "interesting" internal TUNNEL_xxx flags back to BPF_F_yyy and place into the new 'tunnel_flags' field. This has 2 drawbacks: - The BPF_F_yyy flags are from *set_tunnel_key* enumeration space, e.g. BPF_F_ZERO_CSUM_TX. It is awkward that it is "returned" into tunnel_flags from a *get_tunnel_key* call. - Not all "interesting" TUNNEL_xxx flags can be mapped to existing BPF_F_yyy flags, and it doesn't make sense to create new BPF_F_yyy flags just for purposes of the returned tunnel_flags. 2. Place key.tun_flags into 'tunnel_flags' but mask them, keeping only "interesting" flags. That's ok, but the drawback is that what's "interesting" for my usecase might be limiting for other usecases. Therefore I decided to expose what's in key.tun_flags *as is*, which seems most flexible. The BPF user can just choose to ignore bits he's not interested in. The TUNNEL_xxx are also UAPI, so no harm exposing them back in the get_tunnel_key call. Signed-off-by: Shmulik Ladkani Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220831144010.174110-1-shmulik.ladkani@gmail.com --- include/uapi/linux/bpf.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 962960a98835..837c0f9b7fdd 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5659,6 +5659,11 @@ enum { BPF_F_SEQ_NUMBER = (1ULL << 3), }; +/* BPF_FUNC_skb_get_tunnel_key flags. */ +enum { + BPF_F_TUNINFO_FLAGS = (1ULL << 4), +}; + /* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and * BPF_FUNC_perf_event_read_value flags. */ @@ -5848,7 +5853,10 @@ struct bpf_tunnel_key { }; __u8 tunnel_tos; __u8 tunnel_ttl; - __u16 tunnel_ext; /* Padding, future use. */ + union { + __u16 tunnel_ext; /* compat */ + __be16 tunnel_flags; + }; __u32 tunnel_label; union { __u32 local_ipv4; -- cgit v1.2.3 From 8254393663f9b8cb8b84cdce1abb118833c22a54 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Thu, 1 Sep 2022 20:06:20 -0700 Subject: net: ieee802154: Fix compilation error when CONFIG_IEEE802154_NL802154_EXPERIMENTAL is disabled When CONFIG_IEEE802154_NL802154_EXPERIMENTAL is disabled, NL802154_CMD_DEL_SEC_LEVEL is undefined and results in a compilation error: net/ieee802154/nl802154.c:2503:19: error: 'NL802154_CMD_DEL_SEC_LEVEL' undeclared here (not in a function); did you mean 'NL802154_CMD_SET_CCA_ED_LEVEL'? 2503 | .resv_start_op = NL802154_CMD_DEL_SEC_LEVEL + 1, | ^~~~~~~~~~~~~~~~~~~~~~~~~~ | NL802154_CMD_SET_CCA_ED_LEVEL Unhide the experimental commands, having them defined in an enum makes no difference. Fixes: 9c5d03d36251 ("genetlink: start to validate reserved header bytes") Signed-off-by: Gal Pressman Acked-by: Stefan Schmidt Tested-by: Sudip Mukherjee Link: https://lore.kernel.org/r/20220902030620.2737091-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/net/nl802154.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/nl802154.h b/include/net/nl802154.h index 145acb8f2509..f5850b569c52 100644 --- a/include/net/nl802154.h +++ b/include/net/nl802154.h @@ -58,9 +58,6 @@ enum nl802154_commands { NL802154_CMD_SET_WPAN_PHY_NETNS, - /* add new commands above here */ - -#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL NL802154_CMD_SET_SEC_PARAMS, NL802154_CMD_GET_SEC_KEY, /* can dump */ NL802154_CMD_NEW_SEC_KEY, @@ -74,7 +71,8 @@ enum nl802154_commands { NL802154_CMD_GET_SEC_LEVEL, /* can dump */ NL802154_CMD_NEW_SEC_LEVEL, NL802154_CMD_DEL_SEC_LEVEL, -#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ + + /* add new commands above here */ /* used to define NL802154_CMD_MAX below */ __NL802154_CMD_AFTER_LAST, -- cgit v1.2.3 From 4ff09db1b79b98b4a2a7511571c640b76cab3beb Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 1 Sep 2022 17:28:02 -0700 Subject: bpf: net: Change sk_getsockopt() to take the sockptr_t argument This patch changes sk_getsockopt() to take the sockptr_t argument such that it can be used by bpf_getsockopt(SOL_SOCKET) in a latter patch. security_socket_getpeersec_stream() is not changed. It stays with the __user ptr (optval.user and optlen.user) to avoid changes to other security hooks. bpf_getsockopt(SOL_SOCKET) also does not support SO_PEERSEC. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220902002802.2888419-1-kafai@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 3 +-- include/linux/sockptr.h | 5 +++++ 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index a5f21dc3c432..527ae1d64e27 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -900,8 +900,7 @@ int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk); void sk_reuseport_prog_free(struct bpf_prog *prog); int sk_detach_filter(struct sock *sk); -int sk_get_filter(struct sock *sk, struct sock_filter __user *filter, - unsigned int len); +int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len); bool sk_filter_charge(struct sock *sk, struct sk_filter *fp); void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp); diff --git a/include/linux/sockptr.h b/include/linux/sockptr.h index d45902fb4cad..bae5e2369b4f 100644 --- a/include/linux/sockptr.h +++ b/include/linux/sockptr.h @@ -64,6 +64,11 @@ static inline int copy_to_sockptr_offset(sockptr_t dst, size_t offset, return 0; } +static inline int copy_to_sockptr(sockptr_t dst, const void *src, size_t size) +{ + return copy_to_sockptr_offset(dst, 0, src, size); +} + static inline void *memdup_sockptr(sockptr_t src, size_t len) { void *p = kmalloc_track_caller(len, GFP_USER | __GFP_NOWARN); -- cgit v1.2.3 From 728f064cd7ebea8c182e99e6f152c8b4a0a6b071 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 1 Sep 2022 17:28:28 -0700 Subject: bpf: net: Change do_ip_getsockopt() to take the sockptr_t argument Similar to the earlier patch that changes sk_getsockopt() to take the sockptr_t argument. This patch also changes do_ip_getsockopt() to take the sockptr_t argument such that a latter patch can make bpf_getsockopt(SOL_IP) to reuse do_ip_getsockopt(). Note on the change in ip_mc_gsfget(). This function is to return an array of sockaddr_storage in optval. This function is shared between ip_get_mcast_msfilter() and compat_ip_get_mcast_msfilter(). However, the sockaddr_storage is stored at different offset of the optval because of the difference between group_filter and compat_group_filter. Thus, a new 'ss_offset' argument is added to ip_mc_gsfget(). Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220902002828.2890585-1-kafai@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/igmp.h | 4 ++-- include/linux/mroute.h | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/igmp.h b/include/linux/igmp.h index 93c262ecbdc9..78890143f079 100644 --- a/include/linux/igmp.h +++ b/include/linux/igmp.h @@ -118,9 +118,9 @@ extern int ip_mc_source(int add, int omode, struct sock *sk, struct ip_mreq_source *mreqs, int ifindex); extern int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf,int ifindex); extern int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf, - struct ip_msfilter __user *optval, int __user *optlen); + sockptr_t optval, sockptr_t optlen); extern int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, - struct sockaddr_storage __user *p); + sockptr_t optval, size_t offset); extern int ip_mc_sf_allow(struct sock *sk, __be32 local, __be32 rmt, int dif, int sdif); extern void ip_mc_init_dev(struct in_device *); diff --git a/include/linux/mroute.h b/include/linux/mroute.h index 6cbbfe94348c..80b8400ab8b2 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -17,7 +17,7 @@ static inline int ip_mroute_opt(int opt) } int ip_mroute_setsockopt(struct sock *, int, sockptr_t, unsigned int); -int ip_mroute_getsockopt(struct sock *, int, char __user *, int __user *); +int ip_mroute_getsockopt(struct sock *, int, sockptr_t, sockptr_t); int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg); int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); int ip_mr_init(void); @@ -29,8 +29,8 @@ static inline int ip_mroute_setsockopt(struct sock *sock, int optname, return -ENOPROTOOPT; } -static inline int ip_mroute_getsockopt(struct sock *sock, int optname, - char __user *optval, int __user *optlen) +static inline int ip_mroute_getsockopt(struct sock *sk, int optname, + sockptr_t optval, sockptr_t optlen) { return -ENOPROTOOPT; } -- cgit v1.2.3 From 6dadbe4bac68309eb46ab0f30e8ff47a789df49a Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 1 Sep 2022 17:28:53 -0700 Subject: bpf: net: Change do_ipv6_getsockopt() to take the sockptr_t argument Similar to the earlier patch that changes sk_getsockopt() to take the sockptr_t argument . This patch also changes do_ipv6_getsockopt() to take the sockptr_t argument such that a latter patch can make bpf_getsockopt(SOL_IPV6) to reuse do_ipv6_getsockopt(). Note on the change in ip6_mc_msfget(). This function is to return an array of sockaddr_storage in optval. This function is shared between ipv6_get_msfilter() and compat_ipv6_get_msfilter(). However, the sockaddr_storage is stored at different offset of the optval because of the difference between group_filter and compat_group_filter. Thus, a new 'ss_offset' argument is added to ip6_mc_msfget(). Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220902002853.2892532-1-kafai@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/mroute6.h | 4 ++-- include/net/ipv6.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mroute6.h b/include/linux/mroute6.h index bc351a85ce9b..8f2b307fb124 100644 --- a/include/linux/mroute6.h +++ b/include/linux/mroute6.h @@ -27,7 +27,7 @@ struct sock; #ifdef CONFIG_IPV6_MROUTE extern int ip6_mroute_setsockopt(struct sock *, int, sockptr_t, unsigned int); -extern int ip6_mroute_getsockopt(struct sock *, int, char __user *, int __user *); +extern int ip6_mroute_getsockopt(struct sock *, int, sockptr_t, sockptr_t); extern int ip6_mr_input(struct sk_buff *skb); extern int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg); extern int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); @@ -42,7 +42,7 @@ static inline int ip6_mroute_setsockopt(struct sock *sock, int optname, static inline int ip6_mroute_getsockopt(struct sock *sock, - int optname, char __user *optval, int __user *optlen) + int optname, sockptr_t optval, sockptr_t optlen) { return -ENOPROTOOPT; } diff --git a/include/net/ipv6.h b/include/net/ipv6.h index c110d9032083..a4f24573ed7a 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1209,7 +1209,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk, int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf, struct sockaddr_storage *list); int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, - struct sockaddr_storage __user *p); + sockptr_t optval, size_t ss_offset); #ifdef CONFIG_PROC_FS int ac6_proc_init(struct net *net); -- cgit v1.2.3 From 65ddc82d3b96be5555a36de4e2b4547433a00532 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 1 Sep 2022 17:29:12 -0700 Subject: bpf: Change bpf_getsockopt(SOL_SOCKET) to reuse sk_getsockopt() This patch changes bpf_getsockopt(SOL_SOCKET) to reuse sk_getsockopt(). It removes all duplicated code from bpf_getsockopt(SOL_SOCKET). Before this patch, there were some optnames available to bpf_setsockopt(SOL_SOCKET) but missing in bpf_getsockopt(SOL_SOCKET). It surprises users from time to time. For example, SO_REUSEADDR, SO_KEEPALIVE, SO_RCVLOWAT, and SO_MAX_PACING_RATE. This patch automatically closes this gap without duplicating more code. The only exception is SO_BINDTODEVICE because it needs to acquire a blocking lock. Thus, SO_BINDTODEVICE is not supported. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220902002912.2894040-1-kafai@fb.com Signed-off-by: Alexei Starovoitov --- include/net/sock.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index ee44b424d952..ea7965524133 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1833,6 +1833,8 @@ int sk_setsockopt(struct sock *sk, int level, int optname, int sock_setsockopt(struct socket *sock, int level, int op, sockptr_t optval, unsigned int optlen); +int sk_getsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, sockptr_t optlen); int sock_getsockopt(struct socket *sock, int level, int op, char __user *optval, int __user *optlen); int sock_gettstamp(struct socket *sock, void __user *userstamp, -- cgit v1.2.3 From 273b7f0fb44847c41814a59901977be284daa447 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 1 Sep 2022 17:29:18 -0700 Subject: bpf: Change bpf_getsockopt(SOL_TCP) to reuse do_tcp_getsockopt() This patch changes bpf_getsockopt(SOL_TCP) to reuse do_tcp_getsockopt(). It removes the duplicated code from bpf_getsockopt(SOL_TCP). Before this patch, there were some optnames available to bpf_setsockopt(SOL_TCP) but missing in bpf_getsockopt(SOL_TCP). For example, TCP_NODELAY, TCP_MAXSEG, TCP_KEEPIDLE, TCP_KEEPINTVL, and a few more. It surprises users from time to time. This patch automatically closes this gap without duplicating more code. bpf_getsockopt(TCP_SAVED_SYN) does not free the saved_syn, so it stays in sol_tcp_sockopt(). For string name value like TCP_CONGESTION, bpf expects it is always null terminated, so sol_tcp_sockopt() decrements optlen by one before calling do_tcp_getsockopt() and the 'if (optlen < saved_optlen) memset(..,0,..);' in __bpf_getsockopt() will always do a null termination. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220902002918.2894511-1-kafai@fb.com Signed-off-by: Alexei Starovoitov --- include/net/tcp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index c03a50c72f40..735e957f7f4b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -402,6 +402,8 @@ void tcp_init_sock(struct sock *sk); void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb); __poll_t tcp_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); +int do_tcp_getsockopt(struct sock *sk, int level, + int optname, sockptr_t optval, sockptr_t optlen); int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); bool tcp_bpf_bypass_getsockopt(int level, int optname); -- cgit v1.2.3 From fd969f25fe24be515278d28cbf86dde39be8a495 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 1 Sep 2022 17:29:25 -0700 Subject: bpf: Change bpf_getsockopt(SOL_IP) to reuse do_ip_getsockopt() This patch changes bpf_getsockopt(SOL_IP) to reuse do_ip_getsockopt() and remove the duplicated code. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220902002925.2895416-1-kafai@fb.com Signed-off-by: Alexei Starovoitov --- include/net/ip.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/ip.h b/include/net/ip.h index 34fa5b0f0a0e..038097c2a152 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -747,6 +747,8 @@ int do_ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); +int do_ip_getsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, sockptr_t optlen); int ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); int ip_ra_control(struct sock *sk, unsigned char on, -- cgit v1.2.3 From 38566ec06f52250c4abaa7447aae676e0b881c46 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 1 Sep 2022 17:29:31 -0700 Subject: bpf: Change bpf_getsockopt(SOL_IPV6) to reuse do_ipv6_getsockopt() This patch changes bpf_getsockopt(SOL_IPV6) to reuse do_ipv6_getsockopt(). It removes the duplicated code from bpf_getsockopt(SOL_IPV6). This also makes bpf_getsockopt(SOL_IPV6) supporting the same set of optnames as in bpf_setsockopt(SOL_IPV6). In particular, this adds IPV6_AUTOFLOWLABEL support to bpf_getsockopt(SOL_IPV6). ipv6 could be compiled as a module. Like how other code solved it with stubs in ipv6_stubs.h, this patch adds the do_ipv6_getsockopt to the ipv6_bpf_stub. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220902002931.2896218-1-kafai@fb.com Signed-off-by: Alexei Starovoitov --- include/net/ipv6.h | 2 ++ include/net/ipv6_stubs.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index a4f24573ed7a..d664ba5812d8 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1160,6 +1160,8 @@ int do_ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval unsigned int optlen); int ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); +int do_ipv6_getsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, sockptr_t optlen); int ipv6_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h index 8692698b01cf..c48186bf4737 100644 --- a/include/net/ipv6_stubs.h +++ b/include/net/ipv6_stubs.h @@ -83,6 +83,8 @@ struct ipv6_bpf_stub { struct sk_buff *skb); int (*ipv6_setsockopt)(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); + int (*ipv6_getsockopt)(struct sock *sk, int level, int optname, + sockptr_t optval, sockptr_t optlen); }; extern const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly; -- cgit v1.2.3 From 5854a09b49574da5a77a0f36ad7b021a2661321d Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 31 Aug 2022 14:12:42 -0500 Subject: net/ipv4: Use __DECLARE_FLEX_ARRAY() helper We now have a cleaner way to keep compatibility with user-space (a.k.a. not breaking it) when we need to keep in place a one-element array (for its use in user-space) together with a flexible-array member (for its use in kernel-space) without making it hard to read at the source level. This is through the use of the new __DECLARE_FLEX_ARRAY() helper macro. The size and memory layout of the structure is preserved after the changes. See below. Before changes: $ pahole -C ip_msfilter net/ipv4/igmp.o struct ip_msfilter { union { struct { __be32 imsf_multiaddr_aux; /* 0 4 */ __be32 imsf_interface_aux; /* 4 4 */ __u32 imsf_fmode_aux; /* 8 4 */ __u32 imsf_numsrc_aux; /* 12 4 */ __be32 imsf_slist[1]; /* 16 4 */ }; /* 0 20 */ struct { __be32 imsf_multiaddr; /* 0 4 */ __be32 imsf_interface; /* 4 4 */ __u32 imsf_fmode; /* 8 4 */ __u32 imsf_numsrc; /* 12 4 */ __be32 imsf_slist_flex[0]; /* 16 0 */ }; /* 0 16 */ }; /* 0 20 */ /* size: 20, cachelines: 1, members: 1 */ /* last cacheline: 20 bytes */ }; After changes: $ pahole -C ip_msfilter net/ipv4/igmp.o struct ip_msfilter { __be32 imsf_multiaddr; /* 0 4 */ __be32 imsf_interface; /* 4 4 */ __u32 imsf_fmode; /* 8 4 */ __u32 imsf_numsrc; /* 12 4 */ union { __be32 imsf_slist[1]; /* 16 4 */ struct { struct { } __empty_imsf_slist_flex; /* 16 0 */ __be32 imsf_slist_flex[0]; /* 16 0 */ }; /* 16 0 */ }; /* 16 4 */ /* size: 20, cachelines: 1, members: 5 */ /* last cacheline: 20 bytes */ }; In the past, we had to duplicate the whole original structure within a union, and update the names of all the members. Now, we just need to declare the flexible-array member to be used in kernel-space through the __DECLARE_FLEX_ARRAY() helper together with the one-element array, within a union. This makes the source code more clean and easier to read. Link: https://github.com/KSPP/linux/issues/193 Signed-off-by: Gustavo A. R. Silva Reviewed-by: Kees Cook Signed-off-by: David S. Miller --- include/uapi/linux/in.h | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h index 14168225cecd..578daa6f816b 100644 --- a/include/uapi/linux/in.h +++ b/include/uapi/linux/in.h @@ -188,21 +188,13 @@ struct ip_mreq_source { }; struct ip_msfilter { + __be32 imsf_multiaddr; + __be32 imsf_interface; + __u32 imsf_fmode; + __u32 imsf_numsrc; union { - struct { - __be32 imsf_multiaddr_aux; - __be32 imsf_interface_aux; - __u32 imsf_fmode_aux; - __u32 imsf_numsrc_aux; - __be32 imsf_slist[1]; - }; - struct { - __be32 imsf_multiaddr; - __be32 imsf_interface; - __u32 imsf_fmode; - __u32 imsf_numsrc; - __be32 imsf_slist_flex[]; - }; + __be32 imsf_slist[1]; + __DECLARE_FLEX_ARRAY(__be32, imsf_slist_flex); }; }; -- cgit v1.2.3 From a36c421690b3e5dee38fc12abfcabda742f00064 Mon Sep 17 00:00:00 2001 From: James Prestwood Date: Fri, 26 Aug 2022 10:00:31 -0700 Subject: wifi: nl80211: Add POWERED_ADDR_CHANGE feature Add a new extended feature bit signifying that the wireless hardware supports changing the MAC address while the underlying net_device is powered. Note that this has a different meaning from IFF_LIVE_ADDR_CHANGE as additional restrictions might be imposed by the hardware, such as: - No connection is active on this interface, carrier is off - No scan is in progress - No offchannel operations are in progress Signed-off-by: James Prestwood Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 573db20403dc..a00a23840c57 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -6281,6 +6281,14 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_RADAR_BACKGROUND: Device supports background radar/CAC * detection. * + * @NL80211_EXT_FEATURE_POWERED_ADDR_CHANGE: Device can perform a MAC address + * change without having to bring the underlying network device down + * first. For example, in station mode this can be used to vary the + * origin MAC address prior to a connection to a new AP for privacy + * or other reasons. Note that certain driver specific restrictions + * might apply, e.g. no scans in progress, no offchannel operations + * in progress, and no active connections. + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -6348,6 +6356,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_BSS_COLOR, NL80211_EXT_FEATURE_FILS_CRYPTO_OFFLOAD, NL80211_EXT_FEATURE_RADAR_BACKGROUND, + NL80211_EXT_FEATURE_POWERED_ADDR_CHANGE, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, -- cgit v1.2.3 From 6522047c65764c9aaec8009e73daa8c0b138c701 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 2 Sep 2022 16:12:50 +0200 Subject: wifi: nl80211: add MLD address to assoc BSS entries Add an MLD address attribute to BSS entries that the interface is currently associated with to help userspace figure out what's going on. Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index a00a23840c57..c32e7616a366 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4959,6 +4959,7 @@ enum nl80211_bss_scan_width { * using the nesting index as the antenna number. * @NL80211_BSS_FREQUENCY_OFFSET: frequency offset in KHz * @NL80211_BSS_MLO_LINK_ID: MLO link ID of the BSS (u8). + * @NL80211_BSS_MLD_ADDR: MLD address of this BSS if connected to it. * @__NL80211_BSS_AFTER_LAST: internal * @NL80211_BSS_MAX: highest BSS attribute */ @@ -4985,6 +4986,7 @@ enum nl80211_bss { NL80211_BSS_CHAIN_SIGNAL, NL80211_BSS_FREQUENCY_OFFSET, NL80211_BSS_MLO_LINK_ID, + NL80211_BSS_MLD_ADDR, /* keep last */ __NL80211_BSS_AFTER_LAST, -- cgit v1.2.3 From 4a502cf4d77e12119e7061a05d5789cd3129d185 Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Fri, 2 Sep 2022 10:32:03 +0200 Subject: net: pcs: add new PCS driver for altera TSE PCS The Altera Triple Speed Ethernet has a SGMII/1000BaseC PCS that can be integrated in several ways. It can either be part of the TSE MAC's address space, accessed through 32 bits accesses on the mapped mdio device 0, or through a dedicated 16 bits register set. This driver allows using the TSE PCS outside of altera TSE's driver, since it can be used standalone by other MACs. Signed-off-by: Maxime Chevallier Signed-off-by: David S. Miller --- include/linux/pcs-altera-tse.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 include/linux/pcs-altera-tse.h (limited to 'include') diff --git a/include/linux/pcs-altera-tse.h b/include/linux/pcs-altera-tse.h new file mode 100644 index 000000000000..92ab9f08e835 --- /dev/null +++ b/include/linux/pcs-altera-tse.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2022 Bootlin + * + * Maxime Chevallier + */ + +#ifndef __LINUX_PCS_ALTERA_TSE_H +#define __LINUX_PCS_ALTERA_TSE_H + +struct phylink_pcs; +struct net_device; + +struct phylink_pcs *alt_tse_pcs_create(struct net_device *ndev, + void __iomem *pcs_base, int reg_width); + +#endif /* __LINUX_PCS_ALTERA_TSE_H */ -- cgit v1.2.3 From 8a2dd123f12f69e5373d3103da2c97fc36223e0c Mon Sep 17 00:00:00 2001 From: Chris Mi Date: Mon, 29 Aug 2022 12:06:04 +0300 Subject: RDMA/mlx5: Move function mlx5_core_query_ib_ppcnt() to mlx5_ib This patch doesn't change any functionality, but move one function to mlx5_ib because it is not used by mlx5_core. The actual fix is in the next patch. Reviewed-by: Roi Dayan Signed-off-by: Chris Mi Link: https://lore.kernel.org/r/fd47b9138412bd94ed30f838026cbb4cf3878150.1661763871.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/driver.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 76d7661e3e63..432bd202e2fe 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1079,8 +1079,6 @@ int mlx5_core_destroy_psv(struct mlx5_core_dev *dev, int psv_num); void mlx5_core_put_rsc(struct mlx5_core_rsc_common *common); int mlx5_query_odp_caps(struct mlx5_core_dev *dev, struct mlx5_odp_caps *odp_caps); -int mlx5_core_query_ib_ppcnt(struct mlx5_core_dev *dev, - u8 port_num, void *out, size_t sz); int mlx5_init_rl_table(struct mlx5_core_dev *dev); void mlx5_cleanup_rl_table(struct mlx5_core_dev *dev); -- cgit v1.2.3 From 914f8b228ede709274b8c80514b352248ec9da00 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Fri, 2 Sep 2022 17:57:35 -0400 Subject: soc: fsl: qbman: Add CGR update function This adds a function to update a CGR with new parameters. qman_create_cgr can almost be used for this (with flags=0), but it's not suitable because it also registers the callback function. The _safe variant was modeled off of qman_cgr_delete_safe. However, we handle multiple arguments and a return value. Signed-off-by: Sean Anderson Acked-by: Camelia Groza Signed-off-by: David S. Miller --- include/soc/fsl/qman.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/soc/fsl/qman.h b/include/soc/fsl/qman.h index 59eeba31c192..0d3d6beb7fdb 100644 --- a/include/soc/fsl/qman.h +++ b/include/soc/fsl/qman.h @@ -1171,6 +1171,15 @@ int qman_delete_cgr(struct qman_cgr *cgr); */ void qman_delete_cgr_safe(struct qman_cgr *cgr); +/** + * qman_update_cgr_safe - Modifies a congestion group object from any CPU + * @cgr: the 'cgr' object to modify + * @opts: state of the CGR settings + * + * This will select the proper CPU and modify the CGR settings. + */ +int qman_update_cgr_safe(struct qman_cgr *cgr, struct qm_mcc_initcgr *opts); + /** * qman_query_cgr_congested - Queries CGR's congestion status * @cgr: the 'cgr' object to query -- cgit v1.2.3 From 05ad5d4581c3c1cc724fe50d4652833fb9f3037b Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Fri, 2 Sep 2022 18:02:39 -0400 Subject: net: phy: Add 1000BASE-KX interface mode Add 1000BASE-KX interface mode. This 1G backplane ethernet as described in clause 70. Clause 73 autonegotiation is mandatory, and only full duplex operation is supported. Although at the PMA level this interface mode is identical to 1000BASE-X, it uses a different form of in-band autonegation. This justifies a separate interface mode, since the interface mode (along with the MLO_AN_* autonegotiation mode) sets the type of autonegotiation which will be used on a link. This results in more than just electrical differences between the link modes. With regard to 1000BASE-X, 1000BASE-KX holds a similar position to SGMII: same signaling, but different autonegotiation. PCS drivers (which typically handle in-band autonegotiation) may only support 1000BASE-X, and not 1000BASE-KX. Similarly, the phy mode is used to configure serdes phys with phy_set_mode_ext. Due to the different electrical standards (SFI or XFI vs Clause 70), they will likely want to use different configuration. Adding a phy interface mode for 1000BASE-KX helps simplify configuration in these areas. Signed-off-by: Sean Anderson Signed-off-by: David S. Miller --- include/linux/phy.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 7c49ab95441b..337230c135f7 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -116,6 +116,7 @@ extern const int phy_10gbit_features_array[1]; * @PHY_INTERFACE_MODE_USXGMII: Universal Serial 10GE MII * @PHY_INTERFACE_MODE_10GKR: 10GBASE-KR - with Clause 73 AN * @PHY_INTERFACE_MODE_QUSGMII: Quad Universal SGMII + * @PHY_INTERFACE_MODE_1000BASEKX: 1000Base-KX - with Clause 73 AN * @PHY_INTERFACE_MODE_MAX: Book keeping * * Describes the interface between the MAC and PHY. @@ -154,6 +155,7 @@ typedef enum { /* 10GBASE-KR - with Clause 73 AN */ PHY_INTERFACE_MODE_10GKR, PHY_INTERFACE_MODE_QUSGMII, + PHY_INTERFACE_MODE_1000BASEKX, PHY_INTERFACE_MODE_MAX, } phy_interface_t; @@ -251,6 +253,8 @@ static inline const char *phy_modes(phy_interface_t interface) return "trgmii"; case PHY_INTERFACE_MODE_1000BASEX: return "1000base-x"; + case PHY_INTERFACE_MODE_1000BASEKX: + return "1000base-kx"; case PHY_INTERFACE_MODE_2500BASEX: return "2500base-x"; case PHY_INTERFACE_MODE_5GBASER: -- cgit v1.2.3 From 7c8199e24fa09d2344ae0204527d55d7803e8409 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 2 Sep 2022 14:10:43 -0700 Subject: bpf: Introduce any context BPF specific memory allocator. Tracing BPF programs can attach to kprobe and fentry. Hence they run in unknown context where calling plain kmalloc() might not be safe. Front-end kmalloc() with minimal per-cpu cache of free elements. Refill this cache asynchronously from irq_work. BPF programs always run with migration disabled. It's safe to allocate from cache of the current cpu with irqs disabled. Free-ing is always done into bucket of the current cpu as well. irq_work trims extra free elements from buckets with kfree and refills them with kmalloc, so global kmalloc logic takes care of freeing objects allocated by one cpu and freed on another. struct bpf_mem_alloc supports two modes: - When size != 0 create kmem_cache and bpf_mem_cache for each cpu. This is typical bpf hash map use case when all elements have equal size. - When size == 0 allocate 11 bpf_mem_cache-s for each cpu, then rely on kmalloc/kfree. Max allocation size is 4096 in this case. This is bpf_dynptr and bpf_kptr use case. bpf_mem_alloc/bpf_mem_free are bpf specific 'wrappers' of kmalloc/kfree. bpf_mem_cache_alloc/bpf_mem_cache_free are 'wrappers' of kmem_cache_alloc/kmem_cache_free. The allocators are NMI-safe from bpf programs only. They are not NMI-safe in general. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Kumar Kartikeya Dwivedi Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220902211058.60789-2-alexei.starovoitov@gmail.com --- include/linux/bpf_mem_alloc.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 include/linux/bpf_mem_alloc.h (limited to 'include') diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h new file mode 100644 index 000000000000..804733070f8d --- /dev/null +++ b/include/linux/bpf_mem_alloc.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ +#ifndef _BPF_MEM_ALLOC_H +#define _BPF_MEM_ALLOC_H +#include + +struct bpf_mem_cache; +struct bpf_mem_caches; + +struct bpf_mem_alloc { + struct bpf_mem_caches __percpu *caches; + struct bpf_mem_cache __percpu *cache; +}; + +int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size); +void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma); + +/* kmalloc/kfree equivalent: */ +void *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size); +void bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr); + +/* kmem_cache_alloc/free equivalent: */ +void *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma); +void bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr); + +#endif /* _BPF_MEM_ALLOC_H */ -- cgit v1.2.3 From 4ab67149f3c6e97c5c506a726f0ebdec38241679 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 2 Sep 2022 14:10:52 -0700 Subject: bpf: Add percpu allocation support to bpf_mem_alloc. Extend bpf_mem_alloc to cache free list of fixed size per-cpu allocations. Once such cache is created bpf_mem_cache_alloc() will return per-cpu objects. bpf_mem_cache_free() will free them back into global per-cpu pool after observing RCU grace period. per-cpu flavor of bpf_mem_alloc is going to be used by per-cpu hash maps. The free list cache consists of tuples { llist_node, per-cpu pointer } Unlike alloc_percpu() that returns per-cpu pointer the bpf_mem_cache_alloc() returns a pointer to per-cpu pointer and bpf_mem_cache_free() expects to receive it back. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Kumar Kartikeya Dwivedi Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220902211058.60789-11-alexei.starovoitov@gmail.com --- include/linux/bpf_mem_alloc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h index 804733070f8d..653ed1584a03 100644 --- a/include/linux/bpf_mem_alloc.h +++ b/include/linux/bpf_mem_alloc.h @@ -12,7 +12,7 @@ struct bpf_mem_alloc { struct bpf_mem_cache __percpu *cache; }; -int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size); +int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu); void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma); /* kmalloc/kfree equivalent: */ -- cgit v1.2.3 From 9f2c6e96c65e6fa1aebef546be0c30a5895fcb37 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 2 Sep 2022 14:10:58 -0700 Subject: bpf: Optimize rcu_barrier usage between hash map and bpf_mem_alloc. User space might be creating and destroying a lot of hash maps. Synchronous rcu_barrier-s in a destruction path of hash map delay freeing of hash buckets and other map memory and may cause artificial OOM situation under stress. Optimize rcu_barrier usage between bpf hash map and bpf_mem_alloc: - remove rcu_barrier from hash map, since htab doesn't use call_rcu directly and there are no callback to wait for. - bpf_mem_alloc has call_rcu_in_progress flag that indicates pending callbacks. Use it to avoid barriers in fast path. - When barriers are needed copy bpf_mem_alloc into temp structure and wait for rcu barrier-s in the worker to let the rest of hash map freeing to proceed. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220902211058.60789-17-alexei.starovoitov@gmail.com --- include/linux/bpf_mem_alloc.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h index 653ed1584a03..3e164b8efaa9 100644 --- a/include/linux/bpf_mem_alloc.h +++ b/include/linux/bpf_mem_alloc.h @@ -3,6 +3,7 @@ #ifndef _BPF_MEM_ALLOC_H #define _BPF_MEM_ALLOC_H #include +#include struct bpf_mem_cache; struct bpf_mem_caches; @@ -10,6 +11,7 @@ struct bpf_mem_caches; struct bpf_mem_alloc { struct bpf_mem_caches __percpu *caches; struct bpf_mem_cache __percpu *cache; + struct work_struct work; }; int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu); -- cgit v1.2.3 From 261ce8879578f42bc1ff3385ff1be8e31d6fb160 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Fri, 2 Sep 2022 16:12:41 +0200 Subject: wifi: mac80211: make smps_mode per-link The SMPS power save mode needs to be per-link rather than being shared for all links. As such, move it into struct ieee80211_link_sta. Signed-off-by: Benjamin Berg Signed-off-by: Johannes Berg --- include/net/mac80211.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index ffd0ebbff294..d4e1d73d88cc 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2136,6 +2136,7 @@ struct ieee80211_sta_txpwr { * in ieee80211_sta. For MLO Link STA this addr can be same or different * from addr in ieee80211_sta (representing MLD STA addr) * @link_id: the link ID for this link STA (0 for deflink) + * @smps_mode: current SMPS mode (off, static or dynamic) * @supp_rates: Bitmap of supported rates * @ht_cap: HT capabilities of this STA; restricted to our own capabilities * @vht_cap: VHT capabilities of this STA; restricted to our own capabilities @@ -2153,6 +2154,7 @@ struct ieee80211_sta_txpwr { struct ieee80211_link_sta { u8 addr[ETH_ALEN]; u8 link_id; + enum ieee80211_smps_mode smps_mode; u32 supp_rates[NUM_NL80211_BANDS]; struct ieee80211_sta_ht_cap ht_cap; @@ -2191,7 +2193,6 @@ struct ieee80211_link_sta { * if wme is supported. The bits order is like in * IEEE80211_WMM_IE_STA_QOSINFO_AC_*. * @max_sp: max Service Period. Only valid if wme is supported. - * @smps_mode: current SMPS mode (off, static or dynamic) * @rates: rate control selection table * @tdls: indicates whether the STA is a TDLS peer * @tdls_initiator: indicates the STA is an initiator of the TDLS link. Only @@ -2226,7 +2227,6 @@ struct ieee80211_sta { bool wme; u8 uapsd_queues; u8 max_sp; - enum ieee80211_smps_mode smps_mode; struct ieee80211_sta_rates __rcu *rates; bool tdls; bool tdls_initiator; -- cgit v1.2.3 From efe9c2bfd1a82894e455514a68dc794556fbd463 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 2 Sep 2022 16:12:42 +0200 Subject: wifi: mac80211: isolate driver from inactive links In order to let the driver select active links and properly make multi-link connections, as a first step isolate the driver from inactive links, and set the active links to be only the association link for client-side interfaces. For AP side nothing changes since APs always have to have all their links active. To simplify things, update the for_each_sta_active_link() API to include the appropriate vif pointer. This also implies not allocating a chanctx for an inactive link, which requires a few more changes. Since we now no longer try to program multiple links to the driver, remove the check in the MLME code. Signed-off-by: Johannes Berg --- include/net/mac80211.h | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index d4e1d73d88cc..20a2f25a38fa 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1799,6 +1799,9 @@ struct ieee80211_vif_cfg { * @link_conf: in case of MLD, the per-link BSS configuration, * indexed by link ID * @valid_links: bitmap of valid links, or 0 for non-MLO. + * @active_links: The bitmap of active links, or 0 for non-MLO. + * The driver shouldn't change this directly, but use the + * API calls meant for that purpose. * @addr: address of this interface * @p2p: indicates whether this AP or STA interface is a p2p * interface, i.e. a GO or p2p-sta respectively @@ -1834,7 +1837,7 @@ struct ieee80211_vif { struct ieee80211_vif_cfg cfg; struct ieee80211_bss_conf bss_conf; struct ieee80211_bss_conf __rcu *link_conf[IEEE80211_MLD_MAX_NUM_LINKS]; - u16 valid_links; + u16 valid_links, active_links; u8 addr[ETH_ALEN] __aligned(2); bool p2p; @@ -1861,12 +1864,11 @@ struct ieee80211_vif { u8 drv_priv[] __aligned(sizeof(void *)); }; -/* FIXME: for now loop over all the available links; later will be changed - * to loop only over the active links. - */ -#define for_each_vif_active_link(vif, link, link_id) \ - for (link_id = 0; link_id < ARRAY_SIZE((vif)->link_conf); link_id++) \ - if ((link = rcu_dereference((vif)->link_conf[link_id]))) +#define for_each_vif_active_link(vif, link, link_id) \ + for (link_id = 0; link_id < ARRAY_SIZE((vif)->link_conf); link_id++) \ + if ((!(vif)->active_links || \ + (vif)->active_links & BIT(link_id)) && \ + (link = rcu_dereference((vif)->link_conf[link_id]))) static inline bool ieee80211_vif_is_mesh(struct ieee80211_vif *vif) { @@ -2264,13 +2266,13 @@ struct ieee80211_sta { u8 drv_priv[] __aligned(sizeof(void *)); }; -/* FIXME: need to loop only over links which are active and check the actual - * lock - */ -#define for_each_sta_active_link(sta, link_sta, link_id) \ - for (link_id = 0; link_id < ARRAY_SIZE((sta)->link); link_id++) \ - if (((link_sta) = rcu_dereference_protected((sta)->link[link_id],\ - 1))) \ +/* FIXME: check the locking correctly */ +#define for_each_sta_active_link(vif, sta, link_sta, link_id) \ + for (link_id = 0; link_id < ARRAY_SIZE((sta)->link); link_id++) \ + if ((!(vif)->active_links || \ + (vif)->active_links & BIT(link_id)) && \ + ((link_sta) = rcu_dereference_protected((sta)->link[link_id],\ + 1))) /** * enum sta_notify_cmd - sta notify command -- cgit v1.2.3 From ffa9598ecb93630a45564b95ae17362b819b38de Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 2 Sep 2022 16:12:43 +0200 Subject: wifi: mac80211: add ieee80211_find_sta_by_link_addrs API Add a new API function ieee80211_find_sta_by_link_addrs() that looks up the STA and link ID based on interface and station link addresses. We're going to use it for mac80211-hwsim to track on the AP side which links are active. Signed-off-by: Johannes Berg --- include/net/mac80211.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 20a2f25a38fa..954cc029a9f9 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -5986,6 +5986,22 @@ struct ieee80211_sta *ieee80211_find_sta_by_ifaddr(struct ieee80211_hw *hw, const u8 *addr, const u8 *localaddr); +/** + * ieee80211_find_sta_by_link_addrs - find STA by link addresses + * @hw: pointer as obtained from ieee80211_alloc_hw() + * @addr: remote station's link address + * @localaddr: local link address, use %NULL for any (but avoid that) + * @link_id: pointer to obtain the link ID if the STA is found, + * may be %NULL if the link ID is not needed + * + * Obtain the STA by link address, must use RCU protection. + */ +struct ieee80211_sta * +ieee80211_find_sta_by_link_addrs(struct ieee80211_hw *hw, + const u8 *addr, + const u8 *localaddr, + unsigned int *link_id); + /** * ieee80211_sta_block_awake - block station from waking up * @hw: the hardware -- cgit v1.2.3 From 0ab26380d98655f0aab720704debfa2ac522cefd Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 2 Sep 2022 16:12:47 +0200 Subject: wifi: mac80211: extend ieee80211_nullfunc_get() for MLO Add a link_id parameter to ieee80211_nullfunc_get() to be able to obtain a correctly addressed frame. Signed-off-by: Johannes Berg --- include/net/mac80211.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 954cc029a9f9..bfa6a1625c5c 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -5298,6 +5298,9 @@ struct sk_buff *ieee80211_pspoll_get(struct ieee80211_hw *hw, * ieee80211_nullfunc_get - retrieve a nullfunc template * @hw: pointer obtained from ieee80211_alloc_hw(). * @vif: &struct ieee80211_vif pointer from the add_interface callback. + * @link_id: If the vif is an MLD, get a frame with the link addresses + * for the given link ID. For a link_id < 0 you get a frame with + * MLD addresses, however useful that might be. * @qos_ok: QoS NDP is acceptable to the caller, this should be set * if at all possible * @@ -5315,7 +5318,7 @@ struct sk_buff *ieee80211_pspoll_get(struct ieee80211_hw *hw, */ struct sk_buff *ieee80211_nullfunc_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif, - bool qos_ok); + int link_id, bool qos_ok); /** * ieee80211_probereq_get - retrieve a Probe Request template -- cgit v1.2.3 From 65fd846cb3f94ae63134fbd0f32564cf82539eaa Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 2 Sep 2022 16:12:52 +0200 Subject: wifi: mac80211: add vif/sta link RCU dereference macros Add macros (and an exported function) to allow checking some link RCU protected accesses that are happening in callbacks from mac80211 and are thus under the correct lock. Signed-off-by: Johannes Berg --- include/net/mac80211.h | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index bfa6a1625c5c..d9e7f62cc972 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -1901,6 +1902,19 @@ struct ieee80211_vif *wdev_to_ieee80211_vif(struct wireless_dev *wdev); */ struct wireless_dev *ieee80211_vif_to_wdev(struct ieee80211_vif *vif); +/** + * lockdep_vif_mutex_held - for lockdep checks on link poiners + * @vif: the interface to check + */ +static inline bool lockdep_vif_mutex_held(struct ieee80211_vif *vif) +{ + return lockdep_is_held(&ieee80211_vif_to_wdev(vif)->mtx); +} + +#define link_conf_dereference_protected(vif, link_id) \ + rcu_dereference_protected((vif)->link_conf[link_id], \ + lockdep_vif_mutex_held(vif)) + /** * enum ieee80211_key_flags - key flags * @@ -2266,13 +2280,24 @@ struct ieee80211_sta { u8 drv_priv[] __aligned(sizeof(void *)); }; -/* FIXME: check the locking correctly */ +#ifdef CONFIG_LOCKDEP +bool lockdep_sta_mutex_held(struct ieee80211_sta *pubsta); +#else +static inline bool lockdep_sta_mutex_held(struct ieee80211_sta *pubsta) +{ + return true; +} +#endif + +#define link_sta_dereference_protected(sta, link_id) \ + rcu_dereference_protected((sta)->link[link_id], \ + lockdep_sta_mutex_held(sta)) + #define for_each_sta_active_link(vif, sta, link_sta, link_id) \ for (link_id = 0; link_id < ARRAY_SIZE((sta)->link); link_id++) \ if ((!(vif)->active_links || \ (vif)->active_links & BIT(link_id)) && \ - ((link_sta) = rcu_dereference_protected((sta)->link[link_id],\ - 1))) + ((link_sta) = link_sta_dereference_protected(sta, link_id))) /** * enum sta_notify_cmd - sta notify command -- cgit v1.2.3 From 4c51541ddb78cef2da9c4c30006c0d9d06ea9a77 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Fri, 2 Sep 2022 16:12:54 +0200 Subject: wifi: mac80211: keep A-MSDU data in sta and per-link The A-MSDU data needs to be stored per-link and aggregated into a single value for the station. Add a new struct ieee_80211_sta_aggregates in order to store this data and a new function ieee80211_sta_recalc_aggregates to update the current data for the STA. Note that in the non MLO case the pointer in ieee80211_sta will directly reference the data in deflink.agg, which means that recalculation may be skipped in that case. Signed-off-by: Benjamin Berg Signed-off-by: Johannes Berg --- include/net/mac80211.h | 68 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 48 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index d9e7f62cc972..873e81a45a97 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2143,6 +2143,34 @@ struct ieee80211_sta_txpwr { enum nl80211_tx_power_setting type; }; +/** + * struct ieee80211_sta_aggregates - info that is aggregated from active links + * + * Used for any per-link data that needs to be aggregated and updated in the + * main &struct ieee80211_sta when updated or the active links change. + * + * @max_amsdu_len: indicates the maximal length of an A-MSDU in bytes. + * This field is always valid for packets with a VHT preamble. + * For packets with a HT preamble, additional limits apply: + * + * * If the skb is transmitted as part of a BA agreement, the + * A-MSDU maximal size is min(max_amsdu_len, 4065) bytes. + * * If the skb is not part of a BA agreement, the A-MSDU maximal + * size is min(max_amsdu_len, 7935) bytes. + * + * Both additional HT limits must be enforced by the low level + * driver. This is defined by the spec (IEEE 802.11-2012 section + * 8.3.2.2 NOTE 2). + * @max_rc_amsdu_len: Maximum A-MSDU size in bytes recommended by rate control. + * @max_tid_amsdu_len: Maximum A-MSDU size in bytes for this TID + */ +struct ieee80211_sta_aggregates { + u16 max_amsdu_len; + + u16 max_rc_amsdu_len; + u16 max_tid_amsdu_len[IEEE80211_NUM_TIDS]; +}; + /** * struct ieee80211_link_sta - station Link specific info * All link specific info for a STA link for a non MLD STA(single) @@ -2179,6 +2207,8 @@ struct ieee80211_link_sta { struct ieee80211_he_6ghz_capa he_6ghz_capa; struct ieee80211_sta_eht_cap eht_cap; + struct ieee80211_sta_aggregates agg; + u8 rx_nss; enum ieee80211_sta_rx_bandwidth bandwidth; struct ieee80211_sta_txpwr txpwr; @@ -2218,9 +2248,10 @@ struct ieee80211_link_sta { * @max_amsdu_subframes: indicates the maximal number of MSDUs in a single * A-MSDU. Taken from the Extended Capabilities element. 0 means * unlimited. + * @cur: currently valid data as aggregated from the active links + * For non MLO STA it will point to the deflink data. For MLO STA + * ieee80211_sta_recalc_aggregates() must be called to update it. * @support_p2p_ps: indicates whether the STA supports P2P PS mechanism or not. - * @max_rc_amsdu_len: Maximum A-MSDU size in bytes recommended by rate control. - * @max_tid_amsdu_len: Maximum A-MSDU size in bytes for this TID * @txq: per-TID data TX queues (if driver uses the TXQ abstraction); note that * the last entry (%IEEE80211_NUM_TIDS) is used for non-data frames * @deflink: This holds the default link STA information, for non MLO STA all link @@ -2250,25 +2281,9 @@ struct ieee80211_sta { bool mlo; u8 max_amsdu_subframes; - /** - * @max_amsdu_len: - * indicates the maximal length of an A-MSDU in bytes. - * This field is always valid for packets with a VHT preamble. - * For packets with a HT preamble, additional limits apply: - * - * * If the skb is transmitted as part of a BA agreement, the - * A-MSDU maximal size is min(max_amsdu_len, 4065) bytes. - * * If the skb is not part of a BA agreement, the A-MSDU maximal - * size is min(max_amsdu_len, 7935) bytes. - * - * Both additional HT limits must be enforced by the low level - * driver. This is defined by the spec (IEEE 802.11-2012 section - * 8.3.2.2 NOTE 2). - */ - u16 max_amsdu_len; + struct ieee80211_sta_aggregates *cur; + bool support_p2p_ps; - u16 max_rc_amsdu_len; - u16 max_tid_amsdu_len[IEEE80211_NUM_TIDS]; struct ieee80211_txq *txq[IEEE80211_NUM_TIDS + 1]; @@ -6105,6 +6120,19 @@ void ieee80211_sta_eosp(struct ieee80211_sta *pubsta); */ void ieee80211_send_eosp_nullfunc(struct ieee80211_sta *pubsta, int tid); +/** + * ieee80211_sta_recalc_aggregates - recalculate aggregate data after a change + * @pubsta: the station + * + * Call this function after changing a per-link aggregate data as referenced in + * &struct ieee80211_sta_aggregates by accessing the agg field of + * &struct ieee80211_link_sta. + * + * With non MLO the data in deflink will be referenced directly. In that case + * there is no need to call this function. + */ +void ieee80211_sta_recalc_aggregates(struct ieee80211_sta *pubsta); + /** * ieee80211_sta_register_airtime - register airtime usage for a sta/tid * -- cgit v1.2.3 From 3d901102922723eedce6ef10ebd03315a7abb8a5 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 2 Sep 2022 16:12:56 +0200 Subject: wifi: mac80211: implement link switching Implement an API function and debugfs file to switch active links. Also provide an async version of the API so drivers can call it in arbitrary contexts, e.g. while in the authorized callback. Signed-off-by: Johannes Berg --- include/net/mac80211.h | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 873e81a45a97..ac2bad57933f 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -7184,4 +7184,45 @@ static inline bool ieee80211_is_tx_data(struct sk_buff *skb) ieee80211_is_data(hdr->frame_control); } +/** + * ieee80211_set_active_links - set active links in client mode + * @vif: interface to set active links on + * @active_links: the new active links bitmap + * + * This changes the active links on an interface. The interface + * must be in client mode (in AP mode, all links are always active), + * and @active_links must be a subset of the vif's valid_links. + * + * If a link is switched off and another is switched on at the same + * time (e.g. active_links going from 0x1 to 0x10) then you will get + * a sequence of calls like + * - change_vif_links(0x11) + * - unassign_vif_chanctx(link_id=0) + * - change_sta_links(0x11) for each affected STA (the AP) + * (TDLS connections on now inactive links should be torn down) + * - remove group keys on the old link (link_id 0) + * - add new group keys (GTK/IGTK/BIGTK) on the new link (link_id 4) + * - change_sta_links(0x10) for each affected STA (the AP) + * - assign_vif_chanctx(link_id=4) + * - change_vif_links(0x10) + * + * Note: This function acquires some mac80211 locks and must not + * be called with any driver locks held that could cause a + * lock dependency inversion. Best call it without locks. + */ +int ieee80211_set_active_links(struct ieee80211_vif *vif, u16 active_links); + +/** + * ieee80211_set_active_links_async - asynchronously set active links + * @vif: interface to set active links on + * @active_links: the new active links bitmap + * + * See ieee80211_set_active_links() for more information, the only + * difference here is that the link change is triggered async and + * can be called in any context, but the link switch will only be + * completed after it returns. + */ +void ieee80211_set_active_links_async(struct ieee80211_vif *vif, + u16 active_links); + #endif /* MAC80211_H */ -- cgit v1.2.3 From b338d91703fae6f6afd67f3f75caa3b8f36ddef3 Mon Sep 17 00:00:00 2001 From: Brian Gix Date: Thu, 1 Sep 2022 12:19:13 -0700 Subject: Bluetooth: Implement support for Mesh The patch adds state bits, storage and HCI command chains for sending and receiving Bluetooth Mesh advertising packets, and delivery to requesting user space processes. It specifically creates 4 new MGMT commands and 2 new MGMT events: MGMT_OP_SET_MESH_RECEIVER - Sets passive scan parameters and a list of AD Types which will trigger Mesh Packet Received events MGMT_OP_MESH_READ_FEATURES - Returns information on how many outbound Mesh packets can be simultaneously queued, and what the currently queued handles are. MGMT_OP_MESH_SEND - Command to queue a specific outbound Mesh packet, with the number of times it should be sent, and the BD Addr to use. Discrete advertisments are added to the ADV Instance list. MGMT_OP_MESH_SEND_CANCEL - Command to cancel a prior outbound message request. MGMT_EV_MESH_DEVICE_FOUND - Event to deliver entire received Mesh Advertisement packet, along with timing information. MGMT_EV_MESH_PACKET_CMPLT - Event to indicate that an outbound packet is no longer queued for delivery. Signed-off-by: Brian Gix Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/bluetooth.h | 1 + include/net/bluetooth/hci.h | 3 +++ include/net/bluetooth/hci_core.h | 16 ++++++++++-- include/net/bluetooth/mgmt.h | 52 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 70 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index e72f3b247b5e..bcc5a4cd2c17 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -627,6 +627,7 @@ static inline bool iso_enabled(void) int mgmt_init(void); void mgmt_exit(void); +void mgmt_cleanup(struct sock *sk); void bt_sock_reclassify_lock(struct sock *sk, int proto); diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index cf29511b25a8..b3ade687531f 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -354,6 +354,9 @@ enum { HCI_LE_SIMULTANEOUS_ROLES, HCI_CMD_DRAIN_WORKQUEUE, + HCI_MESH, + HCI_MESH_SENDING, + __HCI_NUM_FLAGS, }; diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 932153e68864..c54bc71254af 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -238,6 +238,7 @@ struct adv_info { bool enabled; bool pending; bool periodic; + __u8 mesh; __u8 instance; __u32 flags; __u16 timeout; @@ -372,6 +373,8 @@ struct hci_dev { __u8 le_resolv_list_size; __u8 le_num_of_adv_sets; __u8 le_states[8]; + __u8 mesh_ad_types[16]; + __u8 mesh_send_ref; __u8 commands[64]; __u8 hci_ver; __u16 hci_rev; @@ -511,6 +514,7 @@ struct hci_dev { struct list_head cmd_sync_work_list; struct mutex cmd_sync_work_lock; struct work_struct cmd_sync_cancel_work; + struct work_struct reenable_adv_work; __u16 discov_timeout; struct delayed_work discov_off; @@ -561,6 +565,7 @@ struct hci_dev { struct hci_conn_hash conn_hash; + struct list_head mesh_pending; struct list_head mgmt_pending; struct list_head reject_list; struct list_head accept_list; @@ -614,6 +619,8 @@ struct hci_dev { struct delayed_work rpa_expired; bdaddr_t rpa; + struct delayed_work mesh_send_done; + enum { INTERLEAVE_SCAN_NONE, INTERLEAVE_SCAN_NO_FILTER, @@ -1576,7 +1583,8 @@ struct adv_info *hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags, u16 adv_data_len, u8 *adv_data, u16 scan_rsp_len, u8 *scan_rsp_data, u16 timeout, u16 duration, s8 tx_power, - u32 min_interval, u32 max_interval); + u32 min_interval, u32 max_interval, + u8 mesh_handle); struct adv_info *hci_add_per_instance(struct hci_dev *hdev, u8 instance, u32 flags, u8 data_len, u8 *data, u32 min_interval, u32 max_interval); @@ -1997,6 +2005,9 @@ void hci_mgmt_chan_unregister(struct hci_mgmt_chan *c); #define DISCOV_LE_FAST_ADV_INT_MAX 0x00F0 /* 150 msec */ #define DISCOV_LE_PER_ADV_INT_MIN 0x00A0 /* 200 msec */ #define DISCOV_LE_PER_ADV_INT_MAX 0x00A0 /* 200 msec */ +#define DISCOV_LE_ADV_MESH_MIN 0x00A0 /* 100 msec */ +#define DISCOV_LE_ADV_MESH_MAX 0x00A0 /* 100 msec */ +#define INTERVAL_TO_MS(x) (((x) * 10) / 0x10) #define NAME_RESOLVE_DURATION msecs_to_jiffies(10240) /* 10.24 sec */ @@ -2048,7 +2059,8 @@ void mgmt_start_discovery_complete(struct hci_dev *hdev, u8 status); void mgmt_stop_discovery_complete(struct hci_dev *hdev, u8 status); void mgmt_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, u8 *dev_class, s8 rssi, u32 flags, - u8 *eir, u16 eir_len, u8 *scan_rsp, u8 scan_rsp_len); + u8 *eir, u16 eir_len, u8 *scan_rsp, u8 scan_rsp_len, + u64 instant); void mgmt_remote_name(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, s8 rssi, u8 *name, u8 name_len); void mgmt_discovering(struct hci_dev *hdev, u8 discovering); diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index 7c1ad0f6fcec..743f6f59dff8 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -837,6 +837,42 @@ struct mgmt_cp_add_adv_patterns_monitor_rssi { struct mgmt_adv_pattern patterns[]; } __packed; #define MGMT_ADD_ADV_PATTERNS_MONITOR_RSSI_SIZE 8 +#define MGMT_OP_SET_MESH_RECEIVER 0x0057 +struct mgmt_cp_set_mesh { + __u8 enable; + __le16 window; + __le16 period; + __u8 num_ad_types; + __u8 ad_types[]; +} __packed; +#define MGMT_SET_MESH_RECEIVER_SIZE 6 + +#define MGMT_OP_MESH_READ_FEATURES 0x0058 +#define MGMT_MESH_READ_FEATURES_SIZE 0 +#define MESH_HANDLES_MAX 3 +struct mgmt_rp_mesh_read_features { + __le16 index; + __u8 max_handles; + __u8 used_handles; + __u8 handles[MESH_HANDLES_MAX]; +} __packed; + +#define MGMT_OP_MESH_SEND 0x0059 +struct mgmt_cp_mesh_send { + struct mgmt_addr_info addr; + __le64 instant; + __le16 delay; + __u8 cnt; + __u8 adv_data_len; + __u8 adv_data[]; +} __packed; +#define MGMT_MESH_SEND_SIZE 19 + +#define MGMT_OP_MESH_SEND_CANCEL 0x005A +struct mgmt_cp_mesh_send_cancel { + __u8 handle; +} __packed; +#define MGMT_MESH_SEND_CANCEL_SIZE 1 #define MGMT_EV_CMD_COMPLETE 0x0001 struct mgmt_ev_cmd_complete { @@ -1120,3 +1156,19 @@ struct mgmt_ev_adv_monitor_device_lost { __le16 monitor_handle; struct mgmt_addr_info addr; } __packed; + +#define MGMT_EV_MESH_DEVICE_FOUND 0x0031 +struct mgmt_ev_mesh_device_found { + struct mgmt_addr_info addr; + __s8 rssi; + __le64 instant; + __le32 flags; + __le16 eir_len; + __u8 eir[]; +} __packed; + + +#define MGMT_EV_MESH_PACKET_CMPLT 0x0032 +struct mgmt_ev_mesh_pkt_cmplt { + __u8 handle; +} __packed; -- cgit v1.2.3 From af6bcc1921ff0b644d2d750c0e3a88623b7211f5 Mon Sep 17 00:00:00 2001 From: Brian Gix Date: Thu, 1 Sep 2022 12:19:14 -0700 Subject: Bluetooth: Add experimental wrapper for MGMT based mesh This introduces a "Mesh UUID" and an Experimental Feature bit to the hdev mask, and depending all underlying Mesh functionality on it. Signed-off-by: Brian Gix Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index b3ade687531f..e004ba04a9ae 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -354,6 +354,7 @@ enum { HCI_LE_SIMULTANEOUS_ROLES, HCI_CMD_DRAIN_WORKQUEUE, + HCI_MESH_EXPERIMENTAL, HCI_MESH, HCI_MESH_SENDING, -- cgit v1.2.3 From 720e6a435194fb5237833a4a7ec6aa60a78964a8 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 31 Aug 2022 08:26:46 -0700 Subject: bpf: Allow struct argument in trampoline based programs Allow struct argument in trampoline based programs where the struct size should be <= 16 bytes. In such cases, the argument will be put into up to 2 registers for bpf, x86_64 and arm64 architectures. To support arch-specific trampoline manipulation, add arg_flags for additional struct information about arguments in btf_func_model. Such information will be used in arch specific function arch_prepare_bpf_trampoline() to prepare argument access properly in trampoline. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20220831152646.2078089-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9c1674973e03..4d32f125f4af 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -727,10 +727,14 @@ enum bpf_cgroup_storage_type { */ #define MAX_BPF_FUNC_REG_ARGS 5 +/* The argument is a structure. */ +#define BTF_FMODEL_STRUCT_ARG BIT(0) + struct btf_func_model { u8 ret_size; u8 nr_args; u8 arg_size[MAX_BPF_FUNC_ARGS]; + u8 arg_flags[MAX_BPF_FUNC_ARGS]; }; /* Restore arguments before returning from trampoline to let original function -- cgit v1.2.3 From 27ed9353aec9de4277b3389c9f2b04beb6ab7622 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 31 Aug 2022 08:26:57 -0700 Subject: bpf: Update descriptions for helpers bpf_get_func_arg[_cnt]() Now instead of the number of arguments, the number of registers holding argument values are stored in trampoline. Update the description of bpf_get_func_arg[_cnt]() helpers. Previous programs without struct arguments should continue to work as usual. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20220831152657.2078805-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 793103b10eab..3df78c56c1bf 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5079,12 +5079,12 @@ union bpf_attr { * * long bpf_get_func_arg(void *ctx, u32 n, u64 *value) * Description - * Get **n**-th argument (zero based) of the traced function (for tracing programs) + * Get **n**-th argument register (zero based) of the traced function (for tracing programs) * returned in **value**. * * Return * 0 on success. - * **-EINVAL** if n >= arguments count of traced function. + * **-EINVAL** if n >= argument register count of traced function. * * long bpf_get_func_ret(void *ctx, u64 *value) * Description @@ -5097,10 +5097,11 @@ union bpf_attr { * * long bpf_get_func_arg_cnt(void *ctx) * Description - * Get number of arguments of the traced function (for tracing programs). + * Get number of registers of the traced function (for tracing programs) where + * function arguments are stored in these registers. * * Return - * The number of arguments of the traced function. + * The number of argument registers of the traced function. * * int bpf_get_retval(void) * Description -- cgit v1.2.3 From be376df724aa3b7abdf79390eaab60c58a92f4f0 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Sat, 27 Aug 2022 04:49:03 +0200 Subject: wifi: brcmfmac: add 43439 SDIO ids and initialization Add HW and SDIO ids for use with the muRata 1YN (Cypress CYW43439). Add the firmware mapping structures for the CYW43439 chipset. The 43439 needs some things setup similar to the 43430 chipset. Signed-off-by: Marek Vasut Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20220827024903.617294-1-marex@denx.de --- include/linux/mmc/sdio_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mmc/sdio_ids.h b/include/linux/mmc/sdio_ids.h index 53f0efa0bccf..74f9d9a6d330 100644 --- a/include/linux/mmc/sdio_ids.h +++ b/include/linux/mmc/sdio_ids.h @@ -74,6 +74,7 @@ #define SDIO_DEVICE_ID_BROADCOM_43362 0xa962 #define SDIO_DEVICE_ID_BROADCOM_43364 0xa9a4 #define SDIO_DEVICE_ID_BROADCOM_43430 0xa9a6 +#define SDIO_DEVICE_ID_BROADCOM_CYPRESS_43439 0xa9af #define SDIO_DEVICE_ID_BROADCOM_43455 0xa9bf #define SDIO_DEVICE_ID_BROADCOM_CYPRESS_43752 0xaae8 -- cgit v1.2.3 From a1be74c5384c4a47d91ab4af2ed854d3b7eaf763 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Mon, 5 Sep 2022 13:58:43 +0300 Subject: net/mlx5: Introduce ifc bits for page tracker Introduce ifc related stuff to enable using page tracker. A page tracker is a dirty page tracking object used by the device to report the tracking log. Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20220905105852.26398-2-yishaih@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 83 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 82 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 4acd5610e96b..06eab92b9fb3 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -89,6 +89,7 @@ enum { MLX5_OBJ_TYPE_VIRTIO_NET_Q = 0x000d, MLX5_OBJ_TYPE_VIRTIO_Q_COUNTERS = 0x001c, MLX5_OBJ_TYPE_MATCH_DEFINER = 0x0018, + MLX5_OBJ_TYPE_PAGE_TRACK = 0x46, MLX5_OBJ_TYPE_MKEY = 0xff01, MLX5_OBJ_TYPE_QP = 0xff02, MLX5_OBJ_TYPE_PSV = 0xff03, @@ -1733,7 +1734,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 max_geneve_tlv_options[0x8]; u8 reserved_at_568[0x3]; u8 max_geneve_tlv_option_data_len[0x5]; - u8 reserved_at_570[0x10]; + u8 reserved_at_570[0x9]; + u8 adv_virtualization[0x1]; + u8 reserved_at_57a[0x6]; u8 reserved_at_580[0xb]; u8 log_max_dci_stream_channels[0x5]; @@ -11818,4 +11821,82 @@ struct mlx5_ifc_load_vhca_state_out_bits { u8 reserved_at_40[0x40]; }; +struct mlx5_ifc_adv_virtualization_cap_bits { + u8 reserved_at_0[0x3]; + u8 pg_track_log_max_num[0x5]; + u8 pg_track_max_num_range[0x8]; + u8 pg_track_log_min_addr_space[0x8]; + u8 pg_track_log_max_addr_space[0x8]; + + u8 reserved_at_20[0x3]; + u8 pg_track_log_min_msg_size[0x5]; + u8 reserved_at_28[0x3]; + u8 pg_track_log_max_msg_size[0x5]; + u8 reserved_at_30[0x3]; + u8 pg_track_log_min_page_size[0x5]; + u8 reserved_at_38[0x3]; + u8 pg_track_log_max_page_size[0x5]; + + u8 reserved_at_40[0x7c0]; +}; + +struct mlx5_ifc_page_track_report_entry_bits { + u8 dirty_address_high[0x20]; + + u8 dirty_address_low[0x20]; +}; + +enum { + MLX5_PAGE_TRACK_STATE_TRACKING, + MLX5_PAGE_TRACK_STATE_REPORTING, + MLX5_PAGE_TRACK_STATE_ERROR, +}; + +struct mlx5_ifc_page_track_range_bits { + u8 start_address[0x40]; + + u8 length[0x40]; +}; + +struct mlx5_ifc_page_track_bits { + u8 modify_field_select[0x40]; + + u8 reserved_at_40[0x10]; + u8 vhca_id[0x10]; + + u8 reserved_at_60[0x20]; + + u8 state[0x4]; + u8 track_type[0x4]; + u8 log_addr_space_size[0x8]; + u8 reserved_at_90[0x3]; + u8 log_page_size[0x5]; + u8 reserved_at_98[0x3]; + u8 log_msg_size[0x5]; + + u8 reserved_at_a0[0x8]; + u8 reporting_qpn[0x18]; + + u8 reserved_at_c0[0x18]; + u8 num_ranges[0x8]; + + u8 reserved_at_e0[0x20]; + + u8 range_start_address[0x40]; + + u8 length[0x40]; + + struct mlx5_ifc_page_track_range_bits track_range[0]; +}; + +struct mlx5_ifc_create_page_track_obj_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits general_obj_in_cmd_hdr; + struct mlx5_ifc_page_track_bits obj_context; +}; + +struct mlx5_ifc_modify_page_track_obj_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits general_obj_in_cmd_hdr; + struct mlx5_ifc_page_track_bits obj_context; +}; + #endif /* MLX5_IFC_H */ -- cgit v1.2.3 From 939838632b9119614128028eaea3b1d7bf29f16f Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Mon, 5 Sep 2022 13:58:44 +0300 Subject: net/mlx5: Query ADV_VIRTUALIZATION capabilities Query ADV_VIRTUALIZATION capabilities which provide information for advanced virtualization related features. Current capabilities refer to the page tracker object which is used for tracking the pages that are dirtied by the device. Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20220905105852.26398-3-yishaih@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/device.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index b5f58fd37a0f..5b41b9fb3d48 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1200,6 +1200,7 @@ enum mlx5_cap_type { MLX5_CAP_DEV_SHAMPO = 0x1d, MLX5_CAP_GENERAL_2 = 0x20, MLX5_CAP_PORT_SELECTION = 0x25, + MLX5_CAP_ADV_VIRTUALIZATION = 0x26, /* NUM OF CAP Types */ MLX5_CAP_NUM }; @@ -1365,6 +1366,14 @@ enum mlx5_qcam_feature_groups { MLX5_GET(port_selection_cap, \ mdev->caps.hca[MLX5_CAP_PORT_SELECTION]->max, cap) +#define MLX5_CAP_ADV_VIRTUALIZATION(mdev, cap) \ + MLX5_GET(adv_virtualization_cap, \ + mdev->caps.hca[MLX5_CAP_ADV_VIRTUALIZATION]->cur, cap) + +#define MLX5_CAP_ADV_VIRTUALIZATION_MAX(mdev, cap) \ + MLX5_GET(adv_virtualization_cap, \ + mdev->caps.hca[MLX5_CAP_ADV_VIRTUALIZATION]->max, cap) + #define MLX5_CAP_FLOWTABLE_PORT_SELECTION(mdev, cap) \ MLX5_CAP_PORT_SELECTION(mdev, flow_table_properties_port_selection.cap) -- cgit v1.2.3 From 08724ef69907214ce622344fe4945412e38368f0 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 5 Sep 2022 12:09:36 +0200 Subject: netlink: introduce NLA_POLICY_MAX_BE netlink allows to specify allowed ranges for integer types. Unfortunately, nfnetlink passes integers in big endian, so the existing NLA_POLICY_MAX() cannot be used. At the moment, nfnetlink users, such as nf_tables, need to resort to programmatic checking via helpers such as nft_parse_u32_check(). This is both cumbersome and error prone. This adds NLA_POLICY_MAX_BE which adds range check support for BE16, BE32 and BE64 integers. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/net/netlink.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/net/netlink.h b/include/net/netlink.h index e658d18afa67..4418b1981e31 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -325,6 +325,7 @@ struct nla_policy { struct netlink_range_validation_signed *range_signed; struct { s16 min, max; + u8 network_byte_order:1; }; int (*validate)(const struct nlattr *attr, struct netlink_ext_ack *extack); @@ -418,6 +419,14 @@ struct nla_policy { .type = NLA_ENSURE_INT_OR_BINARY_TYPE(tp), \ .validation_type = NLA_VALIDATE_MAX, \ .max = _max, \ + .network_byte_order = 0, \ +} + +#define NLA_POLICY_MAX_BE(tp, _max) { \ + .type = NLA_ENSURE_UINT_TYPE(tp), \ + .validation_type = NLA_VALIDATE_MAX, \ + .max = _max, \ + .network_byte_order = 1, \ } #define NLA_POLICY_MASK(tp, _mask) { \ -- cgit v1.2.3 From 0a28bfd4971fd570d1f3e4653b21415becefc92c Mon Sep 17 00:00:00 2001 From: Lior Nahmanson Date: Mon, 5 Sep 2022 22:21:13 -0700 Subject: net/macsec: Add MACsec skb_metadata_dst Tx Data path support In the current MACsec offload implementation, MACsec interfaces shares the same MAC address by default. Therefore, HW can't distinguish from which MACsec interface the traffic originated from. MACsec stack will use skb_metadata_dst to store the SCI value, which is unique per Macsec interface, skb_metadat_dst will be used by the offloading device driver to associate the SKB with the corresponding offloaded interface (SCI). Signed-off-by: Lior Nahmanson Reviewed-by: Raed Salem Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/net/dst_metadata.h | 10 ++++++++++ include/net/macsec.h | 4 ++++ 2 files changed, 14 insertions(+) (limited to 'include') diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index adab27ba1ecb..22a6924bf6da 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -4,11 +4,13 @@ #include #include +#include #include enum metadata_type { METADATA_IP_TUNNEL, METADATA_HW_PORT_MUX, + METADATA_MACSEC, }; struct hw_port_info { @@ -16,12 +18,17 @@ struct hw_port_info { u32 port_id; }; +struct macsec_info { + sci_t sci; +}; + struct metadata_dst { struct dst_entry dst; enum metadata_type type; union { struct ip_tunnel_info tun_info; struct hw_port_info port_info; + struct macsec_info macsec_info; } u; }; @@ -82,6 +89,9 @@ static inline int skb_metadata_dst_cmp(const struct sk_buff *skb_a, return memcmp(&a->u.tun_info, &b->u.tun_info, sizeof(a->u.tun_info) + a->u.tun_info.options_len); + case METADATA_MACSEC: + return memcmp(&a->u.macsec_info, &b->u.macsec_info, + sizeof(a->u.macsec_info)); default: return 1; } diff --git a/include/net/macsec.h b/include/net/macsec.h index 73780aa73644..8494953fb0de 100644 --- a/include/net/macsec.h +++ b/include/net/macsec.h @@ -19,6 +19,8 @@ typedef u64 __bitwise sci_t; typedef u32 __bitwise ssci_t; +struct metadata_dst; + typedef union salt { struct { u32 ssci; @@ -182,6 +184,7 @@ struct macsec_tx_sa { * @scb: single copy broadcast flag * @sa: array of secure associations * @stats: stats for this TXSC + * @md_dst: MACsec offload metadata dst */ struct macsec_tx_sc { bool active; @@ -192,6 +195,7 @@ struct macsec_tx_sc { bool scb; struct macsec_tx_sa __rcu *sa[MACSEC_NUM_AN]; struct pcpu_tx_sc_stats __percpu *stats; + struct metadata_dst *md_dst; }; /** -- cgit v1.2.3 From b1671253c6015841f6cabd39730fa42fb6d3d407 Mon Sep 17 00:00:00 2001 From: Lior Nahmanson Date: Mon, 5 Sep 2022 22:21:15 -0700 Subject: net/macsec: Move some code for sharing with various drivers that implements offload Move some MACsec infrastructure like defines and functions, in order to avoid code duplication for future drivers which implements MACsec offload. Signed-off-by: Lior Nahmanson Reviewed-by: Raed Salem Reviewed-by: Jiri Pirko Reviewed-by: Ben Ben-Ishay Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/net/macsec.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include') diff --git a/include/net/macsec.h b/include/net/macsec.h index 8494953fb0de..871599b11707 100644 --- a/include/net/macsec.h +++ b/include/net/macsec.h @@ -16,6 +16,20 @@ #define MACSEC_NUM_AN 4 /* 2 bits for the association number */ +#define MACSEC_SCI_LEN 8 +#define MACSEC_PORT_ES (htons(0x0001)) + +#define MACSEC_TCI_VERSION 0x80 +#define MACSEC_TCI_ES 0x40 /* end station */ +#define MACSEC_TCI_SC 0x20 /* SCI present */ +#define MACSEC_TCI_SCB 0x10 /* epon */ +#define MACSEC_TCI_E 0x08 /* encryption */ +#define MACSEC_TCI_C 0x04 /* changed text */ +#define MACSEC_AN_MASK 0x03 /* association number */ +#define MACSEC_TCI_CONFID (MACSEC_TCI_E | MACSEC_TCI_C) + +#define MACSEC_DEFAULT_ICV_LEN 16 + typedef u64 __bitwise sci_t; typedef u32 __bitwise ssci_t; @@ -292,5 +306,12 @@ struct macsec_ops { }; void macsec_pn_wrapped(struct macsec_secy *secy, struct macsec_tx_sa *tx_sa); +static inline bool macsec_send_sci(const struct macsec_secy *secy) +{ + const struct macsec_tx_sc *tx_sc = &secy->tx_sc; + + return tx_sc->send_sci || + (secy->n_rx_sc > 1 && !tx_sc->end_station && !tx_sc->scb); +} #endif /* _NET_MACSEC_H_ */ -- cgit v1.2.3 From d1b2234b7fbf94348901bed4ea8d015c8a819d02 Mon Sep 17 00:00:00 2001 From: Lior Nahmanson Date: Mon, 5 Sep 2022 22:21:16 -0700 Subject: net/mlx5: Removed esp_id from struct mlx5_flow_act esp_id is no longer in used Signed-off-by: Lior Nahmanson Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/fs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 8e73c377da2c..920cbc9524ad 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -245,7 +245,6 @@ struct mlx5_flow_act { struct mlx5_pkt_reformat *pkt_reformat; union { u32 ipsec_obj_id; - uintptr_t esp_id; }; u32 flags; struct mlx5_fs_vlan vlan[MLX5_FS_VLAN_DEPTH]; -- cgit v1.2.3 From e227ee990bf974f655ec3132b496409990f6634b Mon Sep 17 00:00:00 2001 From: Lior Nahmanson Date: Mon, 5 Sep 2022 22:21:17 -0700 Subject: net/mlx5: Generalize Flow Context for new crypto fields In order to support MACsec offload (and maybe some other crypto features in the future), generalize flow action parameters / defines to be used by crypto offlaods other than IPsec. The following changes made: ipsec_obj_id field at flow action context was changed to crypto_obj_id, intreduced a new crypto_type field where IPsec is the default zero type for backward compatibility. Action ipsec_decrypt was changed to crypto_decrypt. Action ipsec_encrypt was changed to crypto_encrypt. IPsec offload code was updated accordingly for backward compatibility. Signed-off-by: Lior Nahmanson Reviewed-by: Raed Salem Signed-off-by: Raed Salem Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/fs.h | 7 ++++--- include/linux/mlx5/mlx5_ifc.h | 12 ++++++++---- 2 files changed, 12 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 920cbc9524ad..e62d50acb6bd 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -243,9 +243,10 @@ struct mlx5_flow_act { u32 action; struct mlx5_modify_hdr *modify_hdr; struct mlx5_pkt_reformat *pkt_reformat; - union { - u32 ipsec_obj_id; - }; + struct mlx5_flow_act_crypto_params { + u8 type; + u32 obj_id; + } crypto; u32 flags; struct mlx5_fs_vlan vlan[MLX5_FS_VLAN_DEPTH]; struct ib_counters *counters; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 4acd5610e96b..5758218cb3fa 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -3310,8 +3310,8 @@ enum { MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH = 0x100, MLX5_FLOW_CONTEXT_ACTION_VLAN_POP_2 = 0x400, MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH_2 = 0x800, - MLX5_FLOW_CONTEXT_ACTION_IPSEC_DECRYPT = 0x1000, - MLX5_FLOW_CONTEXT_ACTION_IPSEC_ENCRYPT = 0x2000, + MLX5_FLOW_CONTEXT_ACTION_CRYPTO_DECRYPT = 0x1000, + MLX5_FLOW_CONTEXT_ACTION_CRYPTO_ENCRYPT = 0x2000, MLX5_FLOW_CONTEXT_ACTION_EXECUTE_ASO = 0x4000, }; @@ -3321,6 +3321,10 @@ enum { MLX5_FLOW_CONTEXT_FLOW_SOURCE_LOCAL_VPORT = 0x2, }; +enum { + MLX5_FLOW_CONTEXT_ENCRYPT_DECRYPT_TYPE_IPSEC = 0x0, +}; + struct mlx5_ifc_vlan_bits { u8 ethtype[0x10]; u8 prio[0x3]; @@ -3374,7 +3378,7 @@ struct mlx5_ifc_flow_context_bits { u8 extended_destination[0x1]; u8 reserved_at_81[0x1]; u8 flow_source[0x2]; - u8 reserved_at_84[0x4]; + u8 encrypt_decrypt_type[0x4]; u8 destination_list_size[0x18]; u8 reserved_at_a0[0x8]; @@ -3386,7 +3390,7 @@ struct mlx5_ifc_flow_context_bits { struct mlx5_ifc_vlan_bits push_vlan_2; - u8 ipsec_obj_id[0x20]; + u8 encrypt_decrypt_obj_id[0x20]; u8 reserved_at_140[0xc0]; struct mlx5_ifc_fte_match_param_bits match_value; -- cgit v1.2.3 From 8385c51ff5bceb92f7401138e16b0a617ca2ab02 Mon Sep 17 00:00:00 2001 From: Lior Nahmanson Date: Mon, 5 Sep 2022 22:21:18 -0700 Subject: net/mlx5: Introduce MACsec Connect-X offload hardware bits and structures Add MACsec offload related IFC structs, layouts and enumerations. Signed-off-by: Lior Nahmanson Reviewed-by: Raed Salem Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/device.h | 4 ++ include/linux/mlx5/mlx5_ifc.h | 99 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 101 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index b5f58fd37a0f..2927810f172b 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1198,6 +1198,7 @@ enum mlx5_cap_type { MLX5_CAP_DEV_EVENT = 0x14, MLX5_CAP_IPSEC, MLX5_CAP_DEV_SHAMPO = 0x1d, + MLX5_CAP_MACSEC = 0x1f, MLX5_CAP_GENERAL_2 = 0x20, MLX5_CAP_PORT_SELECTION = 0x25, /* NUM OF CAP Types */ @@ -1446,6 +1447,9 @@ enum mlx5_qcam_feature_groups { #define MLX5_CAP_DEV_SHAMPO(mdev, cap)\ MLX5_GET(shampo_cap, mdev->caps.hca_cur[MLX5_CAP_DEV_SHAMPO], cap) +#define MLX5_CAP_MACSEC(mdev, cap)\ + MLX5_GET(macsec_cap, (mdev)->caps.hca[MLX5_CAP_MACSEC]->cur, cap) + enum { MLX5_CMD_STAT_OK = 0x0, MLX5_CMD_STAT_INT_ERR = 0x1, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 5758218cb3fa..8decbf9a7bdd 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -82,6 +82,7 @@ enum { MLX5_GENERAL_OBJ_TYPES_CAP_SW_ICM = (1ULL << MLX5_OBJ_TYPE_SW_ICM), MLX5_GENERAL_OBJ_TYPES_CAP_GENEVE_TLV_OPT = (1ULL << 11), MLX5_GENERAL_OBJ_TYPES_CAP_VIRTIO_NET_Q = (1ULL << 13), + MLX5_GENERAL_OBJ_TYPES_CAP_MACSEC_OFFLOAD = (1ULL << 39), }; enum { @@ -449,7 +450,12 @@ struct mlx5_ifc_flow_table_prop_layout_bits { u8 reserved_at_60[0x2]; u8 reformat_insert[0x1]; u8 reformat_remove[0x1]; - u8 reserver_at_64[0x14]; + u8 macsec_encrypt[0x1]; + u8 macsec_decrypt[0x1]; + u8 reserved_at_66[0x2]; + u8 reformat_add_macsec[0x1]; + u8 reformat_remove_macsec[0x1]; + u8 reserved_at_6a[0xe]; u8 log_max_ft_num[0x8]; u8 reserved_at_80[0x10]; @@ -611,7 +617,11 @@ struct mlx5_ifc_fte_match_set_misc2_bits { u8 metadata_reg_a[0x20]; - u8 reserved_at_1a0[0x60]; + u8 reserved_at_1a0[0x8]; + + u8 macsec_syndrome[0x8]; + + u8 reserved_at_1b0[0x50]; }; struct mlx5_ifc_fte_match_set_misc3_bits { @@ -1276,6 +1286,24 @@ struct mlx5_ifc_ipsec_cap_bits { u8 reserved_at_30[0x7d0]; }; +struct mlx5_ifc_macsec_cap_bits { + u8 macsec_epn[0x1]; + u8 reserved_at_1[0x2]; + u8 macsec_crypto_esp_aes_gcm_256_encrypt[0x1]; + u8 macsec_crypto_esp_aes_gcm_128_encrypt[0x1]; + u8 macsec_crypto_esp_aes_gcm_256_decrypt[0x1]; + u8 macsec_crypto_esp_aes_gcm_128_decrypt[0x1]; + u8 reserved_at_7[0x4]; + u8 log_max_macsec_offload[0x5]; + u8 reserved_at_10[0x10]; + + u8 min_log_macsec_full_replay_window[0x8]; + u8 max_log_macsec_full_replay_window[0x8]; + u8 reserved_at_30[0x10]; + + u8 reserved_at_40[0x7c0]; +}; + enum { MLX5_WQ_TYPE_LINKED_LIST = 0x0, MLX5_WQ_TYPE_CYCLIC = 0x1, @@ -3295,6 +3323,7 @@ union mlx5_ifc_hca_cap_union_bits { struct mlx5_ifc_device_mem_cap_bits device_mem_cap; struct mlx5_ifc_virtio_emulation_cap_bits virtio_emulation_cap; struct mlx5_ifc_shampo_cap_bits shampo_cap; + struct mlx5_ifc_macsec_cap_bits macsec_cap; u8 reserved_at_0[0x8000]; }; @@ -3323,6 +3352,7 @@ enum { enum { MLX5_FLOW_CONTEXT_ENCRYPT_DECRYPT_TYPE_IPSEC = 0x0, + MLX5_FLOW_CONTEXT_ENCRYPT_DECRYPT_TYPE_MACSEC = 0x1, }; struct mlx5_ifc_vlan_bits { @@ -6320,6 +6350,8 @@ enum mlx5_reformat_ctx_type { MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL = 0x4, MLX5_REFORMAT_TYPE_INSERT_HDR = 0xf, MLX5_REFORMAT_TYPE_REMOVE_HDR = 0x10, + MLX5_REFORMAT_TYPE_ADD_MACSEC = 0x11, + MLX5_REFORMAT_TYPE_DEL_MACSEC = 0x12, }; struct mlx5_ifc_alloc_packet_reformat_context_in_bits { @@ -11475,6 +11507,7 @@ enum { MLX5_GENERAL_OBJECT_TYPES_IPSEC = 0x13, MLX5_GENERAL_OBJECT_TYPES_SAMPLER = 0x20, MLX5_GENERAL_OBJECT_TYPES_FLOW_METER_ASO = 0x24, + MLX5_GENERAL_OBJECT_TYPES_MACSEC = 0x27, }; enum { @@ -11525,6 +11558,67 @@ struct mlx5_ifc_modify_ipsec_obj_in_bits { struct mlx5_ifc_ipsec_obj_bits ipsec_object; }; +struct mlx5_ifc_macsec_aso_bits { + u8 valid[0x1]; + u8 reserved_at_1[0x1]; + u8 mode[0x2]; + u8 window_size[0x2]; + u8 soft_lifetime_arm[0x1]; + u8 hard_lifetime_arm[0x1]; + u8 remove_flow_enable[0x1]; + u8 epn_event_arm[0x1]; + u8 reserved_at_a[0x16]; + + u8 remove_flow_packet_count[0x20]; + + u8 remove_flow_soft_lifetime[0x20]; + + u8 reserved_at_60[0x80]; + + u8 mode_parameter[0x20]; + + u8 replay_protection_window[8][0x20]; +}; + +struct mlx5_ifc_macsec_offload_obj_bits { + u8 modify_field_select[0x40]; + + u8 confidentiality_en[0x1]; + u8 reserved_at_41[0x1]; + u8 esn_en[0x1]; + u8 esn_overlap[0x1]; + u8 reserved_at_44[0x2]; + u8 confidentiality_offset[0x2]; + u8 reserved_at_48[0x4]; + u8 aso_return_reg[0x4]; + u8 reserved_at_50[0x10]; + + u8 esn_msb[0x20]; + + u8 reserved_at_80[0x8]; + u8 dekn[0x18]; + + u8 reserved_at_a0[0x20]; + + u8 sci[0x40]; + + u8 reserved_at_100[0x8]; + u8 macsec_aso_access_pd[0x18]; + + u8 reserved_at_120[0x60]; + + u8 salt[3][0x20]; + + u8 reserved_at_1e0[0x20]; + + struct mlx5_ifc_macsec_aso_bits macsec_aso; +}; + +struct mlx5_ifc_create_macsec_obj_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits general_obj_in_cmd_hdr; + struct mlx5_ifc_macsec_offload_obj_bits macsec_object; +}; + struct mlx5_ifc_encryption_key_obj_bits { u8 modify_field_select[0x40]; @@ -11642,6 +11736,7 @@ enum { enum { MLX5_GENERAL_OBJECT_TYPE_ENCRYPTION_KEY_TYPE_TLS = 0x1, MLX5_GENERAL_OBJECT_TYPE_ENCRYPTION_KEY_TYPE_IPSEC = 0x2, + MLX5_GENERAL_OBJECT_TYPE_ENCRYPTION_KEY_TYPE_MACSEC = 0x4, }; struct mlx5_ifc_tls_static_params_bits { -- cgit v1.2.3 From ee534d7f81ba9cec028580f91429b3dc29b90c7f Mon Sep 17 00:00:00 2001 From: Lior Nahmanson Date: Mon, 5 Sep 2022 22:21:20 -0700 Subject: net/mlx5: Add MACsec Tx tables support to fs_core Changed EGRESS_KERNEL namespace to EGRESS_IPSEC and add new namespace for MACsec TX. This namespace should be the last namespace for transmitted packets. Signed-off-by: Lior Nahmanson Reviewed-by: Raed Salem Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/fs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index e62d50acb6bd..53d186774206 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -92,7 +92,8 @@ enum mlx5_flow_namespace_type { MLX5_FLOW_NAMESPACE_SNIFFER_RX, MLX5_FLOW_NAMESPACE_SNIFFER_TX, MLX5_FLOW_NAMESPACE_EGRESS, - MLX5_FLOW_NAMESPACE_EGRESS_KERNEL, + MLX5_FLOW_NAMESPACE_EGRESS_IPSEC, + MLX5_FLOW_NAMESPACE_EGRESS_MACSEC, MLX5_FLOW_NAMESPACE_RDMA_RX, MLX5_FLOW_NAMESPACE_RDMA_RX_KERNEL, MLX5_FLOW_NAMESPACE_RDMA_TX, -- cgit v1.2.3 From e467b283ffd50cf15b84c73eef68787e257eaed5 Mon Sep 17 00:00:00 2001 From: Lior Nahmanson Date: Mon, 5 Sep 2022 22:21:21 -0700 Subject: net/mlx5e: Add MACsec TX steering rules Tx flow steering consists of two flow tables (FTs). The first FT (crypto table) has two fixed rules: One default miss rule so non MACsec offloaded packets bypass the MACSec tables, another rule to make sure that MACsec key exchange (MKE) traffic passes unencrypted as expected (matched of ethertype). On each new MACsec offload flow, a new MACsec rule is added. This rule is matched on metadata_reg_a (which contains the id of the flow) and invokes the MACsec offload action on match. The second FT (check table) has two fixed rules: One rule for verifying that the previous offload actions were finished successfully and packet need to be transmitted. Another default rule for dropping packets that were failed in the offload actions. The MACsec FTs should be created on demand when the first MACsec rule is added and destroyed when the last MACsec rule is deleted. Signed-off-by: Lior Nahmanson Reviewed-by: Raed Salem Signed-off-by: Raed Salem Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/qp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 8bda3ba5b109..be640c749d5e 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -252,6 +252,7 @@ enum { enum { MLX5_ETH_WQE_FT_META_IPSEC = BIT(0), + MLX5_ETH_WQE_FT_META_MACSEC = BIT(1), }; struct mlx5_wqe_eth_seg { -- cgit v1.2.3 From 15d187e285b3d5f4fe0328c09566355c1e387ff6 Mon Sep 17 00:00:00 2001 From: Lior Nahmanson Date: Mon, 5 Sep 2022 22:21:24 -0700 Subject: net/mlx5: Add MACsec Rx tables support to fs_core Add new namespace for MACsec RX flows. Encrypted MACsec packets should be first decrypted and stripped from MACsec header and then continues with the kernel's steering pipeline. Signed-off-by: Lior Nahmanson Reviewed-by: Raed Salem Signed-off-by: Saeed Mahameed Signed-off-by: David S. Miller --- include/linux/mlx5/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 53d186774206..c7a91981cd5a 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -79,6 +79,7 @@ static inline void build_leftovers_ft_param(int *priority, enum mlx5_flow_namespace_type { MLX5_FLOW_NAMESPACE_BYPASS, + MLX5_FLOW_NAMESPACE_KERNEL_RX_MACSEC, MLX5_FLOW_NAMESPACE_LAG, MLX5_FLOW_NAMESPACE_OFFLOADS, MLX5_FLOW_NAMESPACE_ETHTOOL, -- cgit v1.2.3 From a0a4de4d897f5ce672e086cb6b9f91a306af6953 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 22 Aug 2022 16:41:21 +0200 Subject: netfilter: remove NFPROTO_DECNET Decnet has been removed. so no need to reserve space in arrays for it. Signed-off-by: Florian Westphal --- include/uapi/linux/netfilter.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/netfilter.h b/include/uapi/linux/netfilter.h index 53411ccc69db..5a79ccb76701 100644 --- a/include/uapi/linux/netfilter.h +++ b/include/uapi/linux/netfilter.h @@ -63,7 +63,9 @@ enum { NFPROTO_NETDEV = 5, NFPROTO_BRIDGE = 7, NFPROTO_IPV6 = 10, +#ifndef __KERNEL__ /* no longer supported by kernel */ NFPROTO_DECNET = 12, +#endif NFPROTO_NUMPROTO, }; -- cgit v1.2.3 From c92c27171040554cfda7a3fc925e9dbcb5b4a698 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 6 Sep 2022 17:20:35 +0200 Subject: netfilter: nat: move repetitive nat port reserve loop to a helper Almost all nat helpers reserve an expecation port the same way: Try the port inidcated by the peer, then move to next port if that port is already in use. We can squash this into a helper. Suggested-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal --- include/net/netfilter/nf_nat_helper.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/netfilter/nf_nat_helper.h b/include/net/netfilter/nf_nat_helper.h index efae84646353..44c421b9be85 100644 --- a/include/net/netfilter/nf_nat_helper.h +++ b/include/net/netfilter/nf_nat_helper.h @@ -38,4 +38,5 @@ bool nf_nat_mangle_udp_packet(struct sk_buff *skb, struct nf_conn *ct, * to port ct->master->saved_proto. */ void nf_nat_follow_master(struct nf_conn *ct, struct nf_conntrack_expect *this); +u16 nf_nat_exp_find_port(struct nf_conntrack_expect *exp, u16 port); #endif -- cgit v1.2.3 From 95f2f26f3cac06cfc046d2b29e60719d7848ea54 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 6 Sep 2022 17:12:58 +0200 Subject: bpf: split btf_check_subprog_arg_match in two btf_check_subprog_arg_match() was used twice in verifier.c: - when checking for the type mismatches between a (sub)prog declaration and BTF - when checking the call of a subprog to see if the provided arguments are correct and valid This is problematic when we check if the first argument of a program (pointer to ctx) is correctly accessed: To be able to ensure we access a valid memory in the ctx, the verifier assumes the pointer to context is not null. This has the side effect of marking the program accessing the entire context, even if the context is never dereferenced. For example, by checking the context access with the current code, the following eBPF program would fail with -EINVAL if the ctx is set to null from the userspace: ``` SEC("syscall") int prog(struct my_ctx *args) { return 0; } ``` In that particular case, we do not want to actually check that the memory is correct while checking for the BTF validity, but we just want to ensure that the (sub)prog definition matches the BTF we have. So split btf_check_subprog_arg_match() in two so we can actually check for the memory used when in a call, and ignore that part when not. Note that a further patch is in preparation to disentangled btf_check_func_arg_match() from these two purposes, and so right now we just add a new hack around that by adding a boolean to this function. Signed-off-by: Benjamin Tissoires Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220906151303.2780789-3-benjamin.tissoires@redhat.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4d32f125f4af..3cf161cfd396 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1947,6 +1947,8 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, struct bpf_reg_state; int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs); +int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, + struct bpf_reg_state *regs); int btf_check_kfunc_arg_match(struct bpf_verifier_env *env, const struct btf *btf, u32 func_id, struct bpf_reg_state *regs, -- cgit v1.2.3 From eb1f7f71c126c8fd50ea81af98f97c4b581ea4ae Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 6 Sep 2022 17:13:02 +0200 Subject: bpf/verifier: allow kfunc to return an allocated mem For drivers (outside of network), the incoming data is not statically defined in a struct. Most of the time the data buffer is kzalloc-ed and thus we can not rely on eBPF and BTF to explore the data. This commit allows to return an arbitrary memory, previously allocated by the driver. An interesting extra point is that the kfunc can mark the exported memory region as read only or read/write. So, when a kfunc is not returning a pointer to a struct but to a plain type, we can consider it is a valid allocated memory assuming that: - one of the arguments is either called rdonly_buf_size or rdwr_buf_size - and this argument is a const from the caller point of view We can then use this parameter as the size of the allocated memory. The memory is either read-only or read-write based on the name of the size parameter. Acked-by: Kumar Kartikeya Dwivedi Signed-off-by: Benjamin Tissoires Link: https://lore.kernel.org/r/20220906151303.2780789-7-benjamin.tissoires@redhat.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 9 ++++++++- include/linux/bpf_verifier.h | 2 ++ include/linux/btf.h | 10 ++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3cf161cfd396..79883f883ff3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1944,6 +1944,13 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, const char *func_name, struct btf_func_model *m); +struct bpf_kfunc_arg_meta { + u64 r0_size; + bool r0_rdonly; + int ref_obj_id; + u32 flags; +}; + struct bpf_reg_state; int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs); @@ -1952,7 +1959,7 @@ int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, int btf_check_kfunc_arg_match(struct bpf_verifier_env *env, const struct btf *btf, u32 func_id, struct bpf_reg_state *regs, - u32 kfunc_flags); + struct bpf_kfunc_arg_meta *meta); int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *reg); int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog, diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 1fdddbf3546b..8fbc1d05281e 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -598,6 +598,8 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, struct bpf_attach_target_info *tgt_info); void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab); +int mark_chain_precision(struct bpf_verifier_env *env, int regno); + #define BPF_BASE_TYPE_MASK GENMASK(BPF_BASE_TYPE_BITS - 1, 0) /* extract base type from bpf_{arg, return, reg}_type. */ diff --git a/include/linux/btf.h b/include/linux/btf.h index ad93c2d9cc1c..1fcc833a8690 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -441,4 +441,14 @@ static inline int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dt } #endif +static inline bool btf_type_is_struct_ptr(struct btf *btf, const struct btf_type *t) +{ + if (!btf_type_is_ptr(t)) + return false; + + t = btf_type_skip_modifiers(btf, t->type, NULL); + + return btf_type_is_struct(t); +} + #endif -- cgit v1.2.3 From 448325199f574d33824dbf9121efb03558412966 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sun, 4 Sep 2022 22:41:14 +0200 Subject: bpf: Add copy_map_value_long to copy to remote percpu memory bpf_long_memcpy is used while copying to remote percpu regions from BPF syscall and helpers, so that the copy is atomic at word size granularity. This might not be possible when you copy from map value hosting kptrs from or to percpu maps, as the alignment or size in disjoint regions may not be multiple of word size. Hence, to avoid complicating the copy loop, we only use bpf_long_memcpy when special fields are not present, otherwise use normal memcpy to copy the disjoint regions. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220904204145.3089-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 52 +++++++++++++++++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 79883f883ff3..6a73e94821c4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -280,14 +280,33 @@ static inline void check_and_init_map_value(struct bpf_map *map, void *dst) } } -/* copy everything but bpf_spin_lock and bpf_timer. There could be one of each. */ -static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) +/* memcpy that is used with 8-byte aligned pointers, power-of-8 size and + * forced to use 'long' read/writes to try to atomically copy long counters. + * Best-effort only. No barriers here, since it _will_ race with concurrent + * updates from BPF programs. Called from bpf syscall and mostly used with + * size 8 or 16 bytes, so ask compiler to inline it. + */ +static inline void bpf_long_memcpy(void *dst, const void *src, u32 size) +{ + const long *lsrc = src; + long *ldst = dst; + + size /= sizeof(long); + while (size--) + *ldst++ = *lsrc++; +} + +/* copy everything but bpf_spin_lock, bpf_timer, and kptrs. There could be one of each. */ +static inline void __copy_map_value(struct bpf_map *map, void *dst, void *src, bool long_memcpy) { u32 curr_off = 0; int i; if (likely(!map->off_arr)) { - memcpy(dst, src, map->value_size); + if (long_memcpy) + bpf_long_memcpy(dst, src, round_up(map->value_size, 8)); + else + memcpy(dst, src, map->value_size); return; } @@ -299,6 +318,17 @@ static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) } memcpy(dst + curr_off, src + curr_off, map->value_size - curr_off); } + +static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) +{ + __copy_map_value(map, dst, src, false); +} + +static inline void copy_map_value_long(struct bpf_map *map, void *dst, void *src) +{ + __copy_map_value(map, dst, src, true); +} + void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, bool lock_src); void bpf_timer_cancel_and_free(void *timer); @@ -1827,22 +1857,6 @@ int bpf_get_file_flag(int flags); int bpf_check_uarg_tail_zero(bpfptr_t uaddr, size_t expected_size, size_t actual_size); -/* memcpy that is used with 8-byte aligned pointers, power-of-8 size and - * forced to use 'long' read/writes to try to atomically copy long counters. - * Best-effort only. No barriers here, since it _will_ race with concurrent - * updates from BPF programs. Called from bpf syscall and mostly used with - * size 8 or 16 bytes, so ask compiler to inline it. - */ -static inline void bpf_long_memcpy(void *dst, const void *src, u32 size) -{ - const long *lsrc = src; - long *ldst = dst; - - size /= sizeof(long); - while (size--) - *ldst++ = *lsrc++; -} - /* verify correctness of eBPF program */ int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, bpfptr_t uattr); -- cgit v1.2.3 From cc48755808c646666436745b35629c3f0d05e165 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sun, 4 Sep 2022 22:41:16 +0200 Subject: bpf: Add zero_map_value to zero map value with special fields We need this helper to skip over special fields (bpf_spin_lock, bpf_timer, kptrs) while zeroing a map value. Use the same logic as copy_map_value but memset instead of memcpy. Currently, the code zeroing map value memory does not have to deal with special fields, hence this is a prerequisite for introducing such support. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220904204145.3089-4-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6a73e94821c4..48ae05099f36 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -329,6 +329,25 @@ static inline void copy_map_value_long(struct bpf_map *map, void *dst, void *src __copy_map_value(map, dst, src, true); } +static inline void zero_map_value(struct bpf_map *map, void *dst) +{ + u32 curr_off = 0; + int i; + + if (likely(!map->off_arr)) { + memset(dst, 0, map->value_size); + return; + } + + for (i = 0; i < map->off_arr->cnt; i++) { + u32 next_off = map->off_arr->field_off[i]; + + memset(dst + curr_off, 0, next_off - curr_off); + curr_off += map->off_arr->field_sz[i]; + } + memset(dst + curr_off, 0, map->value_size - curr_off); +} + void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, bool lock_src); void bpf_timer_cancel_and_free(void *timer); -- cgit v1.2.3 From b239da34203f49c40b5d656220c39647c3ff0b3c Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sun, 4 Sep 2022 22:41:28 +0200 Subject: bpf: Add helper macro bpf_for_each_reg_in_vstate For a lot of use cases in future patches, we will want to modify the state of registers part of some same 'group' (e.g. same ref_obj_id). It won't just be limited to releasing reference state, but setting a type flag dynamically based on certain actions, etc. Hence, we need a way to easily pass a callback to the function that iterates over all registers in current bpf_verifier_state in all frames upto (and including) the curframe. While in C++ we would be able to easily use a lambda to pass state and the callback together, sadly we aren't using C++ in the kernel. The next best thing to avoid defining a function for each case seems like statement expressions in GNU C. The kernel already uses them heavily, hence they can passed to the macro in the style of a lambda. The statement expression will then be substituted in the for loop bodies. Variables __state and __reg are set to current bpf_func_state and reg for each invocation of the expression inside the passed in verifier state. Then, convert mark_ptr_or_null_regs, clear_all_pkt_pointers, release_reference, find_good_pkt_pointers, find_equal_scalars to use bpf_for_each_reg_in_vstate. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220904204145.3089-16-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 8fbc1d05281e..b49a349cc6ae 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -348,6 +348,27 @@ struct bpf_verifier_state { iter < frame->allocated_stack / BPF_REG_SIZE; \ iter++, reg = bpf_get_spilled_reg(iter, frame)) +/* Invoke __expr over regsiters in __vst, setting __state and __reg */ +#define bpf_for_each_reg_in_vstate(__vst, __state, __reg, __expr) \ + ({ \ + struct bpf_verifier_state *___vstate = __vst; \ + int ___i, ___j; \ + for (___i = 0; ___i <= ___vstate->curframe; ___i++) { \ + struct bpf_reg_state *___regs; \ + __state = ___vstate->frame[___i]; \ + ___regs = __state->regs; \ + for (___j = 0; ___j < MAX_BPF_REG; ___j++) { \ + __reg = &___regs[___j]; \ + (void)(__expr); \ + } \ + bpf_for_each_spilled_reg(___j, __state, __reg) { \ + if (!__reg) \ + continue; \ + (void)(__expr); \ + } \ + } \ + }) + /* linked list of verifier states used to prune search */ struct bpf_verifier_state_list { struct bpf_verifier_state state; -- cgit v1.2.3 From 2a40f883781d6cbbf547ed13b0cec2f9808d839d Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 8 Sep 2022 13:57:50 -0700 Subject: Bluetooth: Fix HCIGETDEVINFO regression Recent changes breaks HCIGETDEVINFO since it changes the size of hci_dev_info. Fixes: 26afbd826ee3 ("Bluetooth: Add initial implementation of CIS connections") Reported-by: Marek Szyprowski Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_sock.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_sock.h b/include/net/bluetooth/hci_sock.h index 0520e21ab698..9949870f7d78 100644 --- a/include/net/bluetooth/hci_sock.h +++ b/include/net/bluetooth/hci_sock.h @@ -124,8 +124,6 @@ struct hci_dev_info { __u16 acl_pkts; __u16 sco_mtu; __u16 sco_pkts; - __u16 iso_mtu; - __u16 iso_pkts; struct hci_dev_stats stat; }; -- cgit v1.2.3 From bb5721f063ba63cd54cc5916881a706c21e6abc6 Mon Sep 17 00:00:00 2001 From: Colin Foster Date: Mon, 5 Sep 2022 09:21:25 -0700 Subject: mfd: ocelot: Add helper to get regmap from a resource Several ocelot-related modules are designed for MMIO / regmaps. As such, they often use a combination of devm_platform_get_and_ioremap_resource() and devm_regmap_init_mmio(). Operating in an MFD might be different, in that it could be memory mapped, or it could be SPI, I2C... In these cases a fallback to use IORESOURCE_REG instead of IORESOURCE_MEM becomes necessary. When this happens, there's redundant logic that needs to be implemented in every driver. In order to avoid this redundancy, utilize a single function that, if the MFD scenario is enabled, will perform this fallback logic. Signed-off-by: Colin Foster Reviewed-by: Vladimir Oltean Reviewed-by: Andy Shevchenko Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20220905162132.2943088-2-colin.foster@in-advantage.com --- include/linux/mfd/ocelot.h | 62 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 include/linux/mfd/ocelot.h (limited to 'include') diff --git a/include/linux/mfd/ocelot.h b/include/linux/mfd/ocelot.h new file mode 100644 index 000000000000..dd72073d2d4f --- /dev/null +++ b/include/linux/mfd/ocelot.h @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: GPL-2.0 OR MIT */ +/* Copyright 2022 Innovative Advantage Inc. */ + +#ifndef _LINUX_MFD_OCELOT_H +#define _LINUX_MFD_OCELOT_H + +#include +#include +#include +#include +#include +#include + +struct resource; + +static inline struct regmap * +ocelot_regmap_from_resource_optional(struct platform_device *pdev, + unsigned int index, + const struct regmap_config *config) +{ + struct device *dev = &pdev->dev; + struct resource *res; + void __iomem *regs; + + /* + * Don't use _get_and_ioremap_resource() here, since that will invoke + * prints of "invalid resource" which will simply add confusion. + */ + res = platform_get_resource(pdev, IORESOURCE_MEM, index); + if (res) { + regs = devm_ioremap_resource(dev, res); + if (IS_ERR(regs)) + return ERR_CAST(regs); + return devm_regmap_init_mmio(dev, regs, config); + } + + /* + * Fall back to using REG and getting the resource from the parent + * device, which is possible in an MFD configuration + */ + if (dev->parent) { + res = platform_get_resource(pdev, IORESOURCE_REG, index); + if (!res) + return NULL; + + return dev_get_regmap(dev->parent, res->name); + } + + return NULL; +} + +static inline struct regmap * +ocelot_regmap_from_resource(struct platform_device *pdev, unsigned int index, + const struct regmap_config *config) +{ + struct regmap *map; + + map = ocelot_regmap_from_resource_optional(pdev, index, config); + return map ?: ERR_PTR(-ENOENT); +} + +#endif -- cgit v1.2.3 From 39f7d0832c28a6a31abb5b7363e7b1ba0714fe18 Mon Sep 17 00:00:00 2001 From: Colin Foster Date: Mon, 5 Sep 2022 09:21:30 -0700 Subject: resource: add define macro for register address resources DEFINE_RES_ macros have been created for the commonly used resource types, but not IORESOURCE_REG. Add the macro so it can be used in a similar manner to all other resource types. Signed-off-by: Colin Foster Reviewed-by: Vladimir Oltean Reviewed-by: Andy Shevchenko Signed-off-by: Lee Jones Link: https://lore.kernel.org/r/20220905162132.2943088-7-colin.foster@in-advantage.com --- include/linux/ioport.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 616b683563a9..8a76dca9deee 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -172,6 +172,11 @@ enum { #define DEFINE_RES_MEM(_start, _size) \ DEFINE_RES_MEM_NAMED((_start), (_size), NULL) +#define DEFINE_RES_REG_NAMED(_start, _size, _name) \ + DEFINE_RES_NAMED((_start), (_size), (_name), IORESOURCE_REG) +#define DEFINE_RES_REG(_start, _size) \ + DEFINE_RES_REG_NAMED((_start), (_size), NULL) + #define DEFINE_RES_IRQ_NAMED(_irq, _name) \ DEFINE_RES_NAMED((_irq), 1, (_name), IORESOURCE_IRQ) #define DEFINE_RES_IRQ(_irq) \ -- cgit v1.2.3 From acd0a7ab6334f35c3720120d53f79eb8e9b3ac2e Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Thu, 8 Sep 2022 12:14:33 +0800 Subject: net: sched: act: move global static variable net_id to tc_action_ops Each tc action module has a corresponding net_id, so put net_id directly into the structure tc_action_ops. Signed-off-by: Zhengchao Shao Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/act_api.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/act_api.h b/include/net/act_api.h index 9cf6870b526e..61f2ceb3939e 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -111,6 +111,7 @@ struct tc_action_ops { struct list_head head; char kind[IFNAMSIZ]; enum tca_id id; /* identifier should match kind */ + unsigned int net_id; size_t size; struct module *owner; int (*act)(struct sk_buff *, const struct tc_action *, -- cgit v1.2.3 From 0a2360c59687e4caec363565c166e8a2b3e30677 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 8 Sep 2022 19:48:03 +0300 Subject: net: dsa: felix: add definitions for the stream filter counters TSN stream (802.1Qci, 802.1CB) filters are also accessed through STAT_VIEW, just like the port registers, but these counters are per stream, rather than per port. So we don't keep them in ocelot_port_update_stats(). What we can do, however, is we can create register definitions for them just like we have for the port counters, and delete the last remaining user of the SYS_CNT register + a group index (read_gix). Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/soc/mscc/ocelot.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 2a7e18ee5577..99d679235070 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -411,6 +411,10 @@ enum ocelot_reg { SYS_COUNT_DROP_GREEN_PRIO_5, SYS_COUNT_DROP_GREEN_PRIO_6, SYS_COUNT_DROP_GREEN_PRIO_7, + SYS_COUNT_SF_MATCHING_FRAMES, + SYS_COUNT_SF_NOT_PASSING_FRAMES, + SYS_COUNT_SF_NOT_PASSING_SDU, + SYS_COUNT_SF_RED_FRAMES, SYS_RESET_CFG, SYS_SR_ETYPE_CFG, SYS_VLAN_ETYPE_CFG, @@ -433,7 +437,6 @@ enum ocelot_reg { SYS_MMGT_FAST, SYS_EVENTS_DIF, SYS_EVENTS_CORE, - SYS_CNT, SYS_PTP_STATUS, SYS_PTP_TXSTAMP, SYS_PTP_NXT, -- cgit v1.2.3 From 96980ff7c2caa5baef0c684e719547a53762e82c Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 8 Sep 2022 19:48:04 +0300 Subject: net: mscc: ocelot: make access to STAT_VIEW sleepable again To support SPI-controlled switches in the future, access to SYS_STAT_CFG_STAT_VIEW needs to be done outside of any spinlock protected region, but it still needs to be serialized (by a mutex). Split the ocelot->stats_lock spinlock into a mutex that serializes indirect access to hardware registers (ocelot->stat_view_lock) and a spinlock that serializes access to the u64 ocelot->stats array. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/soc/mscc/ocelot.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 99d679235070..e85fb3b15524 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -901,12 +901,15 @@ struct ocelot { struct ocelot_psfp_list psfp; - /* Workqueue to check statistics for overflow with its lock */ - spinlock_t stats_lock; - u64 *stats; + /* Workqueue to check statistics for overflow */ struct delayed_work stats_work; struct workqueue_struct *stats_queue; + /* Lock for serializing access to the statistics array */ + spinlock_t stats_lock; + u64 *stats; + /* Lock for serializing indirect access to STAT_VIEW registers */ + struct mutex stat_view_lock; /* Lock for serializing access to the MAC table */ struct mutex mact_lock; /* Lock for serializing forwarding domain changes */ -- cgit v1.2.3 From 25027c8409b4541611d0060077607d8ca70740df Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 8 Sep 2022 19:48:05 +0300 Subject: net: dsa: felix: check the 32-bit PSFP stats against overflow The Felix PSFP counters suffer from the same problem as the ocelot ndo_get_stats64 ones - they are 32-bit, so they can easily overflow and this can easily go undetected. Add a custom hook in ocelot_check_stats_work() through which driver specific actions can be taken, and update the stats for the existing PSFP filters from that hook. Previously, vsc9959_psfp_filter_add() and vsc9959_psfp_filter_del() were serialized with respect to each other via rtnl_lock(). However, with the new entry point into &psfp->sfi_list coming from the periodic worker, we now need an explicit mutex to serialize access to these lists. We used to keep a struct felix_stream_filter_counters on stack, through which vsc9959_psfp_stats_get() - a FLOW_CLS_STATS callback - would retrieve data from vsc9959_psfp_counters_get(). We need to become smarter about that in 3 ways: - we need to keep a persistent set of counters for each stream instead of keeping them on stack - we need to promote those counters from u32 to u64, and create a procedure that properly keeps 64-bit counters. Since we clear the hardware counters anyway, and we poll every 2 seconds, a simple increment of a u64 counter with a u32 value will perfectly do the job. - FLOW_CLS_STATS also expect incremental counters, so we also need to zeroize our u64 counters every time sch_flower calls us Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/soc/mscc/ocelot.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index e85fb3b15524..bc6ca1be08f3 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -729,6 +729,7 @@ struct ocelot_ops { struct flow_stats *stats); void (*cut_through_fwd)(struct ocelot *ocelot); void (*tas_clock_adjust)(struct ocelot *ocelot); + void (*update_stats)(struct ocelot *ocelot); }; struct ocelot_vcap_policer { @@ -766,6 +767,8 @@ struct ocelot_psfp_list { struct list_head stream_list; struct list_head sfi_list; struct list_head sgi_list; + /* Serialize access to the lists */ + struct mutex lock; }; enum ocelot_sb { -- cgit v1.2.3 From 776b71e553841f9cde87c21078ce2d9772e6da7a Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 8 Sep 2022 19:48:11 +0300 Subject: net: dsa: felix: use ocelot's ndo_get_stats64 method Move the logic from the ocelot switchdev driver's ocelot_get_stats64() method to the common switch lib and reuse it for the DSA driver. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/soc/mscc/ocelot.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index bc6ca1be08f3..2f639ef88f8f 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -1043,6 +1043,8 @@ u32 ocelot_port_assigned_dsa_8021q_cpu_mask(struct ocelot *ocelot, int port); void ocelot_get_strings(struct ocelot *ocelot, int port, u32 sset, u8 *data); void ocelot_get_ethtool_stats(struct ocelot *ocelot, int port, u64 *data); int ocelot_get_sset_count(struct ocelot *ocelot, int port, int sset); +void ocelot_port_get_stats64(struct ocelot *ocelot, int port, + struct rtnl_link_stats64 *stats); int ocelot_get_ts_info(struct ocelot *ocelot, int port, struct ethtool_ts_info *info); void ocelot_set_ageing_time(struct ocelot *ocelot, unsigned int msecs); -- cgit v1.2.3 From e32036e1ae7bb71b64559313f9ea8790a0acaa51 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 8 Sep 2022 19:48:13 +0300 Subject: net: mscc: ocelot: add support for all sorts of standardized counters present in DSA DSA is integrated with the new standardized ethtool -S --groups option, but the felix driver only exports unstructured statistics. Reuse the array of 64-bit statistics collected by ocelot_check_stats_work(), but just export select values from it. Since ocelot_check_stats_work() runs periodically to avoid 32-bit overflow, and the ethtool calling context is sleepable, we update the 64-bit stats one more time, to provide up-to-date values. The locking scheme with a mutex followed by a spinlock is a bit hard to digest, so we create and use a ocelot_port_stats_run() helper with a callback that populates the ethool stats group the caller is interested in. The exported stats are: ethtool -S swp0 --groups eth-phy ethtool -S swp0 --groups eth-mac ethtool -S swp0 --groups eth-ctrl ethtool -S swp0 --groups rmon ethtool --include-statistics --show-pause swp0 Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/soc/mscc/ocelot.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 2f639ef88f8f..050e142518e6 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -1045,6 +1045,17 @@ void ocelot_get_ethtool_stats(struct ocelot *ocelot, int port, u64 *data); int ocelot_get_sset_count(struct ocelot *ocelot, int port, int sset); void ocelot_port_get_stats64(struct ocelot *ocelot, int port, struct rtnl_link_stats64 *stats); +void ocelot_port_get_pause_stats(struct ocelot *ocelot, int port, + struct ethtool_pause_stats *pause_stats); +void ocelot_port_get_rmon_stats(struct ocelot *ocelot, int port, + struct ethtool_rmon_stats *rmon_stats, + const struct ethtool_rmon_hist_range **ranges); +void ocelot_port_get_eth_ctrl_stats(struct ocelot *ocelot, int port, + struct ethtool_eth_ctrl_stats *ctrl_stats); +void ocelot_port_get_eth_mac_stats(struct ocelot *ocelot, int port, + struct ethtool_eth_mac_stats *mac_stats); +void ocelot_port_get_eth_phy_stats(struct ocelot *ocelot, int port, + struct ethtool_eth_phy_stats *phy_stats); int ocelot_get_ts_info(struct ocelot *ocelot, int port, struct ethtool_ts_info *info); void ocelot_set_ageing_time(struct ocelot *ocelot, unsigned int msecs); -- cgit v1.2.3 From be5c13f262050f90b56a4290645bf7a14055d3cd Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 8 Sep 2022 19:48:14 +0300 Subject: net: mscc: ocelot: harmonize names of SYS_COUNT_TX_AGING and OCELOT_STAT_TX_AGED The hardware counter is called C_TX_AGED, so rename SYS_COUNT_TX_AGING to SYS_COUNT_TX_AGED. This will become important since we want to minimize the way in which we declare struct ocelot_stat_layout elements, using the C preprocessor. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/soc/mscc/ocelot.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 050e142518e6..860ec592c689 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -392,7 +392,7 @@ enum ocelot_reg { SYS_COUNT_TX_GREEN_PRIO_5, SYS_COUNT_TX_GREEN_PRIO_6, SYS_COUNT_TX_GREEN_PRIO_7, - SYS_COUNT_TX_AGING, + SYS_COUNT_TX_AGED, SYS_COUNT_DROP_LOCAL, SYS_COUNT_DROP_TAIL, SYS_COUNT_DROP_YELLOW_PRIO_0, -- cgit v1.2.3 From b69cf1c675723b1417e018971faf2bcb1620ee7a Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 8 Sep 2022 19:48:15 +0300 Subject: net: mscc: ocelot: minimize definitions for stats The current definition of struct ocelot_stat_layout is long-winded (4 lines per entry, and we have hundreds of entries), so we could make an effort to use the C preprocessor and reduce the line count. Create an implicit correspondence between enum ocelot_reg, which tells us the register address (SYS_COUNT_RX_OCTETS etc) and enum ocelot_stat which allows us to index the ocelot->stats array (OCELOT_STAT_RX_OCTETS etc), and don't require us to specify both when we define what stats each switch family has. Create an OCELOT_STAT() macro that pairs only an enum ocelot_stat to an enum ocelot_reg, and an OCELOT_STAT_ETHTOOL() macro which also contains a name exported to the unstructured ethtool -S stringset API. For now, we define all counters as having the OCELOT_STAT_ETHTOOL() kind, but we will add more counters in the future which are not exported to the unstructured ethtool -S. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/soc/mscc/ocelot.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 860ec592c689..2fd8486bb7f0 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -698,6 +698,17 @@ struct ocelot_stat_layout { char name[ETH_GSTRING_LEN]; }; +/* 32-bit counter checked for wraparound by ocelot_port_update_stats() + * and copied to ocelot->stats. + */ +#define OCELOT_STAT(kind) \ + [OCELOT_STAT_ ## kind] = { .reg = SYS_COUNT_ ## kind } +/* Same as above, except also exported to ethtool -S. Standard counters should + * only be exposed to more specific interfaces rather than by their string name. + */ +#define OCELOT_STAT_ETHTOOL(kind, ethtool_name) \ + [OCELOT_STAT_ ## kind] = { .reg = SYS_COUNT_ ## kind, .name = ethtool_name } + struct ocelot_stats_region { struct list_head node; u32 base; -- cgit v1.2.3 From 4d1d157fb6a49f6720a3fb3135299e475d5bc844 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 8 Sep 2022 19:48:16 +0300 Subject: net: mscc: ocelot: share the common stat definitions between all drivers All switch families supported by the ocelot lib (ocelot, felix, seville) export the same registers so far. But for example felix also has TSN counters, while the others don't. To reduce the bloat even further, create an OCELOT_COMMON_STATS() macro which just lists all stats that are common between switches. The array elements are still replicated among all of vsc9959_stats_layout, vsc9953_stats_layout and ocelot_stats_layout. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- include/soc/mscc/ocelot.h | 95 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) (limited to 'include') diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 2fd8486bb7f0..355cfdedc43b 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -709,6 +709,101 @@ struct ocelot_stat_layout { #define OCELOT_STAT_ETHTOOL(kind, ethtool_name) \ [OCELOT_STAT_ ## kind] = { .reg = SYS_COUNT_ ## kind, .name = ethtool_name } +#define OCELOT_COMMON_STATS \ + OCELOT_STAT_ETHTOOL(RX_OCTETS, "rx_octets"), \ + OCELOT_STAT_ETHTOOL(RX_UNICAST, "rx_unicast"), \ + OCELOT_STAT_ETHTOOL(RX_MULTICAST, "rx_multicast"), \ + OCELOT_STAT_ETHTOOL(RX_BROADCAST, "rx_broadcast"), \ + OCELOT_STAT_ETHTOOL(RX_SHORTS, "rx_shorts"), \ + OCELOT_STAT_ETHTOOL(RX_FRAGMENTS, "rx_fragments"), \ + OCELOT_STAT_ETHTOOL(RX_JABBERS, "rx_jabbers"), \ + OCELOT_STAT_ETHTOOL(RX_CRC_ALIGN_ERRS, "rx_crc_align_errs"), \ + OCELOT_STAT_ETHTOOL(RX_SYM_ERRS, "rx_sym_errs"), \ + OCELOT_STAT_ETHTOOL(RX_64, "rx_frames_below_65_octets"), \ + OCELOT_STAT_ETHTOOL(RX_65_127, "rx_frames_65_to_127_octets"), \ + OCELOT_STAT_ETHTOOL(RX_128_255, "rx_frames_128_to_255_octets"), \ + OCELOT_STAT_ETHTOOL(RX_256_511, "rx_frames_256_to_511_octets"), \ + OCELOT_STAT_ETHTOOL(RX_512_1023, "rx_frames_512_to_1023_octets"), \ + OCELOT_STAT_ETHTOOL(RX_1024_1526, "rx_frames_1024_to_1526_octets"), \ + OCELOT_STAT_ETHTOOL(RX_1527_MAX, "rx_frames_over_1526_octets"), \ + OCELOT_STAT_ETHTOOL(RX_PAUSE, "rx_pause"), \ + OCELOT_STAT_ETHTOOL(RX_CONTROL, "rx_control"), \ + OCELOT_STAT_ETHTOOL(RX_LONGS, "rx_longs"), \ + OCELOT_STAT_ETHTOOL(RX_CLASSIFIED_DROPS, "rx_classified_drops"), \ + OCELOT_STAT_ETHTOOL(RX_RED_PRIO_0, "rx_red_prio_0"), \ + OCELOT_STAT_ETHTOOL(RX_RED_PRIO_1, "rx_red_prio_1"), \ + OCELOT_STAT_ETHTOOL(RX_RED_PRIO_2, "rx_red_prio_2"), \ + OCELOT_STAT_ETHTOOL(RX_RED_PRIO_3, "rx_red_prio_3"), \ + OCELOT_STAT_ETHTOOL(RX_RED_PRIO_4, "rx_red_prio_4"), \ + OCELOT_STAT_ETHTOOL(RX_RED_PRIO_5, "rx_red_prio_5"), \ + OCELOT_STAT_ETHTOOL(RX_RED_PRIO_6, "rx_red_prio_6"), \ + OCELOT_STAT_ETHTOOL(RX_RED_PRIO_7, "rx_red_prio_7"), \ + OCELOT_STAT_ETHTOOL(RX_YELLOW_PRIO_0, "rx_yellow_prio_0"), \ + OCELOT_STAT_ETHTOOL(RX_YELLOW_PRIO_1, "rx_yellow_prio_1"), \ + OCELOT_STAT_ETHTOOL(RX_YELLOW_PRIO_2, "rx_yellow_prio_2"), \ + OCELOT_STAT_ETHTOOL(RX_YELLOW_PRIO_3, "rx_yellow_prio_3"), \ + OCELOT_STAT_ETHTOOL(RX_YELLOW_PRIO_4, "rx_yellow_prio_4"), \ + OCELOT_STAT_ETHTOOL(RX_YELLOW_PRIO_5, "rx_yellow_prio_5"), \ + OCELOT_STAT_ETHTOOL(RX_YELLOW_PRIO_6, "rx_yellow_prio_6"), \ + OCELOT_STAT_ETHTOOL(RX_YELLOW_PRIO_7, "rx_yellow_prio_7"), \ + OCELOT_STAT_ETHTOOL(RX_GREEN_PRIO_0, "rx_green_prio_0"), \ + OCELOT_STAT_ETHTOOL(RX_GREEN_PRIO_1, "rx_green_prio_1"), \ + OCELOT_STAT_ETHTOOL(RX_GREEN_PRIO_2, "rx_green_prio_2"), \ + OCELOT_STAT_ETHTOOL(RX_GREEN_PRIO_3, "rx_green_prio_3"), \ + OCELOT_STAT_ETHTOOL(RX_GREEN_PRIO_4, "rx_green_prio_4"), \ + OCELOT_STAT_ETHTOOL(RX_GREEN_PRIO_5, "rx_green_prio_5"), \ + OCELOT_STAT_ETHTOOL(RX_GREEN_PRIO_6, "rx_green_prio_6"), \ + OCELOT_STAT_ETHTOOL(RX_GREEN_PRIO_7, "rx_green_prio_7"), \ + OCELOT_STAT_ETHTOOL(TX_OCTETS, "tx_octets"), \ + OCELOT_STAT_ETHTOOL(TX_UNICAST, "tx_unicast"), \ + OCELOT_STAT_ETHTOOL(TX_MULTICAST, "tx_multicast"), \ + OCELOT_STAT_ETHTOOL(TX_BROADCAST, "tx_broadcast"), \ + OCELOT_STAT_ETHTOOL(TX_COLLISION, "tx_collision"), \ + OCELOT_STAT_ETHTOOL(TX_DROPS, "tx_drops"), \ + OCELOT_STAT_ETHTOOL(TX_PAUSE, "tx_pause"), \ + OCELOT_STAT_ETHTOOL(TX_64, "tx_frames_below_65_octets"), \ + OCELOT_STAT_ETHTOOL(TX_65_127, "tx_frames_65_to_127_octets"), \ + OCELOT_STAT_ETHTOOL(TX_128_255, "tx_frames_128_255_octets"), \ + OCELOT_STAT_ETHTOOL(TX_256_511, "tx_frames_256_511_octets"), \ + OCELOT_STAT_ETHTOOL(TX_512_1023, "tx_frames_512_1023_octets"), \ + OCELOT_STAT_ETHTOOL(TX_1024_1526, "tx_frames_1024_1526_octets"), \ + OCELOT_STAT_ETHTOOL(TX_1527_MAX, "tx_frames_over_1526_octets"), \ + OCELOT_STAT_ETHTOOL(TX_YELLOW_PRIO_0, "tx_yellow_prio_0"), \ + OCELOT_STAT_ETHTOOL(TX_YELLOW_PRIO_1, "tx_yellow_prio_1"), \ + OCELOT_STAT_ETHTOOL(TX_YELLOW_PRIO_2, "tx_yellow_prio_2"), \ + OCELOT_STAT_ETHTOOL(TX_YELLOW_PRIO_3, "tx_yellow_prio_3"), \ + OCELOT_STAT_ETHTOOL(TX_YELLOW_PRIO_4, "tx_yellow_prio_4"), \ + OCELOT_STAT_ETHTOOL(TX_YELLOW_PRIO_5, "tx_yellow_prio_5"), \ + OCELOT_STAT_ETHTOOL(TX_YELLOW_PRIO_6, "tx_yellow_prio_6"), \ + OCELOT_STAT_ETHTOOL(TX_YELLOW_PRIO_7, "tx_yellow_prio_7"), \ + OCELOT_STAT_ETHTOOL(TX_GREEN_PRIO_0, "tx_green_prio_0"), \ + OCELOT_STAT_ETHTOOL(TX_GREEN_PRIO_1, "tx_green_prio_1"), \ + OCELOT_STAT_ETHTOOL(TX_GREEN_PRIO_2, "tx_green_prio_2"), \ + OCELOT_STAT_ETHTOOL(TX_GREEN_PRIO_3, "tx_green_prio_3"), \ + OCELOT_STAT_ETHTOOL(TX_GREEN_PRIO_4, "tx_green_prio_4"), \ + OCELOT_STAT_ETHTOOL(TX_GREEN_PRIO_5, "tx_green_prio_5"), \ + OCELOT_STAT_ETHTOOL(TX_GREEN_PRIO_6, "tx_green_prio_6"), \ + OCELOT_STAT_ETHTOOL(TX_GREEN_PRIO_7, "tx_green_prio_7"), \ + OCELOT_STAT_ETHTOOL(TX_AGED, "tx_aged"), \ + OCELOT_STAT_ETHTOOL(DROP_LOCAL, "drop_local"), \ + OCELOT_STAT_ETHTOOL(DROP_TAIL, "drop_tail"), \ + OCELOT_STAT_ETHTOOL(DROP_YELLOW_PRIO_0, "drop_yellow_prio_0"), \ + OCELOT_STAT_ETHTOOL(DROP_YELLOW_PRIO_1, "drop_yellow_prio_1"), \ + OCELOT_STAT_ETHTOOL(DROP_YELLOW_PRIO_2, "drop_yellow_prio_2"), \ + OCELOT_STAT_ETHTOOL(DROP_YELLOW_PRIO_3, "drop_yellow_prio_3"), \ + OCELOT_STAT_ETHTOOL(DROP_YELLOW_PRIO_4, "drop_yellow_prio_4"), \ + OCELOT_STAT_ETHTOOL(DROP_YELLOW_PRIO_5, "drop_yellow_prio_5"), \ + OCELOT_STAT_ETHTOOL(DROP_YELLOW_PRIO_6, "drop_yellow_prio_6"), \ + OCELOT_STAT_ETHTOOL(DROP_YELLOW_PRIO_7, "drop_yellow_prio_7"), \ + OCELOT_STAT_ETHTOOL(DROP_GREEN_PRIO_0, "drop_green_prio_0"), \ + OCELOT_STAT_ETHTOOL(DROP_GREEN_PRIO_1, "drop_green_prio_1"), \ + OCELOT_STAT_ETHTOOL(DROP_GREEN_PRIO_2, "drop_green_prio_2"), \ + OCELOT_STAT_ETHTOOL(DROP_GREEN_PRIO_3, "drop_green_prio_3"), \ + OCELOT_STAT_ETHTOOL(DROP_GREEN_PRIO_4, "drop_green_prio_4"), \ + OCELOT_STAT_ETHTOOL(DROP_GREEN_PRIO_5, "drop_green_prio_5"), \ + OCELOT_STAT_ETHTOOL(DROP_GREEN_PRIO_6, "drop_green_prio_6"), \ + OCELOT_STAT_ETHTOOL(DROP_GREEN_PRIO_7, "drop_green_prio_7") + struct ocelot_stats_region { struct list_head node; u32 base; -- cgit v1.2.3 From d4f7bdb2ed7bf320a8772258fdc257655d225afb Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Wed, 7 Sep 2022 10:40:37 -0600 Subject: bpf: Add stub for btf_struct_access() Add corresponding unimplemented stub for when CONFIG_BPF_SYSCALL=n Signed-off-by: Daniel Xu Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/4021398e884433b1fef57a4d28361bb9fcf1bd05.1662568410.git.dxu@dxuuu.xyz Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 48ae05099f36..54178b9e9c3a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2211,6 +2211,15 @@ static inline struct bpf_prog *bpf_prog_by_id(u32 id) return ERR_PTR(-ENOTSUPP); } +static inline int btf_struct_access(struct bpf_verifier_log *log, + const struct btf *btf, + const struct btf_type *t, int off, int size, + enum bpf_access_type atype, + u32 *next_btf_id, enum bpf_type_flag *flag) +{ + return -EACCES; +} + static inline const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { -- cgit v1.2.3 From 864b656f82ccd433d3e38149c3673d295ad64bf6 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Wed, 7 Sep 2022 10:40:40 -0600 Subject: bpf: Add support for writing to nf_conn:mark Support direct writes to nf_conn:mark from TC and XDP prog types. This is useful when applications want to store per-connection metadata. This is also particularly useful for applications that run both bpf and iptables/nftables because the latter can trivially access this metadata. One example use case would be if a bpf prog is responsible for advanced packet classification and iptables/nftables is later used for routing due to pre-existing/legacy code. Signed-off-by: Daniel Xu Link: https://lore.kernel.org/r/ebca06dea366e3e7e861c12f375a548cc4c61108.1662568410.git.dxu@dxuuu.xyz Signed-off-by: Alexei Starovoitov --- include/net/netfilter/nf_conntrack_bpf.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_bpf.h b/include/net/netfilter/nf_conntrack_bpf.h index a473b56842c5..a61a93d1c6dc 100644 --- a/include/net/netfilter/nf_conntrack_bpf.h +++ b/include/net/netfilter/nf_conntrack_bpf.h @@ -3,13 +3,22 @@ #ifndef _NF_CONNTRACK_BPF_H #define _NF_CONNTRACK_BPF_H +#include #include #include +#include #if (IS_BUILTIN(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \ (IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) extern int register_nf_conntrack_bpf(void); +extern void cleanup_nf_conntrack_bpf(void); + +extern struct mutex nf_conn_btf_access_lock; +extern int (*nfct_bsa)(struct bpf_verifier_log *log, const struct btf *btf, + const struct btf_type *t, int off, int size, + enum bpf_access_type atype, u32 *next_btf_id, + enum bpf_type_flag *flag); #else @@ -18,6 +27,20 @@ static inline int register_nf_conntrack_bpf(void) return 0; } +static inline void cleanup_nf_conntrack_bpf(void) +{ +} + +static inline int nf_conntrack_btf_struct_access(struct bpf_verifier_log *log, + const struct btf *btf, + const struct btf_type *t, int off, + int size, enum bpf_access_type atype, + u32 *next_btf_id, + enum bpf_type_flag *flag) +{ + return -EACCES; +} + #endif #endif /* _NF_CONNTRACK_BPF_H */ -- cgit v1.2.3 From 1bfe26fb082724be453e4d7fd9bb358e3ba669b2 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Thu, 8 Sep 2022 16:07:16 -0700 Subject: bpf: Add verifier support for custom callback return range Verifier logic to confirm that a callback function returns 0 or 1 was added in commit 69c087ba6225b ("bpf: Add bpf_for_each_map_elem() helper"). At the time, callback return value was only used to continue or stop iteration. In order to support callbacks with a broader return value range, such as those added in rbtree series[0] and others, add a callback_ret_range to bpf_func_state. Verifier's helpers which set in_callback_fn will also set the new field, which the verifier will later use to check return value bounds. Default to tnum_range(0, 0) instead of using tnum_unknown as a sentinel value as the latter would prevent the valid range (0, U64_MAX) being used. Previous global default tnum_range(0, 1) is explicitly set for extant callback helpers. The change to global default was made after discussion around this patch in rbtree series [1], goal here is to make it more obvious that callback_ret_range should be explicitly set. [0]: lore.kernel.org/bpf/20220830172759.4069786-1-davemarchevsky@fb.com/ [1]: lore.kernel.org/bpf/20220830172759.4069786-2-davemarchevsky@fb.com/ Signed-off-by: Dave Marchevsky Reviewed-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20220908230716.2751723-1-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index b49a349cc6ae..e197f8fb27e2 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -248,6 +248,7 @@ struct bpf_func_state { */ u32 async_entry_cnt; bool in_callback_fn; + struct tnum callback_ret_range; bool in_async_callback_fn; /* The following fields should be last. See copy_func_state() */ -- cgit v1.2.3 From 96a7457a14d9cf98cf58de1e26c03180e0f28141 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Mon, 12 Sep 2022 19:07:19 +0200 Subject: can: skb: unify skb CAN frame identification helpers Replace open coded checks for sk_buffs containing Classical CAN and CAN FD frame structures as a preparation for CAN XL support. With the added length check the unintended processing of CAN XL frames having the CANXL_XLF bit set can be suppressed even when the skb->len fits to non CAN XL frames. The CAN_RAW socket needs a rework to use these helpers. Therefore the use of these helpers is postponed to the CAN_RAW CAN XL integration. The J1939 protocol gets a check for Classical CAN frames too. Acked-by: Vincent Mailhol Signed-off-by: Oliver Hartkopp Link: https://lore.kernel.org/all/20220912170725.120748-2-socketcan@hartkopp.net Signed-off-by: Marc Kleine-Budde --- include/linux/can/skb.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/can/skb.h b/include/linux/can/skb.h index 182749e858b3..27ebfc28510f 100644 --- a/include/linux/can/skb.h +++ b/include/linux/can/skb.h @@ -97,10 +97,20 @@ static inline struct sk_buff *can_create_echo_skb(struct sk_buff *skb) return nskb; } +static inline bool can_is_can_skb(const struct sk_buff *skb) +{ + struct can_frame *cf = (struct can_frame *)skb->data; + + /* the CAN specific type of skb is identified by its data length */ + return (skb->len == CAN_MTU && cf->len <= CAN_MAX_DLEN); +} + static inline bool can_is_canfd_skb(const struct sk_buff *skb) { + struct canfd_frame *cfd = (struct canfd_frame *)skb->data; + /* the CAN specific type of skb is identified by its data length */ - return skb->len == CANFD_MTU; + return (skb->len == CANFD_MTU && cfd->len <= CANFD_MAX_DLEN); } #endif /* !_CAN_SKB_H */ -- cgit v1.2.3 From 467ef4c7b9d1c22ee64342804bf92549d765df14 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Mon, 12 Sep 2022 19:07:20 +0200 Subject: can: skb: add skb CAN frame data length helpers Add two helpers to retrieve the data length from CAN sk_buffs and prepare the length information to be a uint16 value for the CAN XL support. Acked-by: Vincent Mailhol Signed-off-by: Oliver Hartkopp Link: https://lore.kernel.org/all/20220912170725.120748-3-socketcan@hartkopp.net Signed-off-by: Marc Kleine-Budde --- include/linux/can/skb.h | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/can/skb.h b/include/linux/can/skb.h index 27ebfc28510f..ddffc2fc008c 100644 --- a/include/linux/can/skb.h +++ b/include/linux/can/skb.h @@ -20,7 +20,8 @@ void can_flush_echo_skb(struct net_device *dev); int can_put_echo_skb(struct sk_buff *skb, struct net_device *dev, unsigned int idx, unsigned int frame_len); struct sk_buff *__can_get_echo_skb(struct net_device *dev, unsigned int idx, - u8 *len_ptr, unsigned int *frame_len_ptr); + unsigned int *len_ptr, + unsigned int *frame_len_ptr); unsigned int __must_check can_get_echo_skb(struct net_device *dev, unsigned int idx, unsigned int *frame_len_ptr); @@ -113,4 +114,25 @@ static inline bool can_is_canfd_skb(const struct sk_buff *skb) return (skb->len == CANFD_MTU && cfd->len <= CANFD_MAX_DLEN); } +/* get length element value from can[fd]_frame structure */ +static inline unsigned int can_skb_get_len_val(struct sk_buff *skb) +{ + const struct canfd_frame *cfd = (struct canfd_frame *)skb->data; + + return cfd->len; +} + +/* get needed data length inside CAN frame for all frame types (RTR aware) */ +static inline unsigned int can_skb_get_data_len(struct sk_buff *skb) +{ + unsigned int len = can_skb_get_len_val(skb); + const struct can_frame *cf = (struct can_frame *)skb->data; + + /* RTR frames have an actual length of zero */ + if (can_is_can_skb(skb) && cf->can_id & CAN_RTR_FLAG) + return 0; + + return len; +} + #endif /* !_CAN_SKB_H */ -- cgit v1.2.3 From 061834624c87282c6d9d8c5395aaff4380e5e1fc Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Mon, 12 Sep 2022 19:07:21 +0200 Subject: can: set CANFD_FDF flag in all CAN FD frame structures To simplify the testing in user space all struct canfd_frame's provided by the CAN subsystem of the Linux kernel now have the CANFD_FDF flag set in canfd_frame::flags. NB: Handcrafted ETH_P_CANFD frames introduced via PF_PACKET socket might not set this bit correctly. During the check for sufficient headroom in PF_PACKET sk_buffs the uninitialized CAN sk_buff data structures are filled. In the case of a CAN FD frame the CANFD_FDF flag is set accordingly. As the CAN frame content is already zero initialized in alloc_canfd_skb() the obsolete initialization of cf->flags in the CTU CAN FD driver has been removed as it would overwrite the already set CANFD_FDF flag. Acked-by: Vincent Mailhol Signed-off-by: Oliver Hartkopp Link: https://lore.kernel.org/all/20220912170725.120748-4-socketcan@hartkopp.net Signed-off-by: Marc Kleine-Budde --- include/uapi/linux/can.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/can.h b/include/uapi/linux/can.h index 90801ada2bbe..7b23eeeb3273 100644 --- a/include/uapi/linux/can.h +++ b/include/uapi/linux/can.h @@ -141,8 +141,8 @@ struct can_frame { * When this is done the former differentiation via CAN_MTU / CANFD_MTU gets * lost. CANFD_FDF allows programmers to mark CAN FD frames in the case of * using struct canfd_frame for mixed CAN / CAN FD content (dual use). - * N.B. the Kernel APIs do NOT provide mixed CAN / CAN FD content inside of - * struct canfd_frame therefore the CANFD_FDF flag is disregarded by Linux. + * Since the introduction of CAN XL the CANFD_FDF flag is set in all CAN FD + * frame structures provided by the CAN subsystem of the Linux kernel. */ #define CANFD_BRS 0x01 /* bit rate switch (second bitrate for payload data) */ #define CANFD_ESI 0x02 /* error state indicator of the transmitting node */ -- cgit v1.2.3 From 1a3e3034c049503ec6992a4a7d573e7fff31fac4 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Mon, 12 Sep 2022 19:07:22 +0200 Subject: can: canxl: introduce CAN XL data structure This patch adds defines for data structures and length information for CAN XL (CAN with eXtended data Length) which can transfer up to 2048 byte inside a single frame. Notable changes from CAN FD: - the 11 bit arbitration field is now named 'priority' instead of 'can_id' (there are no 29 bit identifiers nor RTR frames anymore) - the data length needs a uint16 value to cover up to 2048 byte (the length element position is different to struct can[fd]_frame) - new fields (SDT, AF) and a SEC bit have been introduced - the virtual CAN interface identifier is not part if the CAN XL frame struct as this VCID value is stored in struct skbuff (analog to vlan id) Acked-by: Vincent Mailhol Signed-off-by: Oliver Hartkopp Link: https://lore.kernel.org/all/20220912170725.120748-5-socketcan@hartkopp.net Signed-off-by: Marc Kleine-Budde --- include/uapi/linux/can.h | 51 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/can.h b/include/uapi/linux/can.h index 7b23eeeb3273..dd645ea72306 100644 --- a/include/uapi/linux/can.h +++ b/include/uapi/linux/can.h @@ -48,6 +48,7 @@ #include #include +#include /* for offsetof */ /* controller area network (CAN) kernel definitions */ @@ -60,6 +61,7 @@ #define CAN_SFF_MASK 0x000007FFU /* standard frame format (SFF) */ #define CAN_EFF_MASK 0x1FFFFFFFU /* extended frame format (EFF) */ #define CAN_ERR_MASK 0x1FFFFFFFU /* omit EFF, RTR, ERR flags */ +#define CANXL_PRIO_MASK CAN_SFF_MASK /* 11 bit priority mask */ /* * Controller Area Network Identifier structure @@ -73,6 +75,7 @@ typedef __u32 canid_t; #define CAN_SFF_ID_BITS 11 #define CAN_EFF_ID_BITS 29 +#define CANXL_PRIO_BITS CAN_SFF_ID_BITS /* * Controller Area Network Error Message Frame Mask structure @@ -91,6 +94,16 @@ typedef __u32 can_err_mask_t; #define CANFD_MAX_DLC 15 #define CANFD_MAX_DLEN 64 +/* + * CAN XL payload length and DLC definitions according to ISO 11898-1 + * CAN XL DLC ranges from 0 .. 2047 => data length from 1 .. 2048 byte + */ +#define CANXL_MIN_DLC 0 +#define CANXL_MAX_DLC 2047 +#define CANXL_MAX_DLC_MASK 0x07FF +#define CANXL_MIN_DLEN 1 +#define CANXL_MAX_DLEN 2048 + /** * struct can_frame - Classical CAN frame structure (aka CAN 2.0B) * @can_id: CAN ID of the frame and CAN_*_FLAG flags, see canid_t definition @@ -166,8 +179,46 @@ struct canfd_frame { __u8 data[CANFD_MAX_DLEN] __attribute__((aligned(8))); }; +/* + * defined bits for canxl_frame.flags + * + * The canxl_frame.flags element contains two bits CANXL_XLF and CANXL_SEC + * and shares the relative position of the struct can[fd]_frame.len element. + * The CANXL_XLF bit ALWAYS needs to be set to indicate a valid CAN XL frame. + * As a side effect setting this bit intentionally breaks the length checks + * for Classical CAN and CAN FD frames. + * + * Undefined bits in canxl_frame.flags are reserved and shall be set to zero. + */ +#define CANXL_XLF 0x80 /* mandatory CAN XL frame flag (must always be set!) */ +#define CANXL_SEC 0x01 /* Simple Extended Content (security/segmentation) */ + +/** + * struct canxl_frame - CAN with e'X'tended frame 'L'ength frame structure + * @prio: 11 bit arbitration priority with zero'ed CAN_*_FLAG flags + * @flags: additional flags for CAN XL + * @sdt: SDU (service data unit) type + * @len: frame payload length in byte (CANXL_MIN_DLEN .. CANXL_MAX_DLEN) + * @af: acceptance field + * @data: CAN XL frame payload (CANXL_MIN_DLEN .. CANXL_MAX_DLEN byte) + * + * @prio shares the same position as @can_id from struct can[fd]_frame. + */ +struct canxl_frame { + canid_t prio; /* 11 bit priority for arbitration (canid_t) */ + __u8 flags; /* additional flags for CAN XL */ + __u8 sdt; /* SDU (service data unit) type */ + __u16 len; /* frame payload length in byte */ + __u32 af; /* acceptance field */ + __u8 data[CANXL_MAX_DLEN]; +}; + #define CAN_MTU (sizeof(struct can_frame)) #define CANFD_MTU (sizeof(struct canfd_frame)) +#define CANXL_MTU (sizeof(struct canxl_frame)) +#define CANXL_HDR_SIZE (offsetof(struct canxl_frame, data)) +#define CANXL_MIN_MTU (CANXL_HDR_SIZE + 64) +#define CANXL_MAX_MTU CANXL_MTU /* particular protocols of the protocol family PF_CAN */ #define CAN_RAW 1 /* RAW sockets */ -- cgit v1.2.3 From fb08cba12b52cba4366e858932307649dc5304e2 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Mon, 12 Sep 2022 19:07:23 +0200 Subject: can: canxl: update CAN infrastructure for CAN XL frames - add new ETH_P_CANXL ethernet protocol type - update skb checks for CAN XL - add alloc_canxl_skb() which now needs a data length parameter - introduce init_can_skb_reserve() to reduce code duplication Acked-by: Vincent Mailhol Signed-off-by: Oliver Hartkopp Link: https://lore.kernel.org/all/20220912170725.120748-6-socketcan@hartkopp.net Signed-off-by: Marc Kleine-Budde --- include/linux/can/skb.h | 23 ++++++++++++++++++++++- include/uapi/linux/if_ether.h | 1 + 2 files changed, 23 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/can/skb.h b/include/linux/can/skb.h index ddffc2fc008c..1abc25a8d144 100644 --- a/include/linux/can/skb.h +++ b/include/linux/can/skb.h @@ -30,6 +30,9 @@ void can_free_echo_skb(struct net_device *dev, unsigned int idx, struct sk_buff *alloc_can_skb(struct net_device *dev, struct can_frame **cf); struct sk_buff *alloc_canfd_skb(struct net_device *dev, struct canfd_frame **cfd); +struct sk_buff *alloc_canxl_skb(struct net_device *dev, + struct canxl_frame **cxl, + unsigned int data_len); struct sk_buff *alloc_can_err_skb(struct net_device *dev, struct can_frame **cf); bool can_dropped_invalid_skb(struct net_device *dev, struct sk_buff *skb); @@ -114,11 +117,29 @@ static inline bool can_is_canfd_skb(const struct sk_buff *skb) return (skb->len == CANFD_MTU && cfd->len <= CANFD_MAX_DLEN); } -/* get length element value from can[fd]_frame structure */ +static inline bool can_is_canxl_skb(const struct sk_buff *skb) +{ + const struct canxl_frame *cxl = (struct canxl_frame *)skb->data; + + if (skb->len < CANXL_HDR_SIZE + CANXL_MIN_DLEN || skb->len > CANXL_MTU) + return false; + + /* this also checks valid CAN XL data length boundaries */ + if (skb->len != CANXL_HDR_SIZE + cxl->len) + return false; + + return cxl->flags & CANXL_XLF; +} + +/* get length element value from can[|fd|xl]_frame structure */ static inline unsigned int can_skb_get_len_val(struct sk_buff *skb) { + const struct canxl_frame *cxl = (struct canxl_frame *)skb->data; const struct canfd_frame *cfd = (struct canfd_frame *)skb->data; + if (can_is_canxl_skb(skb)) + return cxl->len; + return cfd->len; } diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index d370165bc621..69e0457eb200 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -138,6 +138,7 @@ #define ETH_P_LOCALTALK 0x0009 /* Localtalk pseudo type */ #define ETH_P_CAN 0x000C /* CAN: Controller Area Network */ #define ETH_P_CANFD 0x000D /* CANFD: CAN flexible data rate*/ +#define ETH_P_CANXL 0x000E /* CANXL: eXtended frame Length */ #define ETH_P_PPPTALK 0x0010 /* Dummy type for Atalk over PPP*/ #define ETH_P_TR_802_2 0x0011 /* 802.2 frames */ #define ETH_P_MOBITEX 0x0015 /* Mobitex (kaz@cafe.net) */ -- cgit v1.2.3 From ebf87fc728502244550eaf8819fc785e2014b2ad Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Mon, 12 Sep 2022 19:07:24 +0200 Subject: can: dev: add CAN XL support to virtual CAN Make use of new can_skb_get_data_len() helper. Add support for variable CANXL MTU using the new can_is_canxl_dev_mtu(). Acked-by: Vincent Mailhol Signed-off-by: Oliver Hartkopp Link: https://lore.kernel.org/all/20220912170725.120748-7-socketcan@hartkopp.net Signed-off-by: Marc Kleine-Budde --- include/linux/can/dev.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index c3e50e537e39..58f5431a5559 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -147,6 +147,11 @@ static inline u32 can_get_static_ctrlmode(struct can_priv *priv) return priv->ctrlmode & ~priv->ctrlmode_supported; } +static inline bool can_is_canxl_dev_mtu(unsigned int mtu) +{ + return (mtu >= CANXL_MIN_MTU && mtu <= CANXL_MAX_MTU); +} + void can_setup(struct net_device *dev); struct net_device *alloc_candev_mqs(int sizeof_priv, unsigned int echo_skb_max, -- cgit v1.2.3 From 626332696d7506e8f844a564277bdba2dc78fcb5 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Mon, 12 Sep 2022 19:07:25 +0200 Subject: can: raw: add CAN XL support Enable CAN_RAW sockets to read and write CAN XL frames analogue to the CAN FD extension (new CAN_RAW_XL_FRAMES sockopt). A CAN XL network interface is capable to handle Classical CAN, CAN FD and CAN XL frames. When CAN_RAW_XL_FRAMES is enabled, the CAN_RAW socket checks whether the addressed CAN network interface is capable to handle the provided CAN frame. In opposite to the fixed number of bytes for - CAN frames (CAN_MTU = sizeof(struct can_frame)) - CAN FD frames (CANFD_MTU = sizeof(struct can_frame)) the number of bytes when reading/writing CAN XL frames depends on the number of data bytes. For efficiency reasons the length of the struct canxl_frame is truncated to the needed size for read/write operations. This leads to a calculated size of CANXL_HDR_SIZE + canxl_frame::len which is enforced on write() operations and guaranteed on read() operations. NB: Valid length values are 1 .. 2048 (CANXL_MIN_DLEN .. CANXL_MAX_DLEN). Acked-by: Vincent Mailhol Signed-off-by: Oliver Hartkopp Link: https://lore.kernel.org/all/20220912170725.120748-8-socketcan@hartkopp.net Signed-off-by: Marc Kleine-Budde --- include/uapi/linux/can/raw.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/can/raw.h b/include/uapi/linux/can/raw.h index 3386aa81fdf2..ff12f525c37c 100644 --- a/include/uapi/linux/can/raw.h +++ b/include/uapi/linux/can/raw.h @@ -62,6 +62,7 @@ enum { CAN_RAW_RECV_OWN_MSGS, /* receive my own msgs (default:off) */ CAN_RAW_FD_FRAMES, /* allow CAN FD frames (default:off) */ CAN_RAW_JOIN_FILTERS, /* all filters must match to trigger */ + CAN_RAW_XL_FRAMES, /* allow CAN XL frames (default:off) */ }; #endif /* !_UAPI_CAN_RAW_H */ -- cgit v1.2.3 From 47e34cb74d376ddfeaef94abb1d6dfb3c905ee51 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Mon, 12 Sep 2022 08:45:44 -0700 Subject: bpf: Add verifier check for BPF_PTR_POISON retval and arg BPF_PTR_POISON was added in commit c0a5a21c25f37 ("bpf: Allow storing referenced kptr in map") to denote a bpf_func_proto btf_id which the verifier will replace with a dynamically-determined btf_id at verification time. This patch adds verifier 'poison' functionality to BPF_PTR_POISON in order to prepare for expanded use of the value to poison ret- and arg-btf_id in ongoing work, namely rbtree and linked list patchsets [0, 1]. Specifically, when the verifier checks helper calls, it assumes that BPF_PTR_POISON'ed ret type will be replaced with a valid type before - or in lieu of - the default ret_btf_id logic. Similarly for arg btf_id. If poisoned btf_id reaches default handling block for either, consider this a verifier internal error and fail verification. Otherwise a helper w/ poisoned btf_id but no verifier logic replacing the type will cause a crash as the invalid pointer is dereferenced. Also move BPF_PTR_POISON to existing include/linux/posion.h header and remove unnecessary shift. [0]: lore.kernel.org/bpf/20220830172759.4069786-1-davemarchevsky@fb.com [1]: lore.kernel.org/bpf/20220904204145.3089-1-memxor@gmail.com Signed-off-by: Dave Marchevsky Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220912154544.1398199-1-davemarchevsky@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/poison.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/poison.h b/include/linux/poison.h index d62ef5a6b4e9..2d3249eb0e62 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -81,4 +81,7 @@ /********** net/core/page_pool.c **********/ #define PP_SIGNATURE (0x40 + POISON_POINTER_DELTA) +/********** kernel/bpf/ **********/ +#define BPF_PTR_POISON ((void *)(0xeB9FUL + POISON_POINTER_DELTA)) + #endif -- cgit v1.2.3 From 7e6e1b57162ed6a2d32d2f0929c27d79482ff706 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Tue, 6 Sep 2022 11:55:58 +0200 Subject: rtnetlink: advertise allmulti counter Like what was done with IFLA_PROMISCUITY, add IFLA_ALLMULTI to advertise the allmulti counter. The flag IFF_ALLMULTI is advertised only if it was directly set by a userland app. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index e36d9d2c65a7..0bfa9a99ebb6 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -370,6 +370,7 @@ enum { IFLA_GRO_MAX_SIZE, IFLA_TSO_MAX_SIZE, IFLA_TSO_MAX_SEGS, + IFLA_ALLMULTI, /* Allmulti count: > 0 means acts ALLMULTI */ __IFLA_MAX }; -- cgit v1.2.3 From 7187440dd7c45fd8b6dd4f3ff56b03ca4aa1bbd2 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 8 Sep 2022 15:20:23 +0300 Subject: iov_iter: use "maxpages" parameter This was intended to be "maxpages" instead of INT_MAX. There is only one caller and it passes INT_MAX so this does not affect runtime. Fixes: b93235e68921 ("tls: cap the output scatter list to something reasonable") Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- include/linux/uio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/uio.h b/include/linux/uio.h index 5896af36199c..2e3134b14ffd 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -298,7 +298,7 @@ iov_iter_npages_cap(struct iov_iter *i, int maxpages, size_t max_bytes) shorted = iov_iter_count(i) - max_bytes; iov_iter_truncate(i, max_bytes); } - npages = iov_iter_npages(i, INT_MAX); + npages = iov_iter_npages(i, maxpages); if (shorted) iov_iter_reexpand(i, iov_iter_count(i) + shorted); -- cgit v1.2.3 From bfeb7e399bacae4ee46ad978f5fce3e47f0978d6 Mon Sep 17 00:00:00 2001 From: Yauheni Kaliuta Date: Mon, 5 Sep 2022 12:01:49 +0300 Subject: bpf: Use bpf_capable() instead of CAP_SYS_ADMIN for blinding decision The full CAP_SYS_ADMIN requirement for blinding looks too strict nowadays. These days given unprivileged BPF is disabled by default, the main users for constant blinding coming from unprivileged in particular via cBPF -> eBPF migration (e.g. old-style socket filters). Signed-off-by: Yauheni Kaliuta Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220831090655.156434-1-ykaliuta@redhat.com Link: https://lore.kernel.org/bpf/20220905090149.61221-1-ykaliuta@redhat.com --- include/linux/filter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 527ae1d64e27..75335432fcbc 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1099,7 +1099,7 @@ static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog) return false; if (!bpf_jit_harden) return false; - if (bpf_jit_harden == 1 && capable(CAP_SYS_ADMIN)) + if (bpf_jit_harden == 1 && bpf_capable()) return false; return true; -- cgit v1.2.3 From 9440155ccb948f8e3ce5308907a2e7378799be60 Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Sat, 3 Sep 2022 15:11:53 +0200 Subject: ftrace: Add HAVE_DYNAMIC_FTRACE_NO_PATCHABLE x86 will shortly start using -fpatchable-function-entry for purposes other than ftrace, make sure the __patchable_function_entry section isn't merged in the mcount_loc section. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Jiri Olsa Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20220903131154.420467-2-jolsa@kernel.org --- include/asm-generic/vmlinux.lds.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 7515a465ec03..13b197ef0d63 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -154,6 +154,14 @@ #define MEM_DISCARD(sec) *(.mem##sec) #endif +#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_NO_PATCHABLE +#define KEEP_PATCHABLE KEEP(*(__patchable_function_entries)) +#define PATCHABLE_DISCARDS +#else +#define KEEP_PATCHABLE +#define PATCHABLE_DISCARDS *(__patchable_function_entries) +#endif + #ifdef CONFIG_FTRACE_MCOUNT_RECORD /* * The ftrace call sites are logged to a section whose name depends on the @@ -172,7 +180,7 @@ #define MCOUNT_REC() . = ALIGN(8); \ __start_mcount_loc = .; \ KEEP(*(__mcount_loc)) \ - KEEP(*(__patchable_function_entries)) \ + KEEP_PATCHABLE \ __stop_mcount_loc = .; \ ftrace_stub_graph = ftrace_stub; \ ftrace_ops_list_func = arch_ftrace_ops_list_func; @@ -1024,6 +1032,7 @@ #define COMMON_DISCARDS \ SANITIZER_DISCARDS \ + PATCHABLE_DISCARDS \ *(.discard) \ *(.discard.*) \ *(.modinfo) \ -- cgit v1.2.3 From ceea991a019c57a1fb0edd12a5f836a0fa431aee Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 3 Sep 2022 15:11:54 +0200 Subject: bpf: Move bpf_dispatcher function out of ftrace locations The dispatcher function is attached/detached to trampoline by dispatcher update function. At the same time it's available as ftrace attachable function. After discussion [1] the proposed solution is to use compiler attributes to alter bpf_dispatcher_##name##_func function: - remove it from being instrumented with __no_instrument_function__ attribute, so ftrace has no track of it - but still generate 5 nop instructions with patchable_function_entry(5) attribute, which are expected by bpf_arch_text_poke used by dispatcher update function Enabling HAVE_DYNAMIC_FTRACE_NO_PATCHABLE option for x86, so __patchable_function_entries functions are not part of ftrace/mcount locations. Adding attributes to bpf_dispatcher_XXX function on x86_64 so it's kept out of ftrace locations and has 5 byte nop generated at entry. These attributes need to be arch specific as pointed out by Ilya Leoshkevic in here [2]. The dispatcher image is generated only for x86_64 arch, so the code can stay as is for other archs. [1] https://lore.kernel.org/bpf/20220722110811.124515-1-jolsa@kernel.org/ [2] https://lore.kernel.org/bpf/969a14281a7791c334d476825863ee449964dd0c.camel@linux.ibm.com/ Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Jiri Olsa Signed-off-by: Daniel Borkmann Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/bpf/20220903131154.420467-3-jolsa@kernel.org --- include/linux/bpf.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 54178b9e9c3a..e0dbe0c0a17e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -977,7 +977,14 @@ int arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs); }, \ } +#ifdef CONFIG_X86_64 +#define BPF_DISPATCHER_ATTRIBUTES __attribute__((patchable_function_entry(5))) +#else +#define BPF_DISPATCHER_ATTRIBUTES +#endif + #define DEFINE_BPF_DISPATCHER(name) \ + notrace BPF_DISPATCHER_ATTRIBUTES \ noinline __nocfi unsigned int bpf_dispatcher_##name##_func( \ const void *ctx, \ const struct bpf_insn *insnsi, \ -- cgit v1.2.3 From e63efbcaba7d6f6846b1c2ec5cf259249b9ed88f Mon Sep 17 00:00:00 2001 From: Hector Martin Date: Fri, 16 Sep 2022 17:02:46 +0100 Subject: wifi: brcmfmac: pcie: Read Apple OTP information On Apple platforms, the One Time Programmable ROM in the Broadcom chips contains information about the specific board design (module, vendor, version) that is required to select the correct NVRAM file. Parse this OTP ROM and extract the required strings. Note that the user OTP offset/size is per-chip. This patch does not add any chips yet. Reviewed-by: Arend van Spriel Signed-off-by: Hector Martin Signed-off-by: Russell King (Oracle) Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/E1oZDni-0077aM-I6@rmk-PC.armlinux.org.uk --- include/linux/bcma/bcma_driver_chipcommon.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bcma/bcma_driver_chipcommon.h b/include/linux/bcma/bcma_driver_chipcommon.h index e3314f746bfa..2d94c30ed439 100644 --- a/include/linux/bcma/bcma_driver_chipcommon.h +++ b/include/linux/bcma/bcma_driver_chipcommon.h @@ -271,6 +271,7 @@ #define BCMA_CC_SROM_CONTROL_OP_WRDIS 0x40000000 #define BCMA_CC_SROM_CONTROL_OP_WREN 0x60000000 #define BCMA_CC_SROM_CONTROL_OTPSEL 0x00000010 +#define BCMA_CC_SROM_CONTROL_OTP_PRESENT 0x00000020 #define BCMA_CC_SROM_CONTROL_LOCK 0x00000008 #define BCMA_CC_SROM_CONTROL_SIZE_MASK 0x00000006 #define BCMA_CC_SROM_CONTROL_SIZE_1K 0x00000000 -- cgit v1.2.3 From 65b32f801bfbc54dc98144a6ec26082b59d131ee Mon Sep 17 00:00:00 2001 From: Wojciech Drewek Date: Thu, 8 Sep 2022 10:16:40 -0700 Subject: uapi: move IPPROTO_L2TP to in.h IPPROTO_L2TP is currently defined in l2tp.h, but most of ip protocols are defined in in.h file. Move it there in order to keep code clean. Acked-by: Guillaume Nault Signed-off-by: Wojciech Drewek Signed-off-by: Tony Nguyen Signed-off-by: Paolo Abeni --- include/uapi/linux/in.h | 2 ++ include/uapi/linux/l2tp.h | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h index 578daa6f816b..f243ce665f74 100644 --- a/include/uapi/linux/in.h +++ b/include/uapi/linux/in.h @@ -68,6 +68,8 @@ enum { #define IPPROTO_PIM IPPROTO_PIM IPPROTO_COMP = 108, /* Compression Header Protocol */ #define IPPROTO_COMP IPPROTO_COMP + IPPROTO_L2TP = 115, /* Layer 2 Tunnelling Protocol */ +#define IPPROTO_L2TP IPPROTO_L2TP IPPROTO_SCTP = 132, /* Stream Control Transport Protocol */ #define IPPROTO_SCTP IPPROTO_SCTP IPPROTO_UDPLITE = 136, /* UDP-Lite (RFC 3828) */ diff --git a/include/uapi/linux/l2tp.h b/include/uapi/linux/l2tp.h index bab8c9708611..7d81c3e1ec29 100644 --- a/include/uapi/linux/l2tp.h +++ b/include/uapi/linux/l2tp.h @@ -13,8 +13,6 @@ #include #include -#define IPPROTO_L2TP 115 - /** * struct sockaddr_l2tpip - the sockaddr structure for L2TP-over-IP sockets * @l2tp_family: address family number AF_L2TPIP. -- cgit v1.2.3 From dda2fa08a13c688bed320ef2e4ba541abb4d6c17 Mon Sep 17 00:00:00 2001 From: Wojciech Drewek Date: Thu, 8 Sep 2022 10:16:41 -0700 Subject: flow_dissector: Add L2TPv3 dissectors Allow to dissect L2TPv3 specific field which is: - session ID (32 bits) L2TPv3 might be transported over IP or over UDP, this implementation is only about L2TPv3 over IP. IP protocol carries L2TPv3 when ip_proto is IPPROTO_L2TP (115). Acked-by: Guillaume Nault Signed-off-by: Wojciech Drewek Signed-off-by: Tony Nguyen Signed-off-by: Paolo Abeni --- include/net/flow_dissector.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 6c74812d64b2..5ccf52ef8809 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -289,6 +289,14 @@ struct flow_dissector_key_pppoe { __be16 type; }; +/** + * struct flow_dissector_key_l2tpv3: + * @session_id: identifier for a l2tp session + */ +struct flow_dissector_key_l2tpv3 { + __be32 session_id; +}; + enum flow_dissector_key_id { FLOW_DISSECTOR_KEY_CONTROL, /* struct flow_dissector_key_control */ FLOW_DISSECTOR_KEY_BASIC, /* struct flow_dissector_key_basic */ @@ -320,6 +328,7 @@ enum flow_dissector_key_id { FLOW_DISSECTOR_KEY_HASH, /* struct flow_dissector_key_hash */ FLOW_DISSECTOR_KEY_NUM_OF_VLANS, /* struct flow_dissector_key_num_of_vlans */ FLOW_DISSECTOR_KEY_PPPOE, /* struct flow_dissector_key_pppoe */ + FLOW_DISSECTOR_KEY_L2TPV3, /* struct flow_dissector_key_l2tpv3 */ FLOW_DISSECTOR_KEY_MAX, }; -- cgit v1.2.3 From 8b189ea08c334f25dbb3d076f8adb8b80491d01d Mon Sep 17 00:00:00 2001 From: Wojciech Drewek Date: Thu, 8 Sep 2022 10:16:42 -0700 Subject: net/sched: flower: Add L2TPv3 filter Add support for matching on L2TPv3 session ID. Session ID can be specified only when ip proto was set to IPPROTO_L2TP. Example filter: # tc filter add dev $PF1 ingress prio 1 protocol ip \ flower \ ip_proto l2tp \ l2tpv3_sid 1234 \ skip_sw \ action mirred egress redirect dev $VF1_PR Acked-by: Guillaume Nault Signed-off-by: Wojciech Drewek Signed-off-by: Tony Nguyen Signed-off-by: Paolo Abeni --- include/uapi/linux/pkt_cls.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 877309d6ca3c..648a82f32666 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -592,6 +592,8 @@ enum { TCA_FLOWER_KEY_PPPOE_SID, /* be16 */ TCA_FLOWER_KEY_PPP_PROTO, /* be16 */ + TCA_FLOWER_KEY_L2TPV3_SID, /* be32 */ + __TCA_FLOWER_MAX, }; -- cgit v1.2.3 From 2c1befaced504a125d1ab7479684a9208879d350 Mon Sep 17 00:00:00 2001 From: Wojciech Drewek Date: Thu, 8 Sep 2022 10:16:43 -0700 Subject: flow_offload: Introduce flow_match_l2tpv3 Allow to offload L2TPv3 filters by adding flow_rule_match_l2tpv3. Drivers can extract L2TPv3 specific fields from now on. Signed-off-by: Wojciech Drewek Signed-off-by: Tony Nguyen Signed-off-by: Paolo Abeni --- include/net/flow_offload.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 2a9a9e42e7fd..e343f9f8363e 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -80,6 +80,10 @@ struct flow_match_pppoe { struct flow_dissector_key_pppoe *key, *mask; }; +struct flow_match_l2tpv3 { + struct flow_dissector_key_l2tpv3 *key, *mask; +}; + struct flow_rule; void flow_rule_match_meta(const struct flow_rule *rule, @@ -128,6 +132,8 @@ void flow_rule_match_ct(const struct flow_rule *rule, struct flow_match_ct *out); void flow_rule_match_pppoe(const struct flow_rule *rule, struct flow_match_pppoe *out); +void flow_rule_match_l2tpv3(const struct flow_rule *rule, + struct flow_match_l2tpv3 *out); enum flow_action_id { FLOW_ACTION_ACCEPT = 0, -- cgit v1.2.3 From db01868bf2e919a10551066d54cf4bef5dd5a01e Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 11 Sep 2022 04:06:57 +0300 Subject: net: introduce iterators over synced hw addresses Some network drivers use __dev_mc_sync()/__dev_uc_sync() and therefore program the hardware only with addresses with a non-zero sync_cnt. Some of the above drivers also need to save/restore the address filtering lists when certain events happen, and they need to walk through the struct net_device :: uc and struct net_device :: mc lists. But these lists contain unsynced addresses too. To keep the appearance of an elementary form of data encapsulation, provide iterators through these lists that only look at entries with a non-zero sync_cnt, instead of filtering entries out from device drivers. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f0068c1ff1df..9f42fc871c3b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -253,11 +253,17 @@ struct netdev_hw_addr_list { #define netdev_uc_empty(dev) netdev_hw_addr_list_empty(&(dev)->uc) #define netdev_for_each_uc_addr(ha, dev) \ netdev_hw_addr_list_for_each(ha, &(dev)->uc) +#define netdev_for_each_synced_uc_addr(_ha, _dev) \ + netdev_for_each_uc_addr((_ha), (_dev)) \ + if ((_ha)->sync_cnt) #define netdev_mc_count(dev) netdev_hw_addr_list_count(&(dev)->mc) #define netdev_mc_empty(dev) netdev_hw_addr_list_empty(&(dev)->mc) #define netdev_for_each_mc_addr(ha, dev) \ netdev_hw_addr_list_for_each(ha, &(dev)->mc) +#define netdev_for_each_synced_mc_addr(_ha, _dev) \ + netdev_for_each_mc_addr((_ha), (_dev)) \ + if ((_ha)->sync_cnt) struct hh_cache { unsigned int hh_len; -- cgit v1.2.3 From 8f6a19c0316deb48cdfdc5335de9a6d7db5b7b62 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 11 Sep 2022 04:06:58 +0300 Subject: net: dsa: introduce dsa_port_get_master() There is a desire to support for DSA masters in a LAG. That configuration is intended to work by simply enslaving the master to a bonding/team device. But the physical DSA master (the LAG slave) still has a dev->dsa_ptr, and that cpu_dp still corresponds to the physical CPU port. However, we would like to be able to retrieve the LAG that's the upper of the physical DSA master. In preparation for that, introduce a helper called dsa_port_get_master() that replaces all occurrences of the dp->cpu_dp->master pattern. The distinction between LAG and non-LAG will be made later within the helper itself. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Paolo Abeni --- include/net/dsa.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index f2ce12860546..23eac1bda843 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -718,6 +718,11 @@ static inline bool dsa_port_offloads_lag(struct dsa_port *dp, return dsa_port_lag_dev_get(dp) == lag->dev; } +static inline struct net_device *dsa_port_to_master(const struct dsa_port *dp) +{ + return dp->cpu_dp->master; +} + static inline struct net_device *dsa_port_to_bridge_port(const struct dsa_port *dp) { -- cgit v1.2.3 From 95f510d0b792f308d3d748242fe960c35bdc2c62 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 11 Sep 2022 04:06:59 +0300 Subject: net: dsa: allow the DSA master to be seen and changed through rtnetlink Some DSA switches have multiple CPU ports, which can be used to improve CPU termination throughput, but DSA, through dsa_tree_setup_cpu_ports(), sets up only the first one, leading to suboptimal use of hardware. The desire is to not change the default configuration but to permit the user to create a dynamic mapping between individual user ports and the CPU port that they are served by, configurable through rtnetlink. It is also intended to permit load balancing between CPU ports, and in that case, the foreseen model is for the DSA master to be a bonding interface whose lowers are the physical DSA masters. To that end, we create a struct rtnl_link_ops for DSA user ports with the "dsa" kind. We expose the IFLA_DSA_MASTER link attribute that contains the ifindex of the newly desired DSA master. Signed-off-by: Vladimir Oltean Signed-off-by: Paolo Abeni --- include/net/dsa.h | 8 ++++++++ include/uapi/linux/if_link.h | 10 ++++++++++ 2 files changed, 18 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 23eac1bda843..3f717c3fcba0 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -559,6 +559,10 @@ static inline bool dsa_is_user_port(struct dsa_switch *ds, int p) list_for_each_entry((_dp), &(_dst)->ports, list) \ if (dsa_port_is_user((_dp))) +#define dsa_tree_for_each_user_port_continue_reverse(_dp, _dst) \ + list_for_each_entry_continue_reverse((_dp), &(_dst)->ports, list) \ + if (dsa_port_is_user((_dp))) + #define dsa_tree_for_each_cpu_port(_dp, _dst) \ list_for_each_entry((_dp), &(_dst)->ports, list) \ if (dsa_port_is_cpu((_dp))) @@ -830,6 +834,10 @@ struct dsa_switch_ops { int (*connect_tag_protocol)(struct dsa_switch *ds, enum dsa_tag_protocol proto); + int (*port_change_master)(struct dsa_switch *ds, int port, + struct net_device *master, + struct netlink_ext_ack *extack); + /* Optional switch-wide initialization and destruction methods */ int (*setup)(struct dsa_switch *ds); void (*teardown)(struct dsa_switch *ds); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 0bfa9a99ebb6..3d39fb398d65 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1375,4 +1375,14 @@ enum { #define IFLA_MCTP_MAX (__IFLA_MCTP_MAX - 1) +/* DSA section */ + +enum { + IFLA_DSA_UNSPEC, + IFLA_DSA_MASTER, + __IFLA_DSA_MAX, +}; + +#define IFLA_DSA_MAX (__IFLA_DSA_MAX - 1) + #endif /* _UAPI_LINUX_IF_LINK_H */ -- cgit v1.2.3 From 2e359b00a11715c7a89d7448e6e4cb4d84543520 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 11 Sep 2022 04:07:03 +0300 Subject: net: dsa: propagate extack to port_lag_join Drivers could refuse to offload a LAG configuration for a variety of reasons, mainly having to do with its TX type. Additionally, since DSA masters may now also be LAG interfaces, and this will translate into a call to port_lag_join on the CPU ports, there may be extra restrictions there. Propagate the netlink extack to this DSA method in order for drivers to give a meaningful error message back to the user. Signed-off-by: Vladimir Oltean Signed-off-by: Paolo Abeni --- include/net/dsa.h | 6 ++++-- include/soc/mscc/ocelot.h | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 3f717c3fcba0..1c80e75b3cad 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -1094,7 +1094,8 @@ struct dsa_switch_ops { int port); int (*crosschip_lag_join)(struct dsa_switch *ds, int sw_index, int port, struct dsa_lag lag, - struct netdev_lag_upper_info *info); + struct netdev_lag_upper_info *info, + struct netlink_ext_ack *extack); int (*crosschip_lag_leave)(struct dsa_switch *ds, int sw_index, int port, struct dsa_lag lag); @@ -1169,7 +1170,8 @@ struct dsa_switch_ops { int (*port_lag_change)(struct dsa_switch *ds, int port); int (*port_lag_join)(struct dsa_switch *ds, int port, struct dsa_lag lag, - struct netdev_lag_upper_info *info); + struct netdev_lag_upper_info *info, + struct netlink_ext_ack *extack); int (*port_lag_leave)(struct dsa_switch *ds, int port, struct dsa_lag lag); diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index 355cfdedc43b..ea19e8ef1f61 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -1229,7 +1229,8 @@ int ocelot_port_mdb_del(struct ocelot *ocelot, int port, const struct net_device *bridge); int ocelot_port_lag_join(struct ocelot *ocelot, int port, struct net_device *bond, - struct netdev_lag_upper_info *info); + struct netdev_lag_upper_info *info, + struct netlink_ext_ack *extack); void ocelot_port_lag_leave(struct ocelot *ocelot, int port, struct net_device *bond); void ocelot_port_lag_change(struct ocelot *ocelot, int port, bool lag_tx_active); -- cgit v1.2.3 From acc43b7bf52a015221d989164c2600c1e1f28790 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 11 Sep 2022 04:07:04 +0300 Subject: net: dsa: allow masters to join a LAG There are 2 ways in which a DSA user port may become handled by 2 CPU ports in a LAG: (1) its current DSA master joins a LAG ip link del bond0 && ip link add bond0 type bond mode 802.3ad ip link set eno2 master bond0 When this happens, all user ports with "eno2" as DSA master get automatically migrated to "bond0" as DSA master. (2) it is explicitly configured as such by the user # Before, the DSA master was eno3 ip link set swp0 type dsa master bond0 The design of this configuration is that the LAG device dynamically becomes a DSA master through dsa_master_setup() when the first physical DSA master becomes a LAG slave, and stops being so through dsa_master_teardown() when the last physical DSA master leaves. A LAG interface is considered as a valid DSA master only if it contains existing DSA masters, and no other lower interfaces. Therefore, we mainly rely on method (1) to enter this configuration. Each physical DSA master (LAG slave) retains its dev->dsa_ptr for when it becomes a standalone DSA master again. But the LAG master also has a dev->dsa_ptr, and this is actually duplicated from one of the physical LAG slaves, and therefore needs to be balanced when LAG slaves come and go. To the switch driver, putting DSA masters in a LAG is seen as putting their associated CPU ports in a LAG. We need to prepare cross-chip host FDB notifiers for CPU ports in a LAG, by calling the driver's ->lag_fdb_add method rather than ->port_fdb_add. Signed-off-by: Vladimir Oltean Signed-off-by: Paolo Abeni --- include/net/dsa.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 1c80e75b3cad..d777eac5694f 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -300,6 +300,9 @@ struct dsa_port { u8 master_admin_up:1; u8 master_oper_up:1; + /* Valid only on user ports */ + u8 cpu_port_in_lag:1; + u8 setup:1; struct device_node *dn; @@ -724,6 +727,9 @@ static inline bool dsa_port_offloads_lag(struct dsa_port *dp, static inline struct net_device *dsa_port_to_master(const struct dsa_port *dp) { + if (dp->cpu_port_in_lag) + return dsa_port_lag_dev_get(dp->cpu_dp); + return dp->cpu_dp->master; } @@ -811,6 +817,12 @@ dsa_tree_offloads_bridge_dev(struct dsa_switch_tree *dst, return false; } +static inline bool dsa_port_tree_same(const struct dsa_port *a, + const struct dsa_port *b) +{ + return a->ds->dst == b->ds->dst; +} + typedef int dsa_fdb_dump_cb_t(const unsigned char *addr, u16 vid, bool is_static, void *data); struct dsa_switch_ops { -- cgit v1.2.3 From eca70102cfb19d783d30d3e9b13713d58581eb67 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 11 Sep 2022 04:07:06 +0300 Subject: net: dsa: felix: add support for changing DSA master Changing the DSA master means different things depending on the tagging protocol in use. For NPI mode ("ocelot" and "seville"), there is a single port which can be configured as NPI, but DSA only permits changing the CPU port affinity of user ports one by one. So changing a user port to a different NPI port globally changes what the NPI port is, and breaks the user ports still using the old one. To address this while still permitting the change of the NPI port, require that the user ports which are still affine to the old NPI port are down, and cannot be brought up until they are all affine to the same NPI port. The tag_8021q mode ("ocelot-8021q") is more flexible, in that each user port can be freely assigned to one CPU port or to the other. This works by filtering host addresses towards both tag_8021q CPU ports, and then restricting the forwarding from a certain user port only to one of the two tag_8021q CPU ports. Additionally, the 2 tag_8021q CPU ports can be placed in a LAG. This works by enabling forwarding via PGID_SRC from a certain user port towards the logical port ID containing both tag_8021q CPU ports, but then restricting forwarding per packet, via the LAG hash codes in PGID_AGGR, to either one or the other. When we change the DSA master to a LAG device, DSA guarantees us that the LAG has at least one lower interface as a physical DSA master. But DSA masters can come and go as lowers of that LAG, and ds->ops->port_change_master() will not get called, because the DSA master is still the same (the LAG). So we need to hook into the ds->ops->port_lag_{join,leave} calls on the CPU ports and update the logical port ID of the LAG that user ports are assigned to. Signed-off-by: Vladimir Oltean Signed-off-by: Paolo Abeni --- include/soc/mscc/ocelot.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h index ea19e8ef1f61..967ba30ea636 100644 --- a/include/soc/mscc/ocelot.h +++ b/include/soc/mscc/ocelot.h @@ -1234,6 +1234,7 @@ int ocelot_port_lag_join(struct ocelot *ocelot, int port, void ocelot_port_lag_leave(struct ocelot *ocelot, int port, struct net_device *bond); void ocelot_port_lag_change(struct ocelot *ocelot, int port, bool lag_tx_active); +int ocelot_bond_get_id(struct ocelot *ocelot, struct net_device *bond); int ocelot_devlink_sb_register(struct ocelot *ocelot); void ocelot_devlink_sb_unregister(struct ocelot *ocelot); -- cgit v1.2.3 From 848f3c0d47694924536e2894cb349613201321c6 Mon Sep 17 00:00:00 2001 From: Andrea Mayer Date: Mon, 12 Sep 2022 19:16:18 +0200 Subject: seg6: add NEXT-C-SID support for SRv6 End behavior The NEXT-C-SID mechanism described in [1] offers the possibility of encoding several SRv6 segments within a single 128 bit SID address. Such a SID address is called a Compressed SID (C-SID) container. In this way, the length of the SID List can be drastically reduced. A SID instantiated with the NEXT-C-SID flavor considers an IPv6 address logically structured in three main blocks: i) Locator-Block; ii) Locator-Node Function; iii) Argument. C-SID container +------------------------------------------------------------------+ | Locator-Block |Loc-Node| Argument | | |Function| | +------------------------------------------------------------------+ <--------- B -----------> <- NF -> <------------- A ---------------> (i) The Locator-Block can be any IPv6 prefix available to the provider; (ii) The Locator-Node Function represents the node and the function to be triggered when a packet is received on the node; (iii) The Argument carries the remaining C-SIDs in the current C-SID container. The NEXT-C-SID mechanism relies on the "flavors" framework defined in [2]. The flavors represent additional operations that can modify or extend a subset of the existing behaviors. This patch introduces the support for flavors in SRv6 End behavior implementing the NEXT-C-SID one. An SRv6 End behavior with NEXT-C-SID flavor works as an End behavior but it is capable of processing the compressed SID List encoded in C-SID containers. An SRv6 End behavior with NEXT-C-SID flavor can be configured to support user-provided Locator-Block and Locator-Node Function lengths. In this implementation, such lengths must be evenly divisible by 8 (i.e. must be byte-aligned), otherwise the kernel informs the user about invalid values with a meaningful error code and message through netlink_ext_ack. If Locator-Block and/or Locator-Node Function lengths are not provided by the user during configuration of an SRv6 End behavior instance with NEXT-C-SID flavor, the kernel will choose their default values i.e., 32-bit Locator-Block and 16-bit Locator-Node Function. [1] - https://datatracker.ietf.org/doc/html/draft-ietf-spring-srv6-srh-compression [2] - https://datatracker.ietf.org/doc/html/rfc8986 Signed-off-by: Andrea Mayer Reviewed-by: David Ahern Signed-off-by: Paolo Abeni --- include/uapi/linux/seg6_local.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/seg6_local.h b/include/uapi/linux/seg6_local.h index 332b18f318f8..4fdc424c9cb3 100644 --- a/include/uapi/linux/seg6_local.h +++ b/include/uapi/linux/seg6_local.h @@ -28,6 +28,7 @@ enum { SEG6_LOCAL_BPF, SEG6_LOCAL_VRFTABLE, SEG6_LOCAL_COUNTERS, + SEG6_LOCAL_FLAVORS, __SEG6_LOCAL_MAX, }; #define SEG6_LOCAL_MAX (__SEG6_LOCAL_MAX - 1) @@ -110,4 +111,27 @@ enum { #define SEG6_LOCAL_CNT_MAX (__SEG6_LOCAL_CNT_MAX - 1) +/* SRv6 End* Flavor attributes */ +enum { + SEG6_LOCAL_FLV_UNSPEC, + SEG6_LOCAL_FLV_OPERATION, + SEG6_LOCAL_FLV_LCBLOCK_BITS, + SEG6_LOCAL_FLV_LCNODE_FN_BITS, + __SEG6_LOCAL_FLV_MAX, +}; + +#define SEG6_LOCAL_FLV_MAX (__SEG6_LOCAL_FLV_MAX - 1) + +/* Designed flavor operations for SRv6 End* Behavior */ +enum { + SEG6_LOCAL_FLV_OP_UNSPEC, + SEG6_LOCAL_FLV_OP_PSP, + SEG6_LOCAL_FLV_OP_USP, + SEG6_LOCAL_FLV_OP_USD, + SEG6_LOCAL_FLV_OP_NEXT_CSID, + __SEG6_LOCAL_FLV_OP_MAX +}; + +#define SEG6_LOCAL_FLV_OP_MAX (__SEG6_LOCAL_FLV_OP_MAX - 1) + #endif -- cgit v1.2.3 From 256dea9134c38fcc409ec7ea6a1eb67829b563bc Mon Sep 17 00:00:00 2001 From: Ronak Jain Date: Wed, 14 Sep 2022 18:03:15 +0530 Subject: firmware: xilinx: add support for sd/gem config Add new APIs in firmware to configure SD/GEM registers. Internally it calls PM IOCTL for below SD/GEM register configuration: - SD/EMMC select - SD slot type - SD base clock - SD 8 bit support - SD fixed config - GEM SGMII Mode - GEM fixed config Signed-off-by: Ronak Jain Signed-off-by: Radhey Shyam Pandey Reviewed-by: Claudiu Beznea Signed-off-by: Jakub Kicinski --- include/linux/firmware/xlnx-zynqmp.h | 45 ++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'include') diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 9f50dacbf7d6..76d2b3ebad84 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -153,6 +153,9 @@ enum pm_ioctl_id { /* Runtime feature configuration */ IOCTL_SET_FEATURE_CONFIG = 26, IOCTL_GET_FEATURE_CONFIG = 27, + /* Dynamic SD/GEM configuration */ + IOCTL_SET_SD_CONFIG = 30, + IOCTL_SET_GEM_CONFIG = 31, }; enum pm_query_id { @@ -399,6 +402,30 @@ enum pm_feature_config_id { PM_FEATURE_EXTWDT_VALUE = 4, }; +/** + * enum pm_sd_config_type - PM SD configuration. + * @SD_CONFIG_EMMC_SEL: To set SD_EMMC_SEL in CTRL_REG_SD and SD_SLOTTYPE + * @SD_CONFIG_BASECLK: To set SD_BASECLK in SD_CONFIG_REG1 + * @SD_CONFIG_8BIT: To set SD_8BIT in SD_CONFIG_REG2 + * @SD_CONFIG_FIXED: To set fixed config registers + */ +enum pm_sd_config_type { + SD_CONFIG_EMMC_SEL = 1, + SD_CONFIG_BASECLK = 2, + SD_CONFIG_8BIT = 3, + SD_CONFIG_FIXED = 4, +}; + +/** + * enum pm_gem_config_type - PM GEM configuration. + * @GEM_CONFIG_SGMII_MODE: To set GEM_SGMII_MODE in GEM_CLK_CTRL register + * @GEM_CONFIG_FIXED: To set fixed config registers + */ +enum pm_gem_config_type { + GEM_CONFIG_SGMII_MODE = 1, + GEM_CONFIG_FIXED = 2, +}; + /** * struct zynqmp_pm_query_data - PM query data * @qid: query ID @@ -475,6 +502,9 @@ int zynqmp_pm_is_function_supported(const u32 api_id, const u32 id); int zynqmp_pm_set_feature_config(enum pm_feature_config_id id, u32 value); int zynqmp_pm_get_feature_config(enum pm_feature_config_id id, u32 *payload); int zynqmp_pm_register_sgi(u32 sgi_num, u32 reset); +int zynqmp_pm_set_sd_config(u32 node, enum pm_sd_config_type config, u32 value); +int zynqmp_pm_set_gem_config(u32 node, enum pm_gem_config_type config, + u32 value); #else static inline int zynqmp_pm_get_api_version(u32 *version) { @@ -745,6 +775,21 @@ static inline int zynqmp_pm_register_sgi(u32 sgi_num, u32 reset) { return -ENODEV; } + +static inline int zynqmp_pm_set_sd_config(u32 node, + enum pm_sd_config_type config, + u32 value) +{ + return -ENODEV; +} + +static inline int zynqmp_pm_set_gem_config(u32 node, + enum pm_gem_config_type config, + u32 value) +{ + return -ENODEV; +} + #endif #endif /* __FIRMWARE_ZYNQMP_H__ */ -- cgit v1.2.3 From 17df341d35265c8e14d71a48886fc7dcf92ef8a1 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 11 Sep 2022 13:39:01 +0200 Subject: headers: Remove some left-over license text Remove a left-over from commit 2874c5fd2842 ("treewide: Replace GPLv2 boilerplate/reference with SPDX - rule 152") There is no need for an empty "License:". Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/0e5ff727626b748238f4b78932f81572143d8f0b.1662896317.git.christophe.jaillet@wanadoo.fr Signed-off-by: Jakub Kicinski --- include/linux/if_pppol2tp.h | 2 -- include/linux/if_pppox.h | 2 -- 2 files changed, 4 deletions(-) (limited to 'include') diff --git a/include/linux/if_pppol2tp.h b/include/linux/if_pppol2tp.h index 96d40942e5a3..c87efd333faa 100644 --- a/include/linux/if_pppol2tp.h +++ b/include/linux/if_pppol2tp.h @@ -4,8 +4,6 @@ * * This file supplies definitions required by the PPP over L2TP driver * (l2tp_ppp.c). All version information wrt this file is located in l2tp_ppp.c - * - * License: */ #ifndef __LINUX_IF_PPPOL2TP_H #define __LINUX_IF_PPPOL2TP_H diff --git a/include/linux/if_pppox.h b/include/linux/if_pppox.h index 69e813bcb947..ff3beda1312c 100644 --- a/include/linux/if_pppox.h +++ b/include/linux/if_pppox.h @@ -5,8 +5,6 @@ * * This file supplies definitions required by the PPP over Ethernet driver * (pppox.c). All version information wrt this file is located in pppox.c - * - * License: */ #ifndef __LINUX_IF_PPPOX_H #define __LINUX_IF_PPPOX_H -- cgit v1.2.3 From e9bd0cca09d13ac2f08d25e195203e42d4ad1ce8 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 7 Sep 2022 18:10:18 -0700 Subject: tcp: Don't allocate tcp_death_row outside of struct netns_ipv4. We will soon introduce an optional per-netns ehash and access hash tables via net->ipv4.tcp_death_row->hashinfo instead of &tcp_hashinfo in most places. It could harm the fast path because dereferences of two fields in net and tcp_death_row might incur two extra cache line misses. To save one dereference, let's place tcp_death_row back in netns_ipv4 and fetch hashinfo via net->ipv4.tcp_death_row"."hashinfo. Note tcp_death_row was initially placed in netns_ipv4, and commit fbb8295248e1 ("tcp: allocate tcp_death_row outside of struct netns_ipv4") changed it to a pointer so that we can fire TIME_WAIT timers after freeing net. However, we don't do so after commit 04c494e68a13 ("Revert "tcp/dccp: get rid of inet_twsk_purge()""), so we need not define tcp_death_row as a pointer. Also, we move refcount_dec_and_test(&tw_refcount) from tcp_sk_exit() to tcp_sk_exit_batch() as a debug check. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/net/netns/ipv4.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 6320a76cefdc..2c7df93e3403 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -34,6 +34,7 @@ struct inet_hashinfo; struct inet_timewait_death_row { refcount_t tw_refcount; + /* Padding to avoid false sharing, tw_refcount can be often written */ struct inet_hashinfo *hashinfo ____cacheline_aligned_in_smp; int sysctl_max_tw_buckets; }; @@ -41,7 +42,7 @@ struct inet_timewait_death_row { struct tcp_fastopen_context; struct netns_ipv4 { - struct inet_timewait_death_row *tcp_death_row; + struct inet_timewait_death_row tcp_death_row; #ifdef CONFIG_SYSCTL struct ctl_table_header *forw_hdr; -- cgit v1.2.3 From 429e42c1c54e0d9bfe880195f7d4a8fd5a727194 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 7 Sep 2022 18:10:19 -0700 Subject: tcp: Set NULL to sk->sk_prot->h.hashinfo. We will soon introduce an optional per-netns ehash. This means we cannot use the global sk->sk_prot->h.hashinfo to fetch a TCP hashinfo. Instead, set NULL to sk->sk_prot->h.hashinfo for TCP and get a proper hashinfo from net->ipv4.tcp_death_row.hashinfo. Note that we need not use sk->sk_prot->h.hashinfo if DCCP is disabled. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/net/inet_hashtables.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 44a419b9e3d5..520dd894b73d 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -170,6 +170,16 @@ struct inet_hashinfo { struct inet_listen_hashbucket *lhash2; }; +static inline struct inet_hashinfo *tcp_or_dccp_get_hashinfo(const struct sock *sk) +{ +#if IS_ENABLED(CONFIG_IP_DCCP) + return sk->sk_prot->h.hashinfo ? : + sock_net(sk)->ipv4.tcp_death_row.hashinfo; +#else + return sock_net(sk)->ipv4.tcp_death_row.hashinfo; +#endif +} + static inline struct inet_listen_hashbucket * inet_lhash2_bucket(struct inet_hashinfo *h, u32 hash) { -- cgit v1.2.3 From edc12f032a5a1633474db566878ce0012e7ca2f5 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 7 Sep 2022 18:10:21 -0700 Subject: tcp: Save unnecessary inet_twsk_purge() calls. While destroying netns, we call inet_twsk_purge() in tcp_sk_exit_batch() and tcpv6_net_exit_batch() for AF_INET and AF_INET6. These commands trigger the kernel to walk through the potentially big ehash twice even though the netns has no TIME_WAIT sockets. # ip netns add test # ip netns del test or # unshare -n /bin/true >/dev/null When tw_refcount is 1, we need not call inet_twsk_purge() at least for the net. We can save such unneeded iterations if all netns in net_exit_list have no TIME_WAIT sockets. This change eliminates the tax by the additional unshare() described in the next patch to guarantee the per-netns ehash size. Tested: # mount -t debugfs none /sys/kernel/debug/ # echo cleanup_net > /sys/kernel/debug/tracing/set_ftrace_filter # echo inet_twsk_purge >> /sys/kernel/debug/tracing/set_ftrace_filter # echo function > /sys/kernel/debug/tracing/current_tracer # cat ./add_del_unshare.sh for i in `seq 1 40` do (for j in `seq 1 100` ; do unshare -n /bin/true >/dev/null ; done) & done wait; # ./add_del_unshare.sh Before the patch: # cat /sys/kernel/debug/tracing/trace_pipe kworker/u128:0-8 [031] ...1. 174.162765: cleanup_net <-process_one_work kworker/u128:0-8 [031] ...1. 174.240796: inet_twsk_purge <-cleanup_net kworker/u128:0-8 [032] ...1. 174.244759: inet_twsk_purge <-tcp_sk_exit_batch kworker/u128:0-8 [034] ...1. 174.290861: cleanup_net <-process_one_work kworker/u128:0-8 [039] ...1. 175.245027: inet_twsk_purge <-cleanup_net kworker/u128:0-8 [046] ...1. 175.290541: inet_twsk_purge <-tcp_sk_exit_batch kworker/u128:0-8 [037] ...1. 175.321046: cleanup_net <-process_one_work kworker/u128:0-8 [024] ...1. 175.941633: inet_twsk_purge <-cleanup_net kworker/u128:0-8 [025] ...1. 176.242539: inet_twsk_purge <-tcp_sk_exit_batch After: # cat /sys/kernel/debug/tracing/trace_pipe kworker/u128:0-8 [038] ...1. 428.116174: cleanup_net <-process_one_work kworker/u128:0-8 [038] ...1. 428.262532: cleanup_net <-process_one_work kworker/u128:0-8 [030] ...1. 429.292645: cleanup_net <-process_one_work Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 735e957f7f4b..27e8d378c70a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -346,6 +346,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb); void tcp_rcv_space_adjust(struct sock *sk); int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp); void tcp_twsk_destructor(struct sock *sk); +void tcp_twsk_purge(struct list_head *net_exit_list, int family); ssize_t tcp_splice_read(struct socket *sk, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); -- cgit v1.2.3 From d1e5e6408b305ff78b825d437df8d3f77e82a4be Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 7 Sep 2022 18:10:22 -0700 Subject: tcp: Introduce optional per-netns ehash. The more sockets we have in the hash table, the longer we spend looking up the socket. While running a number of small workloads on the same host, they penalise each other and cause performance degradation. The root cause might be a single workload that consumes much more resources than the others. It often happens on a cloud service where different workloads share the same computing resource. On EC2 c5.24xlarge instance (196 GiB memory and 524288 (1Mi / 2) ehash entries), after running iperf3 in different netns, creating 24Mi sockets without data transfer in the root netns causes about 10% performance regression for the iperf3's connection. thash_entries sockets length Gbps 524288 1 1 50.7 24Mi 48 45.1 It is basically related to the length of the list of each hash bucket. For testing purposes to see how performance drops along the length, I set 131072 (1Mi / 8) to thash_entries, and here's the result. thash_entries sockets length Gbps 131072 1 1 50.7 1Mi 8 49.9 2Mi 16 48.9 4Mi 32 47.3 8Mi 64 44.6 16Mi 128 40.6 24Mi 192 36.3 32Mi 256 32.5 40Mi 320 27.0 48Mi 384 25.0 To resolve the socket lookup degradation, we introduce an optional per-netns hash table for TCP, but it's just ehash, and we still share the global bhash, bhash2 and lhash2. With a smaller ehash, we can look up non-listener sockets faster and isolate such noisy neighbours. In addition, we can reduce lock contention. We can control the ehash size by a new sysctl knob. However, depending on workloads, it will require very sensitive tuning, so we disable the feature by default (net.ipv4.tcp_child_ehash_entries == 0). Moreover, we can fall back to using the global ehash in case we fail to allocate enough memory for a new ehash. The maximum size is 16Mi, which is large enough that even if we have 48Mi sockets, the average list length is 3, and regression would be less than 1%. We can check the current ehash size by another read-only sysctl knob, net.ipv4.tcp_ehash_entries. A negative value means the netns shares the global ehash (per-netns ehash is disabled or failed to allocate memory). # dmesg | cut -d ' ' -f 5- | grep "established hash" TCP established hash table entries: 524288 (order: 10, 4194304 bytes, vmalloc hugepage) # sysctl net.ipv4.tcp_ehash_entries net.ipv4.tcp_ehash_entries = 524288 # can be changed by thash_entries # sysctl net.ipv4.tcp_child_ehash_entries net.ipv4.tcp_child_ehash_entries = 0 # disabled by default # ip netns add test1 # ip netns exec test1 sysctl net.ipv4.tcp_ehash_entries net.ipv4.tcp_ehash_entries = -524288 # share the global ehash # sysctl -w net.ipv4.tcp_child_ehash_entries=100 net.ipv4.tcp_child_ehash_entries = 100 # ip netns add test2 # ip netns exec test2 sysctl net.ipv4.tcp_ehash_entries net.ipv4.tcp_ehash_entries = 128 # own a per-netns ehash with 2^n buckets When more than two processes in the same netns create per-netns ehash concurrently with different sizes, we need to guarantee the size in one of the following ways: 1) Share the global ehash and create per-netns ehash First, unshare() with tcp_child_ehash_entries==0. It creates dedicated netns sysctl knobs where we can safely change tcp_child_ehash_entries and clone()/unshare() to create a per-netns ehash. 2) Control write on sysctl by BPF We can use BPF_PROG_TYPE_CGROUP_SYSCTL to allow/deny read/write on sysctl knobs. Note that the global ehash allocated at the boot time is spread over available NUMA nodes, but inet_pernet_hashinfo_alloc() will allocate pages for each per-netns ehash depending on the current process's NUMA policy. By default, the allocation is done in the local node only, so the per-netns hash table could fully reside on a random node. Thus, depending on the NUMA policy the netns is created with and the CPU the current thread is running on, we could see some performance differences for highly optimised networking applications. Note also that the default values of two sysctl knobs depend on the ehash size and should be tuned carefully: tcp_max_tw_buckets : tcp_child_ehash_entries / 2 tcp_max_syn_backlog : max(128, tcp_child_ehash_entries / 128) As a bonus, we can dismantle netns faster. Currently, while destroying netns, we call inet_twsk_purge(), which walks through the global ehash. It can be potentially big because it can have many sockets other than TIME_WAIT in all netns. Splitting ehash changes that situation, where it's only necessary for inet_twsk_purge() to clean up TIME_WAIT sockets in each netns. With regard to this, we do not free the per-netns ehash in inet_twsk_kill() to avoid UAF while iterating the per-netns ehash in inet_twsk_purge(). Instead, we do it in tcp_sk_exit_batch() after calling tcp_twsk_purge() to keep it protocol-family-independent. In the future, we could optimise ehash lookup/iteration further by removing netns comparison for the per-netns ehash. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/net/inet_hashtables.h | 6 ++++++ include/net/netns/ipv4.h | 1 + 2 files changed, 7 insertions(+) (limited to 'include') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 520dd894b73d..9121ccab1fa1 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -168,6 +168,8 @@ struct inet_hashinfo { /* The 2nd listener table hashed by local port and address */ unsigned int lhash2_mask; struct inet_listen_hashbucket *lhash2; + + bool pernet; }; static inline struct inet_hashinfo *tcp_or_dccp_get_hashinfo(const struct sock *sk) @@ -214,6 +216,10 @@ static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo) hashinfo->ehash_locks = NULL; } +struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo, + unsigned int ehash_entries); +void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo); + struct inet_bind_bucket * inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 2c7df93e3403..1b8004679445 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -171,6 +171,7 @@ struct netns_ipv4 { int sysctl_tcp_pacing_ca_ratio; int sysctl_tcp_wmem[3]; int sysctl_tcp_rmem[3]; + unsigned int sysctl_tcp_child_ehash_entries; unsigned long sysctl_tcp_comp_sack_delay_ns; unsigned long sysctl_tcp_comp_sack_slack_ns; int sysctl_max_syn_backlog; -- cgit v1.2.3 From 52bdae37c92ae10d47d54bd7cd39e0a17547ebfa Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Tue, 20 Sep 2022 08:15:22 -0600 Subject: bpf: Remove unused btf_struct_access stub This stub was not being used anywhere. Signed-off-by: Daniel Xu Link: https://lore.kernel.org/r/590e7bd6172ffe0f3d7b51cd40e8ded941aaf7e8.1663683114.git.dxu@dxuuu.xyz Signed-off-by: Martin KaFai Lau --- include/net/netfilter/nf_conntrack_bpf.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_bpf.h b/include/net/netfilter/nf_conntrack_bpf.h index a61a93d1c6dc..9c07d2d59da5 100644 --- a/include/net/netfilter/nf_conntrack_bpf.h +++ b/include/net/netfilter/nf_conntrack_bpf.h @@ -3,8 +3,6 @@ #ifndef _NF_CONNTRACK_BPF_H #define _NF_CONNTRACK_BPF_H -#include -#include #include #include @@ -31,16 +29,6 @@ static inline void cleanup_nf_conntrack_bpf(void) { } -static inline int nf_conntrack_btf_struct_access(struct bpf_verifier_log *log, - const struct btf *btf, - const struct btf_type *t, int off, - int size, enum bpf_access_type atype, - u32 *next_btf_id, - enum bpf_type_flag *flag) -{ - return -EACCES; -} - #endif #endif /* _NF_CONNTRACK_BPF_H */ -- cgit v1.2.3 From 5a090aa35038e3dad1ee334e3c509c39e7599bb4 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Tue, 20 Sep 2022 08:15:23 -0600 Subject: bpf: Rename nfct_bsa to nfct_btf_struct_access The former name was a little hard to guess. Signed-off-by: Daniel Xu Link: https://lore.kernel.org/r/73adc72385c8b162391fbfb404f0b6d4c5cc55d7.1663683114.git.dxu@dxuuu.xyz Signed-off-by: Martin KaFai Lau --- include/net/netfilter/nf_conntrack_bpf.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_bpf.h b/include/net/netfilter/nf_conntrack_bpf.h index 9c07d2d59da5..1199d4f8e019 100644 --- a/include/net/netfilter/nf_conntrack_bpf.h +++ b/include/net/netfilter/nf_conntrack_bpf.h @@ -13,10 +13,10 @@ extern int register_nf_conntrack_bpf(void); extern void cleanup_nf_conntrack_bpf(void); extern struct mutex nf_conn_btf_access_lock; -extern int (*nfct_bsa)(struct bpf_verifier_log *log, const struct btf *btf, - const struct btf_type *t, int off, int size, - enum bpf_access_type atype, u32 *next_btf_id, - enum bpf_type_flag *flag); +extern int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct btf *btf, + const struct btf_type *t, int off, int size, + enum bpf_access_type atype, u32 *next_btf_id, + enum bpf_type_flag *flag); #else -- cgit v1.2.3 From fdf214978a71b2749d26f6da2b1d51d9ac23831d Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Tue, 20 Sep 2022 08:15:24 -0600 Subject: bpf: Move nf_conn extern declarations to filter.h We're seeing the following new warnings on netdev/build_32bit and netdev/build_allmodconfig_warn CI jobs: ../net/core/filter.c:8608:1: warning: symbol 'nf_conn_btf_access_lock' was not declared. Should it be static? ../net/core/filter.c:8611:5: warning: symbol 'nfct_bsa' was not declared. Should it be static? Fix by ensuring extern declaration is present while compiling filter.o. Fixes: 864b656f82cc ("bpf: Add support for writing to nf_conn:mark") Signed-off-by: Daniel Xu Link: https://lore.kernel.org/r/2bd2e0283df36d8a4119605878edb1838d144174.1663683114.git.dxu@dxuuu.xyz Signed-off-by: Martin KaFai Lau --- include/linux/filter.h | 6 ++++++ include/net/netfilter/nf_conntrack_bpf.h | 7 ------- 2 files changed, 6 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 75335432fcbc..98e28126c24b 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -567,6 +567,12 @@ struct sk_filter { DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key); +extern struct mutex nf_conn_btf_access_lock; +extern int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct btf *btf, + const struct btf_type *t, int off, int size, + enum bpf_access_type atype, u32 *next_btf_id, + enum bpf_type_flag *flag); + typedef unsigned int (*bpf_dispatcher_fn)(const void *ctx, const struct bpf_insn *insnsi, unsigned int (*bpf_func)(const void *, diff --git a/include/net/netfilter/nf_conntrack_bpf.h b/include/net/netfilter/nf_conntrack_bpf.h index 1199d4f8e019..c8b80add1142 100644 --- a/include/net/netfilter/nf_conntrack_bpf.h +++ b/include/net/netfilter/nf_conntrack_bpf.h @@ -4,7 +4,6 @@ #define _NF_CONNTRACK_BPF_H #include -#include #if (IS_BUILTIN(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \ (IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) @@ -12,12 +11,6 @@ extern int register_nf_conntrack_bpf(void); extern void cleanup_nf_conntrack_bpf(void); -extern struct mutex nf_conn_btf_access_lock; -extern int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct btf *btf, - const struct btf_type *t, int off, int size, - enum bpf_access_type atype, u32 *next_btf_id, - enum bpf_type_flag *flag); - #else static inline int register_nf_conntrack_bpf(void) -- cgit v1.2.3 From fe0df81df51eb932a83e0c3844106ac6c0f914db Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Fri, 16 Sep 2022 10:02:43 +0800 Subject: net/sched: cls_api: add helper for tc cls walker stats dump The walk implementation of most tc cls modules is basically the same. That is, the values of count and skip are checked first. If count is greater than or equal to skip, the registered fn function is executed. Otherwise, increase the value of count. So we can reconstruct them. Signed-off-by: Zhengchao Shao Reviewed-by: Jamal Hadi Salim Reviewed-by: Victor Nogueira Tested-by: Victor Nogueira Signed-off-by: Jakub Kicinski --- include/net/pkt_cls.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index d9d90e6925e1..d376c995d906 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -81,6 +81,19 @@ int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res, bool compat_mode); +static inline bool tc_cls_stats_dump(struct tcf_proto *tp, + struct tcf_walker *arg, + void *filter) +{ + if (arg->count >= arg->skip && arg->fn(tp, filter, arg) < 0) { + arg->stop = 1; + return false; + } + + arg->count++; + return true; +} + #else static inline bool tcf_block_shared(struct tcf_block *block) { -- cgit v1.2.3 From 7b5541a932c21f7e07f068a785afcc25986e4893 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 11 Sep 2022 14:03:09 +0200 Subject: headers: Remove some left-over license text in include/uapi/linux/netfilter/ When the SPDX-License-Identifier tag has been added, the corresponding license text has not been removed. Remove it now. Also, in xt_connmark.h, move the copyright text at the top of the file which is a much more common pattern. Signed-off-by: Christophe JAILLET Signed-off-by: Florian Westphal --- include/uapi/linux/netfilter/ipset/ip_set.h | 4 ---- include/uapi/linux/netfilter/xt_AUDIT.h | 4 ---- include/uapi/linux/netfilter/xt_connmark.h | 13 ++++--------- include/uapi/linux/netfilter/xt_osf.h | 14 -------------- 4 files changed, 4 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/netfilter/ipset/ip_set.h b/include/uapi/linux/netfilter/ipset/ip_set.h index 6397d75899bc..79e5d68b87af 100644 --- a/include/uapi/linux/netfilter/ipset/ip_set.h +++ b/include/uapi/linux/netfilter/ipset/ip_set.h @@ -3,10 +3,6 @@ * Patrick Schaaf * Martin Josefsson * Copyright (C) 2003-2011 Jozsef Kadlecsik - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #ifndef _UAPI_IP_SET_H #define _UAPI_IP_SET_H diff --git a/include/uapi/linux/netfilter/xt_AUDIT.h b/include/uapi/linux/netfilter/xt_AUDIT.h index 1b314e2f84ac..56a3f6092e0c 100644 --- a/include/uapi/linux/netfilter/xt_AUDIT.h +++ b/include/uapi/linux/netfilter/xt_AUDIT.h @@ -4,10 +4,6 @@ * * (C) 2010-2011 Thomas Graf * (C) 2010-2011 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. */ #ifndef _XT_AUDIT_TARGET_H diff --git a/include/uapi/linux/netfilter/xt_connmark.h b/include/uapi/linux/netfilter/xt_connmark.h index f01c19b83a2b..41b578ccd03b 100644 --- a/include/uapi/linux/netfilter/xt_connmark.h +++ b/include/uapi/linux/netfilter/xt_connmark.h @@ -1,18 +1,13 @@ /* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +/* Copyright (C) 2002,2004 MARA Systems AB + * by Henrik Nordstrom + */ + #ifndef _XT_CONNMARK_H #define _XT_CONNMARK_H #include -/* Copyright (C) 2002,2004 MARA Systems AB - * by Henrik Nordstrom - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - enum { XT_CONNMARK_SET = 0, XT_CONNMARK_SAVE, diff --git a/include/uapi/linux/netfilter/xt_osf.h b/include/uapi/linux/netfilter/xt_osf.h index 6e466236ca4b..f1f097896bdf 100644 --- a/include/uapi/linux/netfilter/xt_osf.h +++ b/include/uapi/linux/netfilter/xt_osf.h @@ -1,20 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ /* * Copyright (c) 2003+ Evgeniy Polyakov - * - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . */ #ifndef _XT_OSF_H -- cgit v1.2.3 From 0e426a3ae030a9e891899370229e117158b35de6 Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Wed, 21 Sep 2022 10:46:02 +0000 Subject: bpf, cgroup: Reject prog_attach_flags array when effective query Attach flags is only valid for attached progs of this layer cgroup, but not for effective progs. For querying with EFFECTIVE flags, exporting attach flags does not make sense. So when effective query, we reject prog_attach_flags array and don't need to populate it. Also we limit attach_flags to output 0 during effective query. Fixes: b79c9fc9551b ("bpf: implement BPF_PROG_QUERY for BPF_LSM_CGROUP") Signed-off-by: Pu Lehui Link: https://lore.kernel.org/r/20220921104604.2340580-2-pulehui@huaweicloud.com Signed-off-by: Martin KaFai Lau --- include/uapi/linux/bpf.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 59a217ca2dfd..4eff7fc7ae58 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1233,7 +1233,7 @@ enum { /* Query effective (directly attached + inherited from ancestor cgroups) * programs that will be executed for events within a cgroup. - * attach_flags with this flag are returned only for directly attached programs. + * attach_flags with this flag are always returned 0. */ #define BPF_F_QUERY_EFFECTIVE (1U << 0) @@ -1432,7 +1432,10 @@ union bpf_attr { __u32 attach_flags; __aligned_u64 prog_ids; __u32 prog_cnt; - __aligned_u64 prog_attach_flags; /* output: per-program attach_flags */ + /* output: per-program attach_flags. + * not allowed to be set during effective query. + */ + __aligned_u64 prog_attach_flags; } query; struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */ -- cgit v1.2.3 From 583c1f420173f7d84413a1a1fbf5109d798b4faa Mon Sep 17 00:00:00 2001 From: David Vernet Date: Mon, 19 Sep 2022 19:00:57 -0500 Subject: bpf: Define new BPF_MAP_TYPE_USER_RINGBUF map type We want to support a ringbuf map type where samples are published from user-space, to be consumed by BPF programs. BPF currently supports a kernel -> user-space circular ring buffer via the BPF_MAP_TYPE_RINGBUF map type. We'll need to define a new map type for user-space -> kernel, as none of the helpers exported for BPF_MAP_TYPE_RINGBUF will apply to a user-space producer ring buffer, and we'll want to add one or more helper functions that would not apply for a kernel-producer ring buffer. This patch therefore adds a new BPF_MAP_TYPE_USER_RINGBUF map type definition. The map type is useless in its current form, as there is no way to access or use it for anything until we one or more BPF helpers. A follow-on patch will therefore add a new helper function that allows BPF programs to run callbacks on samples that are published to the ring buffer. Signed-off-by: David Vernet Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220920000100.477320-2-void@manifault.com --- include/linux/bpf_types.h | 1 + include/uapi/linux/bpf.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 2b9112b80171..2c6a4f2562a7 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -126,6 +126,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_BLOOM_FILTER, bloom_filter_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_USER_RINGBUF, user_ringbuf_map_ops) BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint) BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3df78c56c1bf..e18c85324db6 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -928,6 +928,7 @@ enum bpf_map_type { BPF_MAP_TYPE_INODE_STORAGE, BPF_MAP_TYPE_TASK_STORAGE, BPF_MAP_TYPE_BLOOM_FILTER, + BPF_MAP_TYPE_USER_RINGBUF, }; /* Note that tracing related programs such as -- cgit v1.2.3 From 20571567384428dfc9fe5cf9f2e942e1df13c2dd Mon Sep 17 00:00:00 2001 From: David Vernet Date: Mon, 19 Sep 2022 19:00:58 -0500 Subject: bpf: Add bpf_user_ringbuf_drain() helper In a prior change, we added a new BPF_MAP_TYPE_USER_RINGBUF map type which will allow user-space applications to publish messages to a ring buffer that is consumed by a BPF program in kernel-space. In order for this map-type to be useful, it will require a BPF helper function that BPF programs can invoke to drain samples from the ring buffer, and invoke callbacks on those samples. This change adds that capability via a new BPF helper function: bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void *ctx, u64 flags) BPF programs may invoke this function to run callback_fn() on a series of samples in the ring buffer. callback_fn() has the following signature: long callback_fn(struct bpf_dynptr *dynptr, void *context); Samples are provided to the callback in the form of struct bpf_dynptr *'s, which the program can read using BPF helper functions for querying struct bpf_dynptr's. In order to support bpf_ringbuf_drain(), a new PTR_TO_DYNPTR register type is added to the verifier to reflect a dynptr that was allocated by a helper function and passed to a BPF program. Unlike PTR_TO_STACK dynptrs which are allocated on the stack by a BPF program, PTR_TO_DYNPTR dynptrs need not use reference tracking, as the BPF helper is trusted to properly free the dynptr before returning. The verifier currently only supports PTR_TO_DYNPTR registers that are also DYNPTR_TYPE_LOCAL. Note that while the corresponding user-space libbpf logic will be added in a subsequent patch, this patch does contain an implementation of the .map_poll() callback for BPF_MAP_TYPE_USER_RINGBUF maps. This .map_poll() callback guarantees that an epoll-waiting user-space producer will receive at least one event notification whenever at least one sample is drained in an invocation of bpf_user_ringbuf_drain(), provided that the function is not invoked with the BPF_RB_NO_WAKEUP flag. If the BPF_RB_FORCE_WAKEUP flag is provided, a wakeup notification is sent even if no sample was drained. Signed-off-by: David Vernet Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20220920000100.477320-3-void@manifault.com --- include/linux/bpf.h | 11 +++++++++-- include/uapi/linux/bpf.h | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e0dbe0c0a17e..33e543b86e1a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -451,7 +451,7 @@ enum bpf_type_flag { /* DYNPTR points to memory local to the bpf program. */ DYNPTR_TYPE_LOCAL = BIT(8 + BPF_BASE_TYPE_BITS), - /* DYNPTR points to a ringbuf record. */ + /* DYNPTR points to a kernel-produced ringbuf record. */ DYNPTR_TYPE_RINGBUF = BIT(9 + BPF_BASE_TYPE_BITS), /* Size is known at compile time. */ @@ -656,6 +656,7 @@ enum bpf_reg_type { PTR_TO_MEM, /* reg points to valid memory region */ PTR_TO_BUF, /* reg points to a read/write buffer */ PTR_TO_FUNC, /* reg points to a bpf program function */ + PTR_TO_DYNPTR, /* reg points to a dynptr */ __BPF_REG_TYPE_MAX, /* Extended reg_types. */ @@ -1394,6 +1395,11 @@ struct bpf_array { #define BPF_MAP_CAN_READ BIT(0) #define BPF_MAP_CAN_WRITE BIT(1) +/* Maximum number of user-producer ring buffer samples that can be drained in + * a call to bpf_user_ringbuf_drain(). + */ +#define BPF_MAX_USER_RINGBUF_SAMPLES (128 * 1024) + static inline u32 bpf_map_flags_to_cap(struct bpf_map *map) { u32 access_flags = map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG); @@ -2495,6 +2501,7 @@ extern const struct bpf_func_proto bpf_loop_proto; extern const struct bpf_func_proto bpf_copy_from_user_task_proto; extern const struct bpf_func_proto bpf_set_retval_proto; extern const struct bpf_func_proto bpf_get_retval_proto; +extern const struct bpf_func_proto bpf_user_ringbuf_drain_proto; const struct bpf_func_proto *tracing_prog_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); @@ -2639,7 +2646,7 @@ enum bpf_dynptr_type { BPF_DYNPTR_TYPE_INVALID, /* Points to memory that is local to the bpf program */ BPF_DYNPTR_TYPE_LOCAL, - /* Underlying data is a ringbuf record */ + /* Underlying data is a kernel-produced ringbuf record */ BPF_DYNPTR_TYPE_RINGBUF, }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e18c85324db6..ead35f39f185 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5388,6 +5388,43 @@ union bpf_attr { * Return * Current *ktime*. * + * long bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void *ctx, u64 flags) + * Description + * Drain samples from the specified user ring buffer, and invoke + * the provided callback for each such sample: + * + * long (\*callback_fn)(struct bpf_dynptr \*dynptr, void \*ctx); + * + * If **callback_fn** returns 0, the helper will continue to try + * and drain the next sample, up to a maximum of + * BPF_MAX_USER_RINGBUF_SAMPLES samples. If the return value is 1, + * the helper will skip the rest of the samples and return. Other + * return values are not used now, and will be rejected by the + * verifier. + * Return + * The number of drained samples if no error was encountered while + * draining samples, or 0 if no samples were present in the ring + * buffer. If a user-space producer was epoll-waiting on this map, + * and at least one sample was drained, they will receive an event + * notification notifying them of available space in the ring + * buffer. If the BPF_RB_NO_WAKEUP flag is passed to this + * function, no wakeup notification will be sent. If the + * BPF_RB_FORCE_WAKEUP flag is passed, a wakeup notification will + * be sent even if no sample was drained. + * + * On failure, the returned value is one of the following: + * + * **-EBUSY** if the ring buffer is contended, and another calling + * context was concurrently draining the ring buffer. + * + * **-EINVAL** if user-space is not properly tracking the ring + * buffer due to the producer position not being aligned to 8 + * bytes, a sample not being aligned to 8 bytes, or the producer + * position not matching the advertised length of a sample. + * + * **-E2BIG** if user-space has tried to publish a sample which is + * larger than the size of the ring buffer, or which cannot fit + * within a struct bpf_dynptr. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5599,6 +5636,7 @@ union bpf_attr { FN(tcp_raw_check_syncookie_ipv4), \ FN(tcp_raw_check_syncookie_ipv6), \ FN(ktime_get_tai_ns), \ + FN(user_ringbuf_drain), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From b8d31762a0ae6861e1115302ee338560d853e317 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Tue, 20 Sep 2022 09:59:42 +0200 Subject: btf: Allow dynamic pointer parameters in kfuncs Allow dynamic pointers (struct bpf_dynptr_kern *) to be specified as parameters in kfuncs. Also, ensure that dynamic pointers passed as argument are valid and initialized, are a pointer to the stack, and of the type local. More dynamic pointer types can be supported in the future. To properly detect whether a parameter is of the desired type, introduce the stringify_struct() macro to compare the returned structure name with the desired name. In addition, protect against structure renames, by halting the build with BUILD_BUG_ON(), so that developers have to revisit the code. To check if a dynamic pointer passed to the kfunc is valid and initialized, and if its type is local, export the existing functions is_dynptr_reg_valid_init() and is_dynptr_type_expected(). Cc: Joanne Koong Cc: Kumar Kartikeya Dwivedi Signed-off-by: Roberto Sassu Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220920075951.929132-5-roberto.sassu@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 5 +++++ include/linux/btf.h | 9 +++++++++ 2 files changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index e197f8fb27e2..9e1e6965f407 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -593,6 +593,11 @@ int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state u32 regno); int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, u32 mem_size); +bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, + struct bpf_reg_state *reg); +bool is_dynptr_type_expected(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, + enum bpf_arg_type arg_type); /* this lives here instead of in bpf.h because it needs to dereference tgt_prog */ static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog, diff --git a/include/linux/btf.h b/include/linux/btf.h index 1fcc833a8690..f9aababc5d78 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -52,6 +52,15 @@ #define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */ #define KF_DESTRUCTIVE (1 << 6) /* kfunc performs destructive actions */ +/* + * Return the name of the passed struct, if exists, or halt the build if for + * example the structure gets renamed. In this way, developers have to revisit + * the code using that structure name, and update it accordingly. + */ +#define stringify_struct(x) \ + ({ BUILD_BUG_ON(sizeof(struct x) < 0); \ + __stringify(x); }) + struct btf; struct btf_member; struct btf_type; -- cgit v1.2.3 From 51df4865718540f51bb5d3e552c50dc88e1333d6 Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Tue, 20 Sep 2022 09:59:43 +0200 Subject: bpf: Export bpf_dynptr_get_size() Export bpf_dynptr_get_size(), so that kernel code dealing with eBPF dynamic pointers can obtain the real size of data carried by this data structure. Signed-off-by: Roberto Sassu Reviewed-by: Joanne Koong Acked-by: KP Singh Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20220920075951.929132-6-roberto.sassu@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 33e543b86e1a..6535fb1e21b9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2654,6 +2654,7 @@ void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, enum bpf_dynptr_type type, u32 offset, u32 size); void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr); int bpf_dynptr_check_size(u32 size); +u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr); #ifdef CONFIG_BPF_LSM void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype); -- cgit v1.2.3 From 90fd8f26edd47942203639bf3a5dde8fa1931a0e Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Tue, 20 Sep 2022 09:59:44 +0200 Subject: KEYS: Move KEY_LOOKUP_ to include/linux/key.h and define KEY_LOOKUP_ALL In preparation for the patch that introduces the bpf_lookup_user_key() eBPF kfunc, move KEY_LOOKUP_ definitions to include/linux/key.h, to be able to validate the kfunc parameters. Add them to enum key_lookup_flag, so that all the current ones and the ones defined in the future are automatically exported through BTF and available to eBPF programs. Also, add KEY_LOOKUP_ALL to the enum, with the logical OR of currently defined flags as value, to facilitate checking whether a variable contains only those flags. Signed-off-by: Roberto Sassu Acked-by: Jarkko Sakkinen Link: https://lore.kernel.org/r/20220920075951.929132-7-roberto.sassu@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/key.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/key.h b/include/linux/key.h index 7febc4881363..d27477faf00d 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -88,6 +88,12 @@ enum key_need_perm { KEY_DEFER_PERM_CHECK, /* Special: permission check is deferred */ }; +enum key_lookup_flag { + KEY_LOOKUP_CREATE = 0x01, + KEY_LOOKUP_PARTIAL = 0x02, + KEY_LOOKUP_ALL = (KEY_LOOKUP_CREATE | KEY_LOOKUP_PARTIAL), +}; + struct seq_file; struct user_struct; struct signal_struct; -- cgit v1.2.3 From f3cf4134c5c6c47b9b5c7aa3cb2d67e107887a7b Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Tue, 20 Sep 2022 09:59:45 +0200 Subject: bpf: Add bpf_lookup_*_key() and bpf_key_put() kfuncs Add the bpf_lookup_user_key(), bpf_lookup_system_key() and bpf_key_put() kfuncs, to respectively search a key with a given key handle serial number and flags, obtain a key from a pre-determined ID defined in include/linux/verification.h, and cleanup. Introduce system_keyring_id_check() to validate the keyring ID parameter of bpf_lookup_system_key(). Signed-off-by: Roberto Sassu Acked-by: Kumar Kartikeya Dwivedi Acked-by: Song Liu Link: https://lore.kernel.org/r/20220920075951.929132-8-roberto.sassu@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 8 ++++++++ include/linux/verification.h | 8 ++++++++ 2 files changed, 16 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6535fb1e21b9..a1435b019aca 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2664,4 +2664,12 @@ static inline void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype) {} static inline void bpf_cgroup_atype_put(int cgroup_atype) {} #endif /* CONFIG_BPF_LSM */ +struct key; + +#ifdef CONFIG_KEYS +struct bpf_key { + struct key *key; + bool has_ref; +}; +#endif /* CONFIG_KEYS */ #endif /* _LINUX_BPF_H */ diff --git a/include/linux/verification.h b/include/linux/verification.h index a655923335ae..f34e50ebcf60 100644 --- a/include/linux/verification.h +++ b/include/linux/verification.h @@ -17,6 +17,14 @@ #define VERIFY_USE_SECONDARY_KEYRING ((struct key *)1UL) #define VERIFY_USE_PLATFORM_KEYRING ((struct key *)2UL) +static inline int system_keyring_id_check(u64 id) +{ + if (id > (unsigned long)VERIFY_USE_PLATFORM_KEYRING) + return -EINVAL; + + return 0; +} + /* * The use to which an asymmetric key is being put. */ -- cgit v1.2.3 From 05b24ff9b2cfabfcfd951daaa915a036ab53c9e1 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 16 Sep 2022 09:19:14 +0200 Subject: bpf: Prevent bpf program recursion for raw tracepoint probes We got report from sysbot [1] about warnings that were caused by bpf program attached to contention_begin raw tracepoint triggering the same tracepoint by using bpf_trace_printk helper that takes trace_printk_lock lock. Call Trace: ? trace_event_raw_event_bpf_trace_printk+0x5f/0x90 bpf_trace_printk+0x2b/0xe0 bpf_prog_a9aec6167c091eef_prog+0x1f/0x24 bpf_trace_run2+0x26/0x90 native_queued_spin_lock_slowpath+0x1c6/0x2b0 _raw_spin_lock_irqsave+0x44/0x50 bpf_trace_printk+0x3f/0xe0 bpf_prog_a9aec6167c091eef_prog+0x1f/0x24 bpf_trace_run2+0x26/0x90 native_queued_spin_lock_slowpath+0x1c6/0x2b0 _raw_spin_lock_irqsave+0x44/0x50 bpf_trace_printk+0x3f/0xe0 bpf_prog_a9aec6167c091eef_prog+0x1f/0x24 bpf_trace_run2+0x26/0x90 native_queued_spin_lock_slowpath+0x1c6/0x2b0 _raw_spin_lock_irqsave+0x44/0x50 bpf_trace_printk+0x3f/0xe0 bpf_prog_a9aec6167c091eef_prog+0x1f/0x24 bpf_trace_run2+0x26/0x90 native_queued_spin_lock_slowpath+0x1c6/0x2b0 _raw_spin_lock_irqsave+0x44/0x50 __unfreeze_partials+0x5b/0x160 ... The can be reproduced by attaching bpf program as raw tracepoint on contention_begin tracepoint. The bpf prog calls bpf_trace_printk helper. Then by running perf bench the spin lock code is forced to take slow path and call contention_begin tracepoint. Fixing this by skipping execution of the bpf program if it's already running, Using bpf prog 'active' field, which is being currently used by trampoline programs for the same reason. Moving bpf_prog_inc_misses_counter to syscall.c because trampoline.c is compiled in just for CONFIG_BPF_JIT option. Reviewed-by: Stanislav Fomichev Reported-by: syzbot+2251879aa068ad9c960d@syzkaller.appspotmail.com [1] https://lore.kernel.org/bpf/YxhFe3EwqchC%2FfYf@krava/T/#t Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20220916071914.7156-1-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a1435b019aca..edd43edb27d6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2042,6 +2042,8 @@ static inline bool has_current_bpf_ctx(void) { return !!current->bpf_ctx; } + +void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog); #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { @@ -2264,6 +2266,10 @@ static inline bool has_current_bpf_ctx(void) { return false; } + +static inline void bpf_prog_inc_misses_counter(struct bpf_prog *prog) +{ +} #endif /* CONFIG_BPF_SYSCALL */ void __bpf_free_used_btfs(struct bpf_prog_aux *aux, -- cgit v1.2.3 From 1d14b30b5a5e52da93467af7c1dca08f124186df Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Mon, 19 Sep 2022 13:06:27 +0000 Subject: net: sched: remove unused tcf_result extension Added by: commit e5cf1baf92cb ("act_mirred: use TC_ACT_REINSERT when possible") but no longer useful. Signed-off-by: Jamal Hadi Salim Link: https://lore.kernel.org/r/20220919130627.3551233-1-jhs@mojatatu.com Signed-off-by: Jakub Kicinski --- include/net/sch_generic.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 7dc83400bde4..32819299937d 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -326,11 +326,6 @@ struct tcf_result { }; const struct tcf_proto *goto_tp; - /* used in the skb_tc_reinsert function */ - struct { - bool ingress; - struct gnet_stats_queue *qstats; - }; }; }; -- cgit v1.2.3 From adb5c33e4d4c83fb848a402e2191fbf3e2bf50d1 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Wed, 14 Sep 2022 19:04:03 +0200 Subject: xfrm: add extack support to xfrm_dev_state_add Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 28b988577ed2..9c1cccf85f12 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1886,7 +1886,8 @@ void xfrm_dev_resume(struct sk_buff *skb); void xfrm_dev_backlog(struct softnet_data *sd); struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features, bool *again); int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, - struct xfrm_user_offload *xuo); + struct xfrm_user_offload *xuo, + struct netlink_ext_ack *extack); bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x); static inline void xfrm_dev_state_advance_esn(struct xfrm_state *x) @@ -1949,7 +1950,7 @@ static inline struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_fea return skb; } -static inline int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, struct xfrm_user_offload *xuo) +static inline int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, struct xfrm_user_offload *xuo, struct netlink_ext_ack *extack) { return 0; } -- cgit v1.2.3 From 741f9a1064985512567eca1552643738ecfb5cc5 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Wed, 14 Sep 2022 19:04:05 +0200 Subject: xfrm: add extack to __xfrm_init_state Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 9c1cccf85f12..f427a74d571b 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1582,7 +1582,8 @@ void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si); u32 xfrm_replay_seqhi(struct xfrm_state *x, __be32 net_seq); int xfrm_init_replay(struct xfrm_state *x); u32 xfrm_state_mtu(struct xfrm_state *x, int mtu); -int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload); +int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload, + struct netlink_ext_ack *extack); int xfrm_init_state(struct xfrm_state *x); int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type); int xfrm_input_resume(struct sk_buff *skb, int nexthdr); -- cgit v1.2.3 From 1cf9a3ae3e2de359471a7036f48ac59e48b15256 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Wed, 14 Sep 2022 19:04:06 +0200 Subject: xfrm: add extack support to xfrm_init_replay Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index f427a74d571b..c504d07bcb7c 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1580,7 +1580,7 @@ int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_vali void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si); void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si); u32 xfrm_replay_seqhi(struct xfrm_state *x, __be32 net_seq); -int xfrm_init_replay(struct xfrm_state *x); +int xfrm_init_replay(struct xfrm_state *x, struct netlink_ext_ack *extack); u32 xfrm_state_mtu(struct xfrm_state *x, int mtu); int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload, struct netlink_ext_ack *extack); -- cgit v1.2.3 From 77eee32514314209961af5c2982e871ecb364445 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Tue, 20 Sep 2022 17:52:21 +0800 Subject: net/smc: Introduce a specific sysctl for TEST_LINK time SMC-R tests the viability of link by sending out TEST_LINK LLC messages over RoCE fabric when connections on link have been idle for a time longer than keepalive interval (testlink time). But using tcp_keepalive_time as testlink time maybe not quite suitable because it is default no less than two hours[1], which is too long for single link to find peer dead. The active host will still use peer-dead link (QP) sending messages, and can't find out until get IB_WC_RETRY_EXC_ERR error CQEs, which takes more time than TEST_LINK timeout (SMC_LLC_WAIT_TIME) normally. So this patch introduces a independent sysctl for SMC-R to set link keepalive time, in order to detect link down in time. The default value is 30 seconds. [1] https://www.rfc-editor.org/rfc/rfc1122#page-101 Signed-off-by: Wen Gu Signed-off-by: Paolo Abeni --- include/net/netns/smc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index 2adbe2b245df..d295e2c10dca 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -19,5 +19,6 @@ struct netns_smc { #endif unsigned int sysctl_autocorking_size; unsigned int sysctl_smcr_buf_type; + int sysctl_smcr_testlink_time; }; #endif -- cgit v1.2.3 From 0227f058aa29f5ab6f6ec79c3a36ae41f1e03a13 Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Tue, 20 Sep 2022 17:52:22 +0800 Subject: net/smc: Unbind r/w buffer size from clcsock and make them tunable Currently, SMC uses smc->sk.sk_{rcv|snd}buf to create buffers for send buffer and RMB. And the values of buffer size are from tcp_{w|r}mem in clcsock. The buffer size from TCP socket doesn't fit SMC well. Generally, buffers are usually larger than TCP for SMC-R/-D to get higher performance, for they are different underlay devices and paths. So this patch unbinds buffer size from TCP, and introduces two sysctl knobs to tune them independently. Also, these knobs are per net namespace and work for containers. Signed-off-by: Tony Lu Signed-off-by: Paolo Abeni --- include/net/netns/smc.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index d295e2c10dca..582212ada3ba 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -20,5 +20,7 @@ struct netns_smc { unsigned int sysctl_autocorking_size; unsigned int sysctl_smcr_buf_type; int sysctl_smcr_testlink_time; + int sysctl_wmem; + int sysctl_rmem; }; #endif -- cgit v1.2.3 From de84a090d99a3b991bd89cd86a94b65d15bd1bbe Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 20 Sep 2022 12:11:21 +0200 Subject: net: ethernet: mtk_eth_wed: add wed support for mt7986 chipset Introduce Wireless Etherne Dispatcher support on transmission side for mt7986 chipset Tested-by: Daniel Golle Co-developed-by: Bo Jiao Signed-off-by: Bo Jiao Co-developed-by: Sujuan Chen Signed-off-by: Sujuan Chen Signed-off-by: Lorenzo Bianconi Signed-off-by: Paolo Abeni --- include/linux/soc/mediatek/mtk_wed.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h index 7e00cca06709..592221a7149b 100644 --- a/include/linux/soc/mediatek/mtk_wed.h +++ b/include/linux/soc/mediatek/mtk_wed.h @@ -14,6 +14,7 @@ struct mtk_wdma_desc; struct mtk_wed_ring { struct mtk_wdma_desc *desc; dma_addr_t desc_phys; + u32 desc_size; int size; u32 reg_base; @@ -45,10 +46,17 @@ struct mtk_wed_device { struct pci_dev *pci_dev; u32 wpdma_phys; + u32 wpdma_int; + u32 wpdma_mask; + u32 wpdma_tx; + u32 wpdma_txfree; u16 token_start; unsigned int nbuf; + u8 tx_tbit[MTK_WED_TX_QUEUES]; + u8 txfree_tbit; + u32 (*init_buf)(void *ptr, dma_addr_t phys, int token_id); int (*offload_enable)(struct mtk_wed_device *wed); void (*offload_disable)(struct mtk_wed_device *wed); -- cgit v1.2.3 From 2b2ba3ecb2411c5e2a0d670be5e9ded2c93351e9 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 20 Sep 2022 12:11:22 +0200 Subject: net: ethernet: mtk_eth_wed: add axi bus support Other than pcie bus, introduce support for axi bus to mtk wed driver. Axi bus is used to connect mt7986-wmac soc chip available on mt7986 device. Tested-by: Daniel Golle Co-developed-by: Bo Jiao Signed-off-by: Bo Jiao Co-developed-by: Sujuan Chen Signed-off-by: Sujuan Chen Signed-off-by: Lorenzo Bianconi Signed-off-by: Paolo Abeni --- include/linux/soc/mediatek/mtk_wed.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h index 592221a7149b..4450c8b7a1cb 100644 --- a/include/linux/soc/mediatek/mtk_wed.h +++ b/include/linux/soc/mediatek/mtk_wed.h @@ -11,6 +11,11 @@ struct mtk_wed_hw; struct mtk_wdma_desc; +enum mtk_wed_bus_tye { + MTK_WED_BUS_PCIE, + MTK_WED_BUS_AXI, +}; + struct mtk_wed_ring { struct mtk_wdma_desc *desc; dma_addr_t desc_phys; @@ -43,7 +48,11 @@ struct mtk_wed_device { /* filled by driver: */ struct { - struct pci_dev *pci_dev; + union { + struct platform_device *platform_dev; + struct pci_dev *pci_dev; + }; + enum mtk_wed_bus_tye bus_type; u32 wpdma_phys; u32 wpdma_int; -- cgit v1.2.3 From 60240bc26114543fcbfcd8a28466e67e77b20388 Mon Sep 17 00:00:00 2001 From: Jalal Mostafa Date: Wed, 21 Sep 2022 13:57:01 +0000 Subject: xsk: Inherit need_wakeup flag for shared sockets The flag for need_wakeup is not set for xsks with `XDP_SHARED_UMEM` flag and of different queue ids and/or devices. They should inherit the flag from the first socket buffer pool since no flags can be specified once `XDP_SHARED_UMEM` is specified. Fixes: b5aea28dca134 ("xsk: Add shared umem support between queue ids") Signed-off-by: Jalal Mostafa Signed-off-by: Daniel Borkmann Acked-by: Magnus Karlsson Link: https://lore.kernel.org/bpf/20220921135701.10199-1-jalal.a.mostapha@gmail.com --- include/net/xsk_buff_pool.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index 647722e847b4..f787c3f524b0 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -95,7 +95,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, struct xdp_umem *umem); int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *dev, u16 queue_id, u16 flags); -int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_umem *umem, +int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_sock *umem_xs, struct net_device *dev, u16 queue_id); int xp_alloc_tx_descs(struct xsk_buff_pool *pool, struct xdp_sock *xs); void xp_destroy(struct xsk_buff_pool *pool); -- cgit v1.2.3 From 2d2c5ea24243eb3ed12f232b2aef43981fa15360 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Tue, 20 Sep 2022 16:01:47 +0300 Subject: net/tls: Describe ciphers sizes by const structs Introduce cipher sizes descriptor. It helps reducing the amount of code duplications and repeated switch/cases that assigns the proper sizes according to the cipher type. Signed-off-by: Tariq Toukan Signed-off-by: Gal Pressman Signed-off-by: Jakub Kicinski --- include/net/tls.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/net/tls.h b/include/net/tls.h index cb205f9d9473..154949c7b0c8 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -51,6 +51,16 @@ struct tls_rec; +struct tls_cipher_size_desc { + unsigned int iv; + unsigned int key; + unsigned int salt; + unsigned int tag; + unsigned int rec_seq; +}; + +extern const struct tls_cipher_size_desc tls_cipher_size_desc[]; + /* Maximum data size carried in a TLS record */ #define TLS_MAX_PAYLOAD_SIZE ((size_t)1 << 14) -- cgit v1.2.3 From d7a68e564e29bcd1c70d76d9e2f65591e6183f46 Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Wed, 21 Sep 2022 10:41:04 +0800 Subject: net/sched: sch_api: add helper for tc qdisc walker stats dump MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The walk implementation of most qdisc class modules is basically the same. That is, the values of count and skip are checked first. If count is greater than or equal to skip, the registered fn function is executed. Otherwise, increase the value of count. So we can reconstruct them. Signed-off-by: Zhengchao Shao Acked-by: Toke Høiland-Jørgensen Signed-off-by: Jakub Kicinski --- include/net/pkt_sched.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 29f65632ebc5..2ff80cd04c5c 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -222,4 +222,17 @@ static inline struct tc_skb_cb *tc_skb_cb(const struct sk_buff *skb) return cb; } +static inline bool tc_qdisc_stats_dump(struct Qdisc *sch, + unsigned long cl, + struct qdisc_walker *arg) +{ + if (arg->count >= arg->skip && arg->fn(sch, cl, arg) < 0) { + arg->stop = 1; + return false; + } + + arg->count++; + return true; +} + #endif -- cgit v1.2.3 From 21803630c4ffb433bab2951fbe0ee7b8dbcc8bcb Mon Sep 17 00:00:00 2001 From: Emeel Hakim Date: Wed, 21 Sep 2022 11:10:46 -0700 Subject: net/mlx5: Fix fields name prefix in MACsec Fix ifc fields name to be consistent with the device spec document. Fixes: 8385c51ff5bc ("net/mlx5: Introduce MACsec Connect-X offload hardware bits and structures") Signed-off-by: Emeel Hakim Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- include/linux/mlx5/mlx5_ifc.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 8decbf9a7bdd..da0ed11fcebd 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -11585,15 +11585,15 @@ struct mlx5_ifc_macsec_offload_obj_bits { u8 confidentiality_en[0x1]; u8 reserved_at_41[0x1]; - u8 esn_en[0x1]; - u8 esn_overlap[0x1]; + u8 epn_en[0x1]; + u8 epn_overlap[0x1]; u8 reserved_at_44[0x2]; u8 confidentiality_offset[0x2]; u8 reserved_at_48[0x4]; u8 aso_return_reg[0x4]; u8 reserved_at_50[0x10]; - u8 esn_msb[0x20]; + u8 epn_msb[0x20]; u8 reserved_at_80[0x8]; u8 dekn[0x18]; -- cgit v1.2.3 From 23cc83c6ca87c9a4da62b9ddf4d0fc09f3054f81 Mon Sep 17 00:00:00 2001 From: Emeel Hakim Date: Wed, 21 Sep 2022 11:10:49 -0700 Subject: net/mlx5: Add ifc bits for MACsec extended packet number (EPN) and replay protection Add ifc bits related to advanced steering operations (ASO) and general object modify for macsec to use as part of offloading EPN and replay protection features. Reviewed-by: Raed Salem Signed-off-by: Emeel Hakim Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- include/linux/mlx5/mlx5_ifc.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index da0ed11fcebd..bd577b99b146 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -11558,6 +11558,20 @@ struct mlx5_ifc_modify_ipsec_obj_in_bits { struct mlx5_ifc_ipsec_obj_bits ipsec_object; }; +enum { + MLX5_MACSEC_ASO_REPLAY_PROTECTION = 0x1, +}; + +enum { + MLX5_MACSEC_ASO_REPLAY_WIN_32BIT = 0x0, + MLX5_MACSEC_ASO_REPLAY_WIN_64BIT = 0x1, + MLX5_MACSEC_ASO_REPLAY_WIN_128BIT = 0x2, + MLX5_MACSEC_ASO_REPLAY_WIN_256BIT = 0x3, +}; + +#define MLX5_MACSEC_ASO_INC_SN 0x2 +#define MLX5_MACSEC_ASO_REG_C_4_5 0x2 + struct mlx5_ifc_macsec_aso_bits { u8 valid[0x1]; u8 reserved_at_1[0x1]; @@ -11619,6 +11633,21 @@ struct mlx5_ifc_create_macsec_obj_in_bits { struct mlx5_ifc_macsec_offload_obj_bits macsec_object; }; +struct mlx5_ifc_modify_macsec_obj_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits general_obj_in_cmd_hdr; + struct mlx5_ifc_macsec_offload_obj_bits macsec_object; +}; + +enum { + MLX5_MODIFY_MACSEC_BITMASK_EPN_OVERLAP = BIT(0), + MLX5_MODIFY_MACSEC_BITMASK_EPN_MSB = BIT(1), +}; + +struct mlx5_ifc_query_macsec_obj_out_bits { + struct mlx5_ifc_general_obj_out_cmd_hdr_bits general_obj_out_cmd_hdr; + struct mlx5_ifc_macsec_offload_obj_bits macsec_object; +}; + struct mlx5_ifc_encryption_key_obj_bits { u8 modify_field_select[0x40]; -- cgit v1.2.3 From 4411a6c0abd3e55b4a4fb9432b3a0553f12337c2 Mon Sep 17 00:00:00 2001 From: Emeel Hakim Date: Wed, 21 Sep 2022 11:10:53 -0700 Subject: net/mlx5e: Support MACsec offload extended packet number (EPN) MACsec EPN splits the packet number (PN) into two 32-bits fields, epn_lsb (32 least significant bits (LSBs) of PN) and epn_msb (32 most significant bits (MSBs) of PN). Epn_msb bits are managed by SW and for that HW is required to send an object change event of type EPN event notifying the SW to update the epn_msb in addition, once epn_msb is updated SW update HW with the new epn_msb value for HW to perform replay protection. To prevent HW from stopping while handling the event, SW manages another bit for HW called epn_overlap, HW uses the latter to get an indication regarding how to read the epn_msb value correctly while still receiving packets. Add epn event handling that updates the epn_overlap and epn_msb for every 2^31 packets according to the following logic: if epn_lsb crosses 2^31 (half sequence number wraparound) upon HW relevant event, SW updates the esn_overlap value to OLD (value = 1). When the epn_lsb crosses 2^32 (full sequence number wraparound) upon HW relevant event, SW updates the esn_overlap to NEW (value = 0) and increment the esn_msb. When using MACsec EPN a salt and short secure channel id (ssci) needs to be provided by the user, when offloading EPN need to pass this salt and ssci to the HW to be used in the initial vector (IV) calculations. Reviewed-by: Raed Salem Signed-off-by: Emeel Hakim Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed Reported-by: kernel test robot Signed-off-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- include/linux/mlx5/device.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 2927810f172b..dcd60fb9e6b4 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -325,6 +325,7 @@ enum mlx5_event { MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR = 0x10, MLX5_EVENT_TYPE_WQ_ACCESS_ERROR = 0x11, MLX5_EVENT_TYPE_SRQ_CATAS_ERROR = 0x12, + MLX5_EVENT_TYPE_OBJECT_CHANGE = 0x27, MLX5_EVENT_TYPE_INTERNAL_ERROR = 0x08, MLX5_EVENT_TYPE_PORT_CHANGE = 0x09, @@ -699,6 +700,12 @@ struct mlx5_eqe_temp_warning { __be64 sensor_warning_lsb; } __packed; +struct mlx5_eqe_obj_change { + u8 rsvd0[2]; + __be16 obj_type; + __be32 obj_id; +} __packed; + #define SYNC_RST_STATE_MASK 0xf enum sync_rst_state_type { @@ -737,6 +744,7 @@ union ev_data { struct mlx5_eqe_xrq_err xrq_err; struct mlx5_eqe_sync_fw_update sync_fw_update; struct mlx5_eqe_vhca_state vhca_state; + struct mlx5_eqe_obj_change obj_change; } __packed; struct mlx5_eqe { -- cgit v1.2.3 From 72bc36956f73ac54f19a7ac7302fb274069bec18 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Tue, 20 Sep 2022 18:12:28 -0400 Subject: net: phylink: Document MAC_(A)SYM_PAUSE This documents the possible MLO_PAUSE_* settings which can result from different combinations of MAC_(A)SYM_PAUSE. Special note is paid to settings which can result from user configuration (MLO_PAUSE_AN). The autonegotiation results are more-or-less a direct consequence of IEEE 802.3 Table 28B-2. Signed-off-by: Sean Anderson Signed-off-by: David S. Miller --- include/linux/phylink.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 6d06896fc20d..1f997e14bf80 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -21,6 +21,35 @@ enum { MLO_AN_FIXED, /* Fixed-link mode */ MLO_AN_INBAND, /* In-band protocol */ + /* MAC_SYM_PAUSE and MAC_ASYM_PAUSE are used when configuring our + * autonegotiation advertisement. They correspond to the PAUSE and + * ASM_DIR bits defined by 802.3, respectively. + * + * The following table lists the values of tx_pause and rx_pause which + * might be requested in mac_link_up. The exact values depend on either + * the results of autonegotation (if MLO_PAUSE_AN is set) or user + * configuration (if MLO_PAUSE_AN is not set). + * + * MAC_SYM_PAUSE MAC_ASYM_PAUSE MLO_PAUSE_AN tx_pause/rx_pause + * ============= ============== ============ ================== + * 0 0 0 0/0 + * 0 0 1 0/0 + * 0 1 0 0/0, 0/1, 1/0, 1/1 + * 0 1 1 0/0, 1/0 + * 1 0 0 0/0, 1/1 + * 1 0 1 0/0, 1/1 + * 1 1 0 0/0, 0/1, 1/0, 1/1 + * 1 1 1 0/0, 0/1, 1/1 + * + * If you set MAC_ASYM_PAUSE, the user may request any combination of + * tx_pause and rx_pause. You do not have to support these + * combinations. + * + * However, you should support combinations of tx_pause and rx_pause + * which might be the result of autonegotation. For example, don't set + * MAC_SYM_PAUSE unless your device can support tx_pause and rx_pause + * at the same time. + */ MAC_SYM_PAUSE = BIT(0), MAC_ASYM_PAUSE = BIT(1), MAC_10HD = BIT(2), -- cgit v1.2.3 From 606116529ab2d12e93bf751f74ed50a621b46846 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Tue, 20 Sep 2022 18:12:29 -0400 Subject: net: phylink: Export phylink_caps_to_linkmodes This function is convenient for MAC drivers. They can use it to add or remove particular link modes based on capabilities (such as if half duplex is not supported for a particular interface mode). Signed-off-by: Sean Anderson Signed-off-by: David S. Miller --- include/linux/phylink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 1f997e14bf80..7cf26d7a522d 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -547,6 +547,7 @@ void pcs_link_up(struct phylink_pcs *pcs, unsigned int mode, phy_interface_t interface, int speed, int duplex); #endif +void phylink_caps_to_linkmodes(unsigned long *linkmodes, unsigned long caps); void phylink_get_linkmodes(unsigned long *linkmodes, phy_interface_t interface, unsigned long mac_capabilities); void phylink_generic_validate(struct phylink_config *config, -- cgit v1.2.3 From 3e6eab8f3ef93cd78cd4b67f497ef6035eb073ad Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Tue, 20 Sep 2022 18:12:30 -0400 Subject: net: phylink: Generate caps and convert to linkmodes separately If we call phylink_caps_to_linkmodes directly from phylink_get_linkmodes, it is difficult to re-use this functionality in MAC drivers. This is because MAC drivers must then work with an ethtool linkmode bitmap, instead of with mac capabilities. Instead, let the caller of phylink_get_linkmodes do the conversion. To reflect this change, rename the function to phylink_get_capabilities. Signed-off-by: Sean Anderson Signed-off-by: David S. Miller --- include/linux/phylink.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 7cf26d7a522d..cc039ae7e80c 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -548,8 +548,8 @@ void pcs_link_up(struct phylink_pcs *pcs, unsigned int mode, #endif void phylink_caps_to_linkmodes(unsigned long *linkmodes, unsigned long caps); -void phylink_get_linkmodes(unsigned long *linkmodes, phy_interface_t interface, - unsigned long mac_capabilities); +unsigned long phylink_get_capabilities(phy_interface_t interface, + unsigned long mac_capabilities); void phylink_generic_validate(struct phylink_config *config, unsigned long *supported, struct phylink_link_state *state); -- cgit v1.2.3 From 0c3e10cb44232833a50cb8e3e784c432906a60c1 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Tue, 20 Sep 2022 18:12:31 -0400 Subject: net: phy: Add support for rate matching This adds support for rate matching (also known as rate adaptation) to the phy subsystem. The general idea is that the phy interface runs at one speed, and the MAC throttles the rate at which it sends packets to the link speed. There's a good overview of several techniques for achieving this at [1]. This patch adds support for three: pause-frame based (such as in Aquantia phys), CRS-based (such as in 10PASS-TS and 2BASE-TL), and open-loop-based (such as in 10GBASE-W). This patch makes a few assumptions and a few non assumptions about the types of rate matching available. First, it assumes that different phys may use different forms of rate matching. Second, it assumes that phys can use rate matching for any of their supported link speeds (e.g. if a phy supports 10BASE-T and XGMII, then it can adapt XGMII to 10BASE-T). Third, it does not assume that all interface modes will use the same form of rate matching. Fourth, it does not assume that all phy devices will support rate matching (even if some do). Relaxing or strengthening these (non-)assumptions could result in a different API. For example, if all interface modes were assumed to use the same form of rate matching, then a bitmask of interface modes supportting rate matching would suffice. For some better visibility into the process, the current rate matching mode is exposed as part of the ethtool ksettings. For the moment, only read access is supported. I'm not sure what userspace might want to configure yet (disable it altogether, disable just one mode, specify the mode to use, etc.). For the moment, since only pause-based rate adaptation support is added in the next few commits, rate matching can be disabled altogether by adjusting the advertisement. 802.3 calls this feature "rate adaptation" in clause 49 (10GBASE-R) and "rate matching" in clause 61 (10PASS-TL and 2BASE-TS). Aquantia also calls this feature "rate adaptation". I chose "rate matching" because it is shorter, and because Russell doesn't think "adaptation" is correct in this context. Signed-off-by: Sean Anderson Signed-off-by: David S. Miller --- include/linux/phy.h | 22 +++++++++++++++++++++- include/uapi/linux/ethtool.h | 18 ++++++++++++++++-- include/uapi/linux/ethtool_netlink.h | 1 + 3 files changed, 38 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 337230c135f7..9c66f357f489 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -280,7 +280,6 @@ static inline const char *phy_modes(phy_interface_t interface) } } - #define PHY_INIT_TIMEOUT 100000 #define PHY_FORCE_TIMEOUT 10 @@ -574,6 +573,7 @@ struct macsec_ops; * @lp_advertising: Current link partner advertised linkmodes * @eee_broken_modes: Energy efficient ethernet modes which should be prohibited * @autoneg: Flag autoneg being used + * @rate_matching: Current rate matching mode * @link: Current link state * @autoneg_complete: Flag auto negotiation of the link has completed * @mdix: Current crossover @@ -641,6 +641,8 @@ struct phy_device { unsigned irq_suspended:1; unsigned irq_rerun:1; + int rate_matching; + enum phy_state state; u32 dev_flags; @@ -805,6 +807,21 @@ struct phy_driver { */ int (*get_features)(struct phy_device *phydev); + /** + * @get_rate_matching: Get the supported type of rate matching for a + * particular phy interface. This is used by phy consumers to determine + * whether to advertise lower-speed modes for that interface. It is + * assumed that if a rate matching mode is supported on an interface, + * then that interface's rate can be adapted to all slower link speeds + * supported by the phy. If iface is %PHY_INTERFACE_MODE_NA, and the phy + * supports any kind of rate matching for any interface, then it must + * return that rate matching mode (preferring %RATE_MATCH_PAUSE to + * %RATE_MATCH_CRS). If the interface is not supported, this should + * return %RATE_MATCH_NONE. + */ + int (*get_rate_matching)(struct phy_device *phydev, + phy_interface_t iface); + /* PHY Power Management */ /** @suspend: Suspend the hardware, saving state if needed */ int (*suspend)(struct phy_device *phydev); @@ -971,6 +988,7 @@ struct phy_fixup { const char *phy_speed_to_str(int speed); const char *phy_duplex_to_str(unsigned int duplex); +const char *phy_rate_matching_to_str(int rate_matching); int phy_interface_num_ports(phy_interface_t interface); @@ -1687,6 +1705,8 @@ int phy_disable_interrupts(struct phy_device *phydev); void phy_request_interrupt(struct phy_device *phydev); void phy_free_interrupt(struct phy_device *phydev); void phy_print_status(struct phy_device *phydev); +int phy_get_rate_matching(struct phy_device *phydev, + phy_interface_t iface); void phy_set_max_speed(struct phy_device *phydev, u32 max_speed); void phy_remove_link_mode(struct phy_device *phydev, u32 link_mode); void phy_advertise_supported(struct phy_device *phydev); diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 2d5741fd44bb..fe9893d1485d 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1840,6 +1840,20 @@ static inline int ethtool_validate_duplex(__u8 duplex) #define MASTER_SLAVE_STATE_SLAVE 3 #define MASTER_SLAVE_STATE_ERR 4 +/* These are used to throttle the rate of data on the phy interface when the + * native speed of the interface is higher than the link speed. These should + * not be used for phy interfaces which natively support multiple speeds (e.g. + * MII or SGMII). + */ +/* No rate matching performed. */ +#define RATE_MATCH_NONE 0 +/* The phy sends pause frames to throttle the MAC. */ +#define RATE_MATCH_PAUSE 1 +/* The phy asserts CRS to prevent the MAC from transmitting. */ +#define RATE_MATCH_CRS 2 +/* The MAC is programmed with a sufficiently-large IPG. */ +#define RATE_MATCH_OPEN_LOOP 3 + /* Which connector port. */ #define PORT_TP 0x00 #define PORT_AUI 0x01 @@ -2033,8 +2047,8 @@ enum ethtool_reset_flags { * reported consistently by PHYLIB. Read-only. * @master_slave_cfg: Master/slave port mode. * @master_slave_state: Master/slave port state. + * @rate_matching: Rate adaptation performed by the PHY * @reserved: Reserved for future use; see the note on reserved space. - * @reserved1: Reserved for future use; see the note on reserved space. * @link_mode_masks: Variable length bitmaps. * * If autonegotiation is disabled, the speed and @duplex represent the @@ -2085,7 +2099,7 @@ struct ethtool_link_settings { __u8 transceiver; __u8 master_slave_cfg; __u8 master_slave_state; - __u8 reserved1[1]; + __u8 rate_matching; __u32 reserved[7]; __u32 link_mode_masks[]; /* layout of link_mode_masks fields: diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index d2fb4f7be61b..408a664fad59 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -242,6 +242,7 @@ enum { ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG, /* u8 */ ETHTOOL_A_LINKMODES_MASTER_SLAVE_STATE, /* u8 */ ETHTOOL_A_LINKMODES_LANES, /* u32 */ + ETHTOOL_A_LINKMODES_RATE_MATCHING, /* u8 */ /* add new constants above here */ __ETHTOOL_A_LINKMODES_CNT, -- cgit v1.2.3 From ae0e4bb2a0e0e434e9f98fb4994093ec2ee71997 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Tue, 20 Sep 2022 18:12:32 -0400 Subject: net: phylink: Adjust link settings based on rate matching If the phy is configured to use pause-based rate matching, ensure that the link is full duplex with pause frame reception enabled. As suggested, if pause-based rate matching is enabled by the phy, then pause reception is unconditionally enabled. The interface duplex is determined based on the rate matching type. When rate matching is enabled, so is the speed. We assume the maximum interface speed is used. This is only relevant for MLO_AN_PHY. For MLO_AN_INBAND, the MAC/PCS's view of the interface speed will be used. Although there are no RATE_ADAPT_CRS phys in-tree, it has been added for comparison (and the implementation is quite simple). Co-developed-by: Russell King Signed-off-by: Sean Anderson Signed-off-by: David S. Miller --- include/linux/phylink.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index cc039ae7e80c..5c99c21e42b5 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -88,6 +88,10 @@ static inline bool phylink_autoneg_inband(unsigned int mode) * @speed: link speed, one of the SPEED_* constants. * @duplex: link duplex mode, one of DUPLEX_* constants. * @pause: link pause state, described by MLO_PAUSE_* constants. + * @rate_matching: rate matching being performed, one of the RATE_MATCH_* + * constants. If rate matching is taking place, then the speed/duplex of + * the medium link mode (@speed and @duplex) and the speed/duplex of the phy + * interface mode (@interface) are different. * @link: true if the link is up. * @an_enabled: true if autonegotiation is enabled/desired. * @an_complete: true if autonegotiation has completed. @@ -99,6 +103,7 @@ struct phylink_link_state { int speed; int duplex; int pause; + int rate_matching; unsigned int link:1; unsigned int an_enabled:1; unsigned int an_complete:1; -- cgit v1.2.3 From b7e9294885b610791fcebc799bf2a9e219446fd6 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Tue, 20 Sep 2022 18:12:33 -0400 Subject: net: phylink: Adjust advertisement based on rate matching This adds support for adjusting the advertisement for pause-based rate matching. This may result in a lossy link, since the final link settings are not adjusted. Asymmetric pause support is necessary. It would be possible for a MAC supporting only symmetric pause to use pause-based rate adaptation, but only if pause reception was enabled as well. Signed-off-by: Sean Anderson Signed-off-by: David S. Miller --- include/linux/phylink.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 5c99c21e42b5..664dd409feb9 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -554,7 +554,8 @@ void pcs_link_up(struct phylink_pcs *pcs, unsigned int mode, void phylink_caps_to_linkmodes(unsigned long *linkmodes, unsigned long caps); unsigned long phylink_get_capabilities(phy_interface_t interface, - unsigned long mac_capabilities); + unsigned long mac_capabilities, + int rate_matching); void phylink_generic_validate(struct phylink_config *config, unsigned long *supported, struct phylink_link_state *state); -- cgit v1.2.3 From 99383f1298ee25901b1f6a665bdcc3344acb2382 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Wed, 21 Sep 2022 15:51:18 +0200 Subject: net: macsec: remove the prepare flag from the MACsec offloading context Now that the MACsec offloading preparation phase was removed from the MACsec core implementation as well as from drivers implementing it, we can safely remove the flag representing it. Signed-off-by: Antoine Tenart Signed-off-by: Jakub Kicinski --- include/net/macsec.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/net/macsec.h b/include/net/macsec.h index 871599b11707..5b9c61c4d3a6 100644 --- a/include/net/macsec.h +++ b/include/net/macsec.h @@ -271,8 +271,6 @@ struct macsec_context { struct macsec_rx_sa_stats *rx_sa_stats; struct macsec_dev_stats *dev_stats; } stats; - - u8 prepare:1; }; /** -- cgit v1.2.3 From c8f01a4a54473f88f8cc0d9046ec9eb5a99815d5 Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Thu, 22 Sep 2022 16:38:55 +0800 Subject: neighbour: Remove unused inline function neigh_key_eq16() All uses of neigh_key_eq16() have been removed since commit 1202cdd66531 ("Remove DECnet support from kernel"), so remove it. Signed-off-by: Gaosheng Cui Signed-off-by: Jakub Kicinski --- include/net/neighbour.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 3827a6b395fd..20745cf7ae1a 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -276,11 +276,6 @@ static inline void *neighbour_priv(const struct neighbour *n) extern const struct nla_policy nda_policy[]; -static inline bool neigh_key_eq16(const struct neighbour *n, const void *pkey) -{ - return *(const u16 *)n->primary_key == *(const u16 *)pkey; -} - static inline bool neigh_key_eq32(const struct neighbour *n, const void *pkey) { return *(const u32 *)n->primary_key == *(const u32 *)pkey; -- cgit v1.2.3 From d6755f37abfd009e7ca3520c319c93d14cae54b3 Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Thu, 22 Sep 2022 16:38:56 +0800 Subject: net: Remove unused inline function sk_nulls_node_init() All uses of sk_nulls_node_init() have been removed since commit dbca1596bbb0 ("ping: convert to RCU lookups, get rid of rwlock"), so remove it. Signed-off-by: Gaosheng Cui Signed-off-by: Jakub Kicinski --- include/net/sock.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 96a31026e35d..08038a385ef2 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -744,11 +744,6 @@ static inline void sk_node_init(struct hlist_node *node) node->pprev = NULL; } -static inline void sk_nulls_node_init(struct hlist_nulls_node *node) -{ - node->pprev = NULL; -} - static inline void __sk_del_node(struct sock *sk) { __hlist_del(&sk->sk_node); -- cgit v1.2.3 From 0b81882ddf8ac2743f657afb001beec7fc3929af Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Thu, 22 Sep 2022 16:38:57 +0800 Subject: net: Remove unused inline function dst_hold_and_use() All uses of dst_hold_and_use() have been removed since commit 1202cdd66531 ("Remove DECnet support from kernel"), so remove it. Signed-off-by: Gaosheng Cui Signed-off-by: Jakub Kicinski --- include/net/dst.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/net/dst.h b/include/net/dst.h index 6aa252c3fc55..00b479ce6b99 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -239,12 +239,6 @@ static inline void dst_use_noref(struct dst_entry *dst, unsigned long time) } } -static inline void dst_hold_and_use(struct dst_entry *dst, unsigned long time) -{ - dst_hold(dst); - dst_use_noref(dst, time); -} - static inline struct dst_entry *dst_clone(struct dst_entry *dst) { if (dst) -- cgit v1.2.3 From b860a1b964be7917ed3dcaa674ec0a2ba0a62aa0 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 23 Sep 2022 14:48:00 +0200 Subject: xdp: Adjust xdp_frame layout to avoid using bitfields Practical experience (and advice from Alexei) tell us that bitfields in structs lead to un-optimized assembly code. I've verified this change does lead to better x86_64 assembly, both via objdump and playing with code snippets in godbolt.org. Using scripts/bloat-o-meter shows the code size is reduced with 24 bytes for xdp_convert_buff_to_frame() that gets inlined e.g. in i40e_xmit_xdp_tx_ring() which were used for microbenchmarking. Microbenchmarking results do show improvements, but very small and varying between 0.5 to 2 nanosec improvement per packet. The member @metasize is changed from u8 to u32. Future users of this area could split this into two u16 fields. I've also benchmarked with two u16 fields showing equal performance gains and code size reduction. The moved member @frame_sz doesn't change sizeof struct due to existing padding. Like xdp_buff member @frame_sz is placed next to @flags, which allows compiler to optimize assignment of these. Signed-off-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/r/166393728005.2213882.4162674859542409548.stgit@firesoul Signed-off-by: Jakub Kicinski --- include/net/xdp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/xdp.h b/include/net/xdp.h index 04c852c7a77f..55dbc68bfffc 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -164,13 +164,13 @@ struct xdp_frame { void *data; u16 len; u16 headroom; - u32 metasize:8; - u32 frame_sz:24; + u32 metasize; /* uses lower 8-bits */ /* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time, * while mem info is valid on remote CPU. */ struct xdp_mem_info mem; struct net_device *dev_rx; /* used by cpumap */ + u32 frame_sz; u32 flags; /* supported values defined in xdp_buff_flags */ }; -- cgit v1.2.3 From bf7a87f1075f67c286f794519f0fedfa8b0b18cc Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 26 Sep 2022 17:33:35 +0200 Subject: kprobes: Add new KPROBE_FLAG_ON_FUNC_ENTRY kprobe flag Adding KPROBE_FLAG_ON_FUNC_ENTRY kprobe flag to indicate that attach address is on function entry. This is used in following changes in get_func_ip helper to return correct function address. Acked-by: Masami Hiramatsu (Google) Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20220926153340.1621984-2-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/kprobes.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 55041d2f884d..a0b92be98984 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -103,6 +103,7 @@ struct kprobe { * this flag is only for optimized_kprobe. */ #define KPROBE_FLAG_FTRACE 8 /* probe is using ftrace */ +#define KPROBE_FLAG_ON_FUNC_ENTRY 16 /* probe is on the function entry */ /* Has this kprobe gone ? */ static inline bool kprobe_gone(struct kprobe *p) -- cgit v1.2.3 From 0e253f7e558a3e250902ba2034091e0185448836 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 26 Sep 2022 17:33:39 +0200 Subject: bpf: Return value in kprobe get_func_ip only for entry address Changing return value of kprobe's version of bpf_get_func_ip to return zero if the attach address is not on the function's entry point. For kprobes attached in the middle of the function we can't easily get to the function address especially now with the CONFIG_X86_KERNEL_IBT support. If user cares about current IP for kprobes attached within the function body, they can get it with PT_REGS_IP(ctx). Suggested-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Acked-by: Martynas Pumputis Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20220926153340.1621984-6-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ead35f39f185..d6bd10759eaf 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4951,6 +4951,7 @@ union bpf_attr { * Get address of the traced function (for tracing and kprobe programs). * Return * Address of the traced function. + * 0 for kprobes placed within the function (not at the entry). * * u64 bpf_get_attach_cookie(void *ctx) * Description -- cgit v1.2.3 From 19c02415da2345d0dda2b5c4495bc17cc14b18b5 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 26 Sep 2022 11:47:38 -0700 Subject: bpf: use bpf_prog_pack for bpf_dispatcher Allocate bpf_dispatcher with bpf_prog_pack_alloc so that bpf_dispatcher can share pages with bpf programs. arch_prepare_bpf_dispatcher() is updated to provide a RW buffer as working area for arch code to write to. This also fixes CPA W^X warnning like: CPA refuse W^X violation: 8000000000000163 -> 0000000000000163 range: ... Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20220926184739.3512547-2-song@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 ++- include/linux/filter.h | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index edd43edb27d6..9ae155c75014 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -946,6 +946,7 @@ struct bpf_dispatcher { struct bpf_dispatcher_prog progs[BPF_DISPATCHER_MAX]; int num_progs; void *image; + void *rw_image; u32 image_off; struct bpf_ksym ksym; }; @@ -964,7 +965,7 @@ int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampolin struct bpf_trampoline *bpf_trampoline_get(u64 key, struct bpf_attach_target_info *tgt_info); void bpf_trampoline_put(struct bpf_trampoline *tr); -int arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs); +int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_funcs); #define BPF_DISPATCHER_INIT(_name) { \ .mutex = __MUTEX_INITIALIZER(_name.mutex), \ .func = &_name##_func, \ diff --git a/include/linux/filter.h b/include/linux/filter.h index 98e28126c24b..efc42a6e3aed 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1023,6 +1023,8 @@ extern long bpf_jit_limit_max; typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size); +void bpf_jit_fill_hole_with_zero(void *area, unsigned int size); + struct bpf_binary_header * bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, unsigned int alignment, @@ -1035,6 +1037,9 @@ void bpf_jit_free(struct bpf_prog *fp); struct bpf_binary_header * bpf_jit_binary_pack_hdr(const struct bpf_prog *fp); +void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns); +void bpf_prog_pack_free(struct bpf_binary_header *hdr); + static inline bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) { return list_empty(&fp->aux->ksym.lnode) || -- cgit v1.2.3 From 5b0d1c7bd5722467960829af51d523f5a6ffd848 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 26 Sep 2022 11:47:39 -0700 Subject: bpf: Enforce W^X for bpf trampoline Mark the trampoline as RO+X after arch_prepare_bpf_trampoline, so that the trampoine follows W^X rule strictly. This will turn off warnings like CPA refuse W^X violation: 8000000000000163 -> 0000000000000163 range: ... Also remove bpf_jit_alloc_exec_page(), since it is not used any more. Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20220926184739.3512547-3-song@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9ae155c75014..5161fac0513f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1008,7 +1008,6 @@ int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_func void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, struct bpf_prog *to); /* Called only from JIT-enabled code, so there's no need for stubs. */ -void *bpf_jit_alloc_exec_page(void); void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym); void bpf_image_ksym_del(struct bpf_ksym *ksym); void bpf_ksym_add(struct bpf_ksym *ksym); -- cgit v1.2.3 From 73dfe93ea1b319482e6d82a54fe06f953ceeeccb Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 22 Sep 2022 20:41:40 +0200 Subject: headers: Remove some left-over license text Remove some left-over from commit e2be04c7f995 ("License cleanup: add SPDX license identifier to uapi header files with a license") When the SPDX-License-Identifier tag has been added, the corresponding license text has not been removed. Signed-off-by: Christophe JAILLET Acked-by: Alexander Duyck Reviewed-by: Jiri Pirko Acked-by: Jamal Hadi Salim Link: https://lore.kernel.org/r/88410cddd31197ea26840d7dd71612bece8c6acf.1663871981.git.christophe.jaillet@wanadoo.fr Signed-off-by: Jakub Kicinski --- include/uapi/linux/tc_act/tc_bpf.h | 5 ----- include/uapi/linux/tc_act/tc_skbedit.h | 13 ------------- include/uapi/linux/tc_act/tc_skbmod.h | 7 +------ include/uapi/linux/tc_act/tc_tunnel_key.h | 5 ----- include/uapi/linux/tc_act/tc_vlan.h | 5 ----- 5 files changed, 1 insertion(+), 34 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/tc_act/tc_bpf.h b/include/uapi/linux/tc_act/tc_bpf.h index 653c4f94f76e..fe6c8f8f3e8c 100644 --- a/include/uapi/linux/tc_act/tc_bpf.h +++ b/include/uapi/linux/tc_act/tc_bpf.h @@ -1,11 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ /* * Copyright (c) 2015 Jiri Pirko - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. */ #ifndef __LINUX_TC_BPF_H diff --git a/include/uapi/linux/tc_act/tc_skbedit.h b/include/uapi/linux/tc_act/tc_skbedit.h index 6cb6101208d0..64032513cc4c 100644 --- a/include/uapi/linux/tc_act/tc_skbedit.h +++ b/include/uapi/linux/tc_act/tc_skbedit.h @@ -2,19 +2,6 @@ /* * Copyright (c) 2008, Intel Corporation. * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 Temple - * Place - Suite 330, Boston, MA 02111-1307 USA. - * * Author: Alexander Duyck */ diff --git a/include/uapi/linux/tc_act/tc_skbmod.h b/include/uapi/linux/tc_act/tc_skbmod.h index af6ef2cfbf3d..ac62c9a993ea 100644 --- a/include/uapi/linux/tc_act/tc_skbmod.h +++ b/include/uapi/linux/tc_act/tc_skbmod.h @@ -1,12 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ /* * Copyright (c) 2016, Jamal Hadi Salim - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. -*/ + */ #ifndef __LINUX_TC_SKBMOD_H #define __LINUX_TC_SKBMOD_H diff --git a/include/uapi/linux/tc_act/tc_tunnel_key.h b/include/uapi/linux/tc_act/tc_tunnel_key.h index 3f10dc4e7a4b..49ad4033951b 100644 --- a/include/uapi/linux/tc_act/tc_tunnel_key.h +++ b/include/uapi/linux/tc_act/tc_tunnel_key.h @@ -2,11 +2,6 @@ /* * Copyright (c) 2016, Amir Vadai * Copyright (c) 2016, Mellanox Technologies. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. */ #ifndef __LINUX_TC_TUNNEL_KEY_H diff --git a/include/uapi/linux/tc_act/tc_vlan.h b/include/uapi/linux/tc_act/tc_vlan.h index 5b306fe815cc..3e1f8e57cdd2 100644 --- a/include/uapi/linux/tc_act/tc_vlan.h +++ b/include/uapi/linux/tc_act/tc_vlan.h @@ -1,11 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ /* * Copyright (c) 2014 Jiri Pirko - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. */ #ifndef __LINUX_TC_VLAN_H -- cgit v1.2.3 From 976a859c9c68fdf160379bfc154431489f318292 Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Wed, 7 Sep 2022 16:36:23 -0700 Subject: net/mlx5: Expose NPPS related registers Add management capability bits indicating firmware may support N pulses per second. Add corresponding fields in MTPPS register. Signed-off-by: Aya Levin Reviewed-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 06eab92b9fb3..e929b0dcd985 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -9792,7 +9792,9 @@ struct mlx5_ifc_pcam_reg_bits { struct mlx5_ifc_mcam_enhanced_features_bits { u8 reserved_at_0[0x5d]; u8 mcia_32dwords[0x1]; - u8 reserved_at_5e[0xc]; + u8 out_pulse_duration_ns[0x1]; + u8 npps_period[0x1]; + u8 reserved_at_60[0xa]; u8 reset_state[0x1]; u8 ptpcyc2realtime_modify[0x1]; u8 reserved_at_6c[0x2]; @@ -10292,7 +10294,12 @@ struct mlx5_ifc_mtpps_reg_bits { u8 reserved_at_18[0x4]; u8 cap_max_num_of_pps_out_pins[0x4]; - u8 reserved_at_20[0x24]; + u8 reserved_at_20[0x13]; + u8 cap_log_min_npps_period[0x5]; + u8 reserved_at_38[0x3]; + u8 cap_log_min_out_pulse_duration_ns[0x5]; + + u8 reserved_at_40[0x4]; u8 cap_pin_3_mode[0x4]; u8 reserved_at_48[0x4]; u8 cap_pin_2_mode[0x4]; @@ -10311,7 +10318,9 @@ struct mlx5_ifc_mtpps_reg_bits { u8 cap_pin_4_mode[0x4]; u8 field_select[0x20]; - u8 reserved_at_a0[0x60]; + u8 reserved_at_a0[0x20]; + + u8 npps_period[0x40]; u8 enable[0x1]; u8 reserved_at_101[0xb]; @@ -10320,7 +10329,8 @@ struct mlx5_ifc_mtpps_reg_bits { u8 pin_mode[0x4]; u8 pin[0x8]; - u8 reserved_at_120[0x20]; + u8 reserved_at_120[0x2]; + u8 out_pulse_duration_ns[0x1e]; u8 time_stamp[0x40]; -- cgit v1.2.3 From f0462bc3e9024e9738fcd1a026456238317d84c7 Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Wed, 7 Sep 2022 16:36:24 -0700 Subject: net/mlx5: Add support for NPPS with real time mode Add support for setting NPPS. NPPS is currently available in REAL_TIME_CLOCK mode only. In addition allow the user to set the pulse duration. When NPPS pulse duration is not set explicitly by the user, driver set it to 50% of the NPPS period. Signed-off-by: Aya Levin Reviewed-by: Eran Ben Elisha Reviewed-by: Saeed Mahameed Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 954af9cafb1d..481506376344 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -698,6 +698,8 @@ struct mlx5_pps { struct work_struct out_work; u64 start[MAX_PIN_NUM]; u8 enabled; + u64 min_npps_period; + u64 min_out_pulse_duration_ns; }; struct mlx5_timer { -- cgit v1.2.3 From 8d1ac895fff96a228db20db92243e93687659ef7 Mon Sep 17 00:00:00 2001 From: "Liu, Changcheng" Date: Wed, 7 Sep 2022 16:36:25 -0700 Subject: net/mlx5: add IFC bits for bypassing port select flow table port_select_flow_table_bypass - When set, device supports bypass port select flow table. active_port - Bitmask indicates the current active ports in PORT_SELECT_FT LAG. MLX5_SET_HCA_CAP_OP_MODE_PORT_SELECTION - op_mod to operate PORT_SELECTION_Capabilities. Signed-off-by: Liu, Changcheng Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index e929b0dcd985..b7f4e93df0f2 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -68,6 +68,7 @@ enum { MLX5_SET_HCA_CAP_OP_MOD_ODP = 0x2, MLX5_SET_HCA_CAP_OP_MOD_ATOMIC = 0x3, MLX5_SET_HCA_CAP_OP_MOD_ROCE = 0x4, + MLX5_SET_HCA_CAP_OP_MODE_PORT_SELECTION = 0x25, }; enum { @@ -814,7 +815,9 @@ struct mlx5_ifc_flow_table_nic_cap_bits { struct mlx5_ifc_port_selection_cap_bits { u8 reserved_at_0[0x10]; u8 port_select_flow_table[0x1]; - u8 reserved_at_11[0xf]; + u8 reserved_at_11[0x1]; + u8 port_select_flow_table_bypass[0x1]; + u8 reserved_at_13[0xd]; u8 reserved_at_20[0x1e0]; @@ -10933,7 +10936,9 @@ struct mlx5_ifc_lagc_bits { u8 reserved_at_18[0x5]; u8 lag_state[0x3]; - u8 reserved_at_20[0x14]; + u8 reserved_at_20[0xc]; + u8 active_port[0x4]; + u8 reserved_at_30[0x4]; u8 tx_remap_affinity_2[0x4]; u8 reserved_at_38[0x4]; u8 tx_remap_affinity_1[0x4]; -- cgit v1.2.3 From a83bb5df2ac604ab418fbe0a8720f55de46652eb Mon Sep 17 00:00:00 2001 From: "Liu, Changcheng" Date: Wed, 7 Sep 2022 16:36:26 -0700 Subject: RDMA/mlx5: Don't set tx affinity when lag is in hash mode In hash mode, without setting tx affinity explicitly, the port select flow table decides which port is used for the traffic. If port_select_flow_table_bypass capability is supported and tx affinity is set explicitly for QP/TIS, they will be added into the explicit affinity table in FW to check which port is used for the traffic. 1. The overloaded explicit affinity table may affect performance. To avoid this, do not set tx affinity explicitly by default. 2. The packets of the same flow need to be transmitted on the same port. Because the packets of the same flow use different QPs in slow & fast path, it shouldn't set tx affinity explicitly for these QPs. Signed-off-by: Liu, Changcheng Reviewed-by: Mark Bloch Reviewed-by: Vlad Buslov Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 481506376344..9114caceb2b5 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1153,6 +1153,7 @@ int mlx5_cmd_destroy_vport_lag(struct mlx5_core_dev *dev); bool mlx5_lag_is_roce(struct mlx5_core_dev *dev); bool mlx5_lag_is_sriov(struct mlx5_core_dev *dev); bool mlx5_lag_is_active(struct mlx5_core_dev *dev); +bool mlx5_lag_mode_is_hash(struct mlx5_core_dev *dev); bool mlx5_lag_is_master(struct mlx5_core_dev *dev); bool mlx5_lag_is_shared_fdb(struct mlx5_core_dev *dev); struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev); -- cgit v1.2.3 From 66af4fe37119dbf6a82078949a82e625155777ef Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Wed, 7 Sep 2022 16:36:30 -0700 Subject: net/mlx5: Remove unused functions Remove functions which are no longer used in the driver: mlx5e_ipsec_is_tx_flow mlx5_health_flush get_cqe_enhanced_num_mini_cqes get_cqe_l3_hdr_type mlx5_health_flush mlx5_fs_is_ipsec_flow _mlx5_fs_is_outer_ipproto_flow mlx5_fs_is_outer_tcp_flow mlx5_fs_is_outer_udp_flow mlx5_fs_is_vxlan_flow mlx5_fs_is_outer_ipsec_flow Signed-off-by: Gal Pressman Reviewed-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 11 ---------- include/linux/mlx5/driver.h | 1 - include/linux/mlx5/fs_helpers.h | 48 ----------------------------------------- 3 files changed, 60 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 5b41b9fb3d48..753753f3ce12 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -874,12 +874,6 @@ static inline u8 get_cqe_opcode(struct mlx5_cqe64 *cqe) return cqe->op_own >> 4; } -static inline u8 get_cqe_enhanced_num_mini_cqes(struct mlx5_cqe64 *cqe) -{ - /* num_of_mini_cqes is zero based */ - return get_cqe_opcode(cqe) + 1; -} - static inline u8 get_cqe_lro_tcppsh(struct mlx5_cqe64 *cqe) { return (cqe->lro.tcppsh_abort_dupack >> 6) & 1; @@ -890,11 +884,6 @@ static inline u8 get_cqe_l4_hdr_type(struct mlx5_cqe64 *cqe) return (cqe->l4_l3_hdr_type >> 4) & 0x7; } -static inline u8 get_cqe_l3_hdr_type(struct mlx5_cqe64 *cqe) -{ - return (cqe->l4_l3_hdr_type >> 2) & 0x3; -} - static inline bool cqe_is_tunneled(struct mlx5_cqe64 *cqe) { return cqe->tls_outer_l3_tunneled & 0x1; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 9114caceb2b5..52f34fe47049 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1018,7 +1018,6 @@ int mlx5_cmd_exec_polling(struct mlx5_core_dev *dev, void *in, int in_size, bool mlx5_cmd_is_down(struct mlx5_core_dev *dev); int mlx5_core_get_caps(struct mlx5_core_dev *dev, enum mlx5_cap_type cap_type); -void mlx5_health_flush(struct mlx5_core_dev *dev); void mlx5_health_cleanup(struct mlx5_core_dev *dev); int mlx5_health_init(struct mlx5_core_dev *dev); void mlx5_start_health_poll(struct mlx5_core_dev *dev); diff --git a/include/linux/mlx5/fs_helpers.h b/include/linux/mlx5/fs_helpers.h index 9db21cd0e92c..bc5125bc0561 100644 --- a/include/linux/mlx5/fs_helpers.h +++ b/include/linux/mlx5/fs_helpers.h @@ -38,46 +38,6 @@ #define MLX5_FS_IPV4_VERSION 4 #define MLX5_FS_IPV6_VERSION 6 -static inline bool mlx5_fs_is_ipsec_flow(const u32 *match_c) -{ - void *misc_params_c = MLX5_ADDR_OF(fte_match_param, match_c, - misc_parameters); - - return MLX5_GET(fte_match_set_misc, misc_params_c, outer_esp_spi); -} - -static inline bool _mlx5_fs_is_outer_ipproto_flow(const u32 *match_c, - const u32 *match_v, u8 match) -{ - const void *headers_c = MLX5_ADDR_OF(fte_match_param, match_c, - outer_headers); - const void *headers_v = MLX5_ADDR_OF(fte_match_param, match_v, - outer_headers); - - return MLX5_GET(fte_match_set_lyr_2_4, headers_c, ip_protocol) == 0xff && - MLX5_GET(fte_match_set_lyr_2_4, headers_v, ip_protocol) == match; -} - -static inline bool mlx5_fs_is_outer_tcp_flow(const u32 *match_c, - const u32 *match_v) -{ - return _mlx5_fs_is_outer_ipproto_flow(match_c, match_v, IPPROTO_TCP); -} - -static inline bool mlx5_fs_is_outer_udp_flow(const u32 *match_c, - const u32 *match_v) -{ - return _mlx5_fs_is_outer_ipproto_flow(match_c, match_v, IPPROTO_UDP); -} - -static inline bool mlx5_fs_is_vxlan_flow(const u32 *match_c) -{ - void *misc_params_c = MLX5_ADDR_OF(fte_match_param, match_c, - misc_parameters); - - return MLX5_GET(fte_match_set_misc, misc_params_c, vxlan_vni); -} - static inline bool _mlx5_fs_is_outer_ipv_flow(struct mlx5_core_dev *mdev, const u32 *match_c, const u32 *match_v, int version) @@ -131,12 +91,4 @@ mlx5_fs_is_outer_ipv6_flow(struct mlx5_core_dev *mdev, const u32 *match_c, MLX5_FS_IPV6_VERSION); } -static inline bool mlx5_fs_is_outer_ipsec_flow(const u32 *match_c) -{ - void *misc_params_c = - MLX5_ADDR_OF(fte_match_param, match_c, misc_parameters); - - return MLX5_GET(fte_match_set_misc, misc_params_c, outer_esp_spi); -} - #endif -- cgit v1.2.3 From b53ff37fcd5c7038d9dd000dcf682575a8f47999 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Wed, 7 Sep 2022 16:36:31 -0700 Subject: net/mlx5: Remove unused structs Remove structs which are no longer used in the driver: mlx5dr_cmd_qp_create_attr mlx5_fs_dr_ns mlx5_pas Signed-off-by: Gal Pressman Reviewed-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 52f34fe47049..949feb7f889f 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -857,11 +857,6 @@ struct mlx5_cmd_work_ent { refcount_t refcnt; }; -struct mlx5_pas { - u64 pa; - u8 log_sz; -}; - enum phy_port_state { MLX5_AAA_111 }; -- cgit v1.2.3 From 9175d8103780084e70bc89f2040ea62dc30f6f60 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 7 Sep 2022 16:36:32 -0700 Subject: net/mlx5: Remove from FPGA IFC file not-needed definitions Move IP layout bits definitions to be close to the place that actually uses it, together with removal extra defines that not in-use. Reviewed-by: Raed Salem Signed-off-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 16 ++++++++++++++++ include/linux/mlx5/mlx5_ifc_fpga.h | 24 ------------------------ 2 files changed, 16 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index b7f4e93df0f2..c25f2ce75734 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -478,6 +478,22 @@ struct mlx5_ifc_odp_per_transport_service_cap_bits { u8 reserved_at_6[0x1a]; }; +struct mlx5_ifc_ipv4_layout_bits { + u8 reserved_at_0[0x60]; + + u8 ipv4[0x20]; +}; + +struct mlx5_ifc_ipv6_layout_bits { + u8 ipv6[16][0x8]; +}; + +union mlx5_ifc_ipv6_layout_ipv4_layout_auto_bits { + struct mlx5_ifc_ipv6_layout_bits ipv6_layout; + struct mlx5_ifc_ipv4_layout_bits ipv4_layout; + u8 reserved_at_0[0x80]; +}; + struct mlx5_ifc_fte_match_set_lyr_2_4_bits { u8 smac_47_16[0x20]; diff --git a/include/linux/mlx5/mlx5_ifc_fpga.h b/include/linux/mlx5/mlx5_ifc_fpga.h index 45c7c0d67635..0596472923ad 100644 --- a/include/linux/mlx5/mlx5_ifc_fpga.h +++ b/include/linux/mlx5/mlx5_ifc_fpga.h @@ -32,30 +32,6 @@ #ifndef MLX5_IFC_FPGA_H #define MLX5_IFC_FPGA_H -struct mlx5_ifc_ipv4_layout_bits { - u8 reserved_at_0[0x60]; - - u8 ipv4[0x20]; -}; - -struct mlx5_ifc_ipv6_layout_bits { - u8 ipv6[16][0x8]; -}; - -union mlx5_ifc_ipv6_layout_ipv4_layout_auto_bits { - struct mlx5_ifc_ipv6_layout_bits ipv6_layout; - struct mlx5_ifc_ipv4_layout_bits ipv4_layout; - u8 reserved_at_0[0x80]; -}; - -enum { - MLX5_FPGA_CAP_SANDBOX_VENDOR_ID_MLNX = 0x2c9, -}; - -enum { - MLX5_FPGA_CAP_SANDBOX_PRODUCT_ID_IPSEC = 0x2, -}; - struct mlx5_ifc_fpga_shell_caps_bits { u8 max_num_qps[0x10]; u8 reserved_at_10[0x8]; -- cgit v1.2.3 From 62e56ef57c04c0cacb33433d7984a4d71b690b3f Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sun, 25 Sep 2022 15:00:33 +0000 Subject: net: tls: Add ARIA-GCM algorithm RFC 6209 describes ARIA for TLS 1.2. ARIA-128-GCM and ARIA-256-GCM are defined in RFC 6209. This patch would offer performance increment and an opportunity for hardware offload. Benchmark results: iperf-ssl are used. CPU: intel i3-12100. TLS(openssl-3.0-dev) [ 3] 0.0- 1.0 sec 185 MBytes 1.55 Gbits/sec [ 3] 1.0- 2.0 sec 186 MBytes 1.56 Gbits/sec [ 3] 2.0- 3.0 sec 186 MBytes 1.56 Gbits/sec [ 3] 3.0- 4.0 sec 186 MBytes 1.56 Gbits/sec [ 3] 4.0- 5.0 sec 186 MBytes 1.56 Gbits/sec [ 3] 0.0- 5.0 sec 927 MBytes 1.56 Gbits/sec kTLS(aria-generic) [ 3] 0.0- 1.0 sec 198 MBytes 1.66 Gbits/sec [ 3] 1.0- 2.0 sec 194 MBytes 1.62 Gbits/sec [ 3] 2.0- 3.0 sec 194 MBytes 1.63 Gbits/sec [ 3] 3.0- 4.0 sec 194 MBytes 1.63 Gbits/sec [ 3] 4.0- 5.0 sec 194 MBytes 1.62 Gbits/sec [ 3] 0.0- 5.0 sec 974 MBytes 1.63 Gbits/sec kTLS(aria-avx wirh GFNI) [ 3] 0.0- 1.0 sec 632 MBytes 5.30 Gbits/sec [ 3] 1.0- 2.0 sec 657 MBytes 5.51 Gbits/sec [ 3] 2.0- 3.0 sec 657 MBytes 5.51 Gbits/sec [ 3] 3.0- 4.0 sec 656 MBytes 5.50 Gbits/sec [ 3] 4.0- 5.0 sec 656 MBytes 5.50 Gbits/sec [ 3] 0.0- 5.0 sec 3.18 GBytes 5.47 Gbits/sec Signed-off-by: Taehee Yoo Reviewed-by: Vadim Fedorenko Link: https://lore.kernel.org/r/20220925150033.24615-1-ap420073@gmail.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/tls.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/tls.h b/include/uapi/linux/tls.h index f1157d8f4acd..b66a800389cc 100644 --- a/include/uapi/linux/tls.h +++ b/include/uapi/linux/tls.h @@ -100,6 +100,20 @@ #define TLS_CIPHER_SM4_CCM_TAG_SIZE 16 #define TLS_CIPHER_SM4_CCM_REC_SEQ_SIZE 8 +#define TLS_CIPHER_ARIA_GCM_128 57 +#define TLS_CIPHER_ARIA_GCM_128_IV_SIZE 8 +#define TLS_CIPHER_ARIA_GCM_128_KEY_SIZE 16 +#define TLS_CIPHER_ARIA_GCM_128_SALT_SIZE 4 +#define TLS_CIPHER_ARIA_GCM_128_TAG_SIZE 16 +#define TLS_CIPHER_ARIA_GCM_128_REC_SEQ_SIZE 8 + +#define TLS_CIPHER_ARIA_GCM_256 58 +#define TLS_CIPHER_ARIA_GCM_256_IV_SIZE 8 +#define TLS_CIPHER_ARIA_GCM_256_KEY_SIZE 32 +#define TLS_CIPHER_ARIA_GCM_256_SALT_SIZE 4 +#define TLS_CIPHER_ARIA_GCM_256_TAG_SIZE 16 +#define TLS_CIPHER_ARIA_GCM_256_REC_SEQ_SIZE 8 + #define TLS_SET_RECORD_TYPE 1 #define TLS_GET_RECORD_TYPE 2 @@ -156,6 +170,22 @@ struct tls12_crypto_info_sm4_ccm { unsigned char rec_seq[TLS_CIPHER_SM4_CCM_REC_SEQ_SIZE]; }; +struct tls12_crypto_info_aria_gcm_128 { + struct tls_crypto_info info; + unsigned char iv[TLS_CIPHER_ARIA_GCM_128_IV_SIZE]; + unsigned char key[TLS_CIPHER_ARIA_GCM_128_KEY_SIZE]; + unsigned char salt[TLS_CIPHER_ARIA_GCM_128_SALT_SIZE]; + unsigned char rec_seq[TLS_CIPHER_ARIA_GCM_128_REC_SEQ_SIZE]; +}; + +struct tls12_crypto_info_aria_gcm_256 { + struct tls_crypto_info info; + unsigned char iv[TLS_CIPHER_ARIA_GCM_256_IV_SIZE]; + unsigned char key[TLS_CIPHER_ARIA_GCM_256_KEY_SIZE]; + unsigned char salt[TLS_CIPHER_ARIA_GCM_256_SALT_SIZE]; + unsigned char rec_seq[TLS_CIPHER_ARIA_GCM_256_REC_SEQ_SIZE]; +}; + enum { TLS_INFO_UNSPEC, TLS_INFO_VERSION, -- cgit v1.2.3 From f0d74c4da1f060d2a66976193712a5e6abd361f5 Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Mon, 26 Sep 2022 11:49:53 -0700 Subject: bpf: Parameterize task iterators. Allow creating an iterator that loops through resources of one thread/process. People could only create iterators to loop through all resources of files, vma, and tasks in the system, even though they were interested in only the resources of a specific task or process. Passing the additional parameters, people can now create an iterator to go through all resources or only the resources of a task. Signed-off-by: Kui-Feng Lee Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20220926184957.208194-2-kuifeng@fb.com --- include/linux/bpf.h | 25 +++++++++++++++++++++++++ include/uapi/linux/bpf.h | 6 ++++++ 2 files changed, 31 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5161fac0513f..0f3eaf3ed98c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1796,6 +1796,27 @@ int bpf_obj_get_user(const char __user *pathname, int flags); extern int bpf_iter_ ## target(args); \ int __init bpf_iter_ ## target(args) { return 0; } +/* + * The task type of iterators. + * + * For BPF task iterators, they can be parameterized with various + * parameters to visit only some of tasks. + * + * BPF_TASK_ITER_ALL (default) + * Iterate over resources of every task. + * + * BPF_TASK_ITER_TID + * Iterate over resources of a task/tid. + * + * BPF_TASK_ITER_TGID + * Iterate over resources of every task of a process / task group. + */ +enum bpf_iter_task_type { + BPF_TASK_ITER_ALL = 0, + BPF_TASK_ITER_TID, + BPF_TASK_ITER_TGID, +}; + struct bpf_iter_aux_info { /* for map_elem iter */ struct bpf_map *map; @@ -1805,6 +1826,10 @@ struct bpf_iter_aux_info { struct cgroup *start; /* starting cgroup */ enum bpf_cgroup_iter_order order; } cgroup; + struct { + enum bpf_iter_task_type type; + u32 pid; + } task; }; typedef int (*bpf_iter_attach_target_t)(struct bpf_prog *prog, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d6bd10759eaf..455b21a53aac 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -110,6 +110,12 @@ union bpf_iter_link_info { __u32 cgroup_fd; __u64 cgroup_id; } cgroup; + /* Parameters of task iterators. */ + struct { + __u32 tid; + __u32 pid; + __u32 pid_fd; + } task; }; /* BPF syscall commands, see bpf(2) man-page for more details. */ -- cgit v1.2.3 From 21fb6f2aa3890b0d0abf88b7756d0098e9367a7c Mon Sep 17 00:00:00 2001 From: Kui-Feng Lee Date: Mon, 26 Sep 2022 11:49:54 -0700 Subject: bpf: Handle bpf_link_info for the parameterized task BPF iterators. Add new fields to bpf_link_info that users can query it through bpf_obj_get_info_by_fd(). Signed-off-by: Kui-Feng Lee Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20220926184957.208194-3-kuifeng@fb.com --- include/uapi/linux/bpf.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 455b21a53aac..3075018a4ef8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6265,6 +6265,10 @@ struct bpf_link_info { __u64 cgroup_id; __u32 order; } cgroup; + struct { + __u32 tid; + __u32 pid; + } task; }; } iter; struct { -- cgit v1.2.3 From 6eaab4dfdd30ea8fb0dd4ee04940676c12b728e8 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 23 Sep 2022 17:39:01 +0100 Subject: net: introduce struct ubuf_info_msgzc We're going to split struct ubuf_info and leave there only mandatory fields. Users are free to extend it. Add struct ubuf_info_msgzc, which will be an extended version for MSG_ZEROCOPY and some other users. It duplicates of struct ubuf_info for now and will be removed in a couple of patches. Signed-off-by: Pavel Begunkov Acked-by: Paolo Abeni Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f15d5b62539b..fd7dcb977fdf 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -554,7 +554,28 @@ struct ubuf_info { } mmp; }; +struct ubuf_info_msgzc { + struct ubuf_info ubuf; + + union { + struct { + unsigned long desc; + void *ctx; + }; + struct { + u32 id; + u16 len; + u16 zerocopy:1; + u32 bytelen; + }; + }; + + struct mmpin mmp; +}; + #define skb_uarg(SKB) ((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg)) +#define uarg_to_msgzc(ubuf_ptr) container_of((ubuf_ptr), struct ubuf_info_msgzc, \ + ubuf) int mm_account_pinned_pages(struct mmpin *mmp, size_t size); void mm_unaccount_pinned_pages(struct mmpin *mmp); -- cgit v1.2.3 From e7d2b510165fff6bedc9cca88c071ad846850c74 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 23 Sep 2022 17:39:04 +0100 Subject: net: shrink struct ubuf_info We can benefit from a smaller struct ubuf_info, so leave only mandatory fields and let users to decide how they want to extend it. Convert MSG_ZEROCOPY to struct ubuf_info_msgzc and remove duplicated fields. This reduces the size from 48 bytes to just 16. Signed-off-by: Pavel Begunkov Acked-by: Paolo Abeni Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index fd7dcb977fdf..920eb6413fee 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -533,25 +533,8 @@ enum { struct ubuf_info { void (*callback)(struct sk_buff *, struct ubuf_info *, bool zerocopy_success); - union { - struct { - unsigned long desc; - void *ctx; - }; - struct { - u32 id; - u16 len; - u16 zerocopy:1; - u32 bytelen; - }; - }; refcount_t refcnt; u8 flags; - - struct mmpin { - struct user_struct *user; - unsigned int num_pg; - } mmp; }; struct ubuf_info_msgzc { @@ -570,7 +553,10 @@ struct ubuf_info_msgzc { }; }; - struct mmpin mmp; + struct mmpin { + struct user_struct *user; + unsigned int num_pg; + } mmp; }; #define skb_uarg(SKB) ((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg)) -- cgit v1.2.3 From 63a8bf85568b30438431d4d78afb2fde4a280760 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 26 Sep 2022 18:02:20 -0500 Subject: netns: Replace zero-length array with DECLARE_FLEX_ARRAY() helper Zero-length arrays are deprecated and we are moving towards adopting C99 flexible-array members, instead. So, replace zero-length arrays declarations in anonymous union with the new DECLARE_FLEX_ARRAY() helper macro. This helper allows for flexible-array members in unions. Link: https://github.com/KSPP/linux/issues/193 Link: https://github.com/KSPP/linux/issues/225 Link: https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html Signed-off-by: Gustavo A. R. Silva Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/YzIvfGXxfjdXmIS3@work Signed-off-by: Jakub Kicinski --- include/net/netns/generic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netns/generic.h b/include/net/netns/generic.h index 7ce68183f6e1..00c399edeed1 100644 --- a/include/net/netns/generic.h +++ b/include/net/netns/generic.h @@ -33,7 +33,7 @@ struct net_generic { struct rcu_head rcu; } s; - void *ptr[0]; + DECLARE_FLEX_ARRAY(void *, ptr); }; }; -- cgit v1.2.3 From 3242abeb8da7071fa40d32346ed36343bee33b80 Mon Sep 17 00:00:00 2001 From: Benjamin Hesmans Date: Mon, 26 Sep 2022 16:27:37 -0700 Subject: tcp: export tcp_sendmsg_fastopen It will be used to support TCP FastOpen with MPTCP in the following commit. Acked-by: Paolo Abeni Reviewed-by: Matthieu Baerts Co-developed-by: Dmytro Shytyi Signed-off-by: Dmytro Shytyi Signed-off-by: Benjamin Hesmans Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 27e8d378c70a..4f71cc15ff8e 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -327,6 +327,8 @@ void tcp_remove_empty_skb(struct sock *sk); int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw); int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size); +int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied, + size_t size, struct ubuf_info *uarg); int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset, -- cgit v1.2.3 From 5456262d2baa43c38e0c770543d5a31b0942f41c Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 26 Sep 2022 17:25:44 -0700 Subject: net: Fix incorrect address comparison when searching for a bind2 bucket The v6_rcv_saddr and rcv_saddr are inside a union in the 'struct inet_bind2_bucket'. When searching a bucket by following the bhash2 hashtable chain, eg. inet_bind2_bucket_match, it is only using the sk->sk_family and there is no way to check if the inet_bind2_bucket has a v6 or v4 address in the union. This leads to an uninit-value KMSAN report in [0] and also potentially incorrect matches. This patch fixes it by adding a family member to the inet_bind2_bucket and then tests 'sk->sk_family != tb->family' before matching the sk's address to the tb's address. Cc: Joanne Koong Fixes: 28044fc1d495 ("net: Add a bhash2 table hashed by port and address") Signed-off-by: Martin KaFai Lau Reviewed-by: Eric Dumazet Tested-by: Alexander Potapenko Link: https://lore.kernel.org/r/20220927002544.3381205-1-kafai@fb.com Signed-off-by: Jakub Kicinski --- include/net/inet_hashtables.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 9121ccab1fa1..3af1e927247d 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -95,6 +95,9 @@ struct inet_bind2_bucket { possible_net_t ib_net; int l3mdev; unsigned short port; +#if IS_ENABLED(CONFIG_IPV6) + unsigned short family; +#endif union { #if IS_ENABLED(CONFIG_IPV6) struct in6_addr v6_rcv_saddr; -- cgit v1.2.3 From b48b89f9c189d24eb5e2b4a0ac067da5a24ee86d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 27 Sep 2022 06:27:53 -0700 Subject: net: drop the weight argument from netif_napi_add We tell driver developers to always pass NAPI_POLL_WEIGHT as the weight to netif_napi_add(). This may be confusing to newcomers, drop the weight argument, those who really need to tweak the weight can use netif_napi_add_weight(). Acked-by: Marc Kleine-Budde # for CAN Link: https://lore.kernel.org/r/20220927132753.750069-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9f42fc871c3b..7a084a1c14e9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2553,16 +2553,15 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, * @dev: network device * @napi: NAPI context * @poll: polling function - * @weight: default weight * * netif_napi_add() must be used to initialize a NAPI context prior to calling * *any* of the other NAPI-related functions. */ static inline void netif_napi_add(struct net_device *dev, struct napi_struct *napi, - int (*poll)(struct napi_struct *, int), int weight) + int (*poll)(struct napi_struct *, int)) { - netif_napi_add_weight(dev, napi, poll, weight); + netif_napi_add_weight(dev, napi, poll, NAPI_POLL_WEIGHT); } static inline void -- cgit v1.2.3 From 40b72108f9c609c5043903efb70c52d46b8aa871 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Tue, 27 Sep 2022 13:35:56 -0700 Subject: net/mlx5: Add the log_min_mkey_entity_size capability Add the capability that will allow the driver to determine the minimal MTT page size to be able to map the smallest possible pages in XSK. The older firmwares that don't have this capability default to 12 (i.e. 4096-byte pages). Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Reviewed-by: Saeed Mahameed Signed-off-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- include/linux/mlx5/mlx5_ifc.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 3d963c0e47e4..1ad762e22d86 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1878,7 +1878,13 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 max_reformat_remove_size[0x8]; u8 max_reformat_remove_offset[0x8]; - u8 reserved_at_c0[0x160]; + u8 reserved_at_c0[0xe0]; + + u8 reserved_at_1a0[0xb]; + u8 log_min_mkey_entity_size[0x5]; + u8 reserved_at_1b0[0x10]; + + u8 reserved_at_1c0[0x60]; u8 reserved_at_220[0x1]; u8 sw_vhca_id_valid[0x1]; -- cgit v1.2.3 From e1e10b44cf284248fb099681f48cc723564a1cc8 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Tue, 27 Sep 2022 17:45:29 +0200 Subject: xfrm: pass extack down to xfrm_type ->init_state Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index c504d07bcb7c..dbc81f5eb553 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -405,7 +405,8 @@ struct xfrm_type { #define XFRM_TYPE_LOCAL_COADDR 4 #define XFRM_TYPE_REMOTE_COADDR 8 - int (*init_state)(struct xfrm_state *x); + int (*init_state)(struct xfrm_state *x, + struct netlink_ext_ack *extack); void (*destructor)(struct xfrm_state *); int (*input)(struct xfrm_state *, struct sk_buff *skb); int (*output)(struct xfrm_state *, struct sk_buff *pskb); -- cgit v1.2.3 From 6ee55320520e31f5dae637e928d5792352b22776 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Tue, 27 Sep 2022 17:45:33 +0200 Subject: xfrm: ipcomp: add extack to ipcomp{4,6}_init_state And the shared helper ipcomp_init_state. Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/ipcomp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/ipcomp.h b/include/net/ipcomp.h index c31108295079..8660a2a6d1fc 100644 --- a/include/net/ipcomp.h +++ b/include/net/ipcomp.h @@ -22,7 +22,7 @@ struct xfrm_state; int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb); int ipcomp_output(struct xfrm_state *x, struct sk_buff *skb); void ipcomp_destroy(struct xfrm_state *x); -int ipcomp_init_state(struct xfrm_state *x); +int ipcomp_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack); static inline struct ip_comp_hdr *ip_comp_hdr(const struct sk_buff *skb) { -- cgit v1.2.3 From 64696c40d03c01e0ea2e3e9aa1c490a7b6a1b6be Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 29 Sep 2022 00:04:03 -0700 Subject: bpf: Add __bpf_prog_{enter,exit}_struct_ops for struct_ops trampoline The struct_ops prog is to allow using bpf to implement the functions in a struct (eg. kernel module). The current usage is to implement the tcp_congestion. The kernel does not call the tcp-cc's ops (ie. the bpf prog) in a recursive way. The struct_ops is sharing the tracing-trampoline's enter/exit function which tracks prog->active to avoid recursion. It is needed for tracing prog. However, it turns out the struct_ops bpf prog will hit this prog->active and unnecessarily skipped running the struct_ops prog. eg. The '.ssthresh' may run in_task() and then interrupted by softirq that runs the same '.ssthresh'. Skip running the '.ssthresh' will end up returning random value to the caller. The patch adds __bpf_prog_{enter,exit}_struct_ops for the struct_ops trampoline. They do not track the prog->active to detect recursion. One exception is when the tcp_congestion's '.init' ops is doing bpf_setsockopt(TCP_CONGESTION) and then recurs to the same '.init' ops. This will be addressed in the following patches. Fixes: ca06f55b9002 ("bpf: Add per-program recursion prevention mechanism") Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20220929070407.965581-2-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0f3eaf3ed98c..9e7d46d16032 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -864,6 +864,10 @@ u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx); void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx); +u64 notrace __bpf_prog_enter_struct_ops(struct bpf_prog *prog, + struct bpf_tramp_run_ctx *run_ctx); +void notrace __bpf_prog_exit_struct_ops(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx); void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr); void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr); -- cgit v1.2.3 From 061ff040710e9f6f043d1fa80b1b362d2845b17a Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 29 Sep 2022 00:04:06 -0700 Subject: bpf: tcp: Stop bpf_setsockopt(TCP_CONGESTION) in init ops to recur itself When a bad bpf prog '.init' calls bpf_setsockopt(TCP_CONGESTION, "itself"), it will trigger this loop: .init => bpf_setsockopt(tcp_cc) => .init => bpf_setsockopt(tcp_cc) ... ... => .init => bpf_setsockopt(tcp_cc). It was prevented by the prog->active counter before but the prog->active detection cannot be used in struct_ops as explained in the earlier patch of the set. In this patch, the second bpf_setsockopt(tcp_cc) is not allowed in order to break the loop. This is done by using a bit of an existing 1 byte hole in tcp_sock to check if there is on-going bpf_setsockopt(TCP_CONGESTION) in this tcp_sock. Note that this essentially limits only the first '.init' can call bpf_setsockopt(TCP_CONGESTION) to pick a fallback cc (eg. peer does not support ECN) and the second '.init' cannot fallback to another cc. This applies even the second bpf_setsockopt(TCP_CONGESTION) will not cause a loop. Signed-off-by: Martin KaFai Lau Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20220929070407.965581-5-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/tcp.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index a9fbe22732c3..3bdf687e2fb3 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -388,6 +388,12 @@ struct tcp_sock { u8 bpf_sock_ops_cb_flags; /* Control calling BPF programs * values defined in uapi/linux/tcp.h */ + u8 bpf_chg_cc_inprogress:1; /* In the middle of + * bpf_setsockopt(TCP_CONGESTION), + * it is to avoid the bpf_tcp_cc->init() + * to recur itself by calling + * bpf_setsockopt(TCP_CONGESTION, "itself"). + */ #define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) (TP->bpf_sock_ops_cb_flags & ARG) #else #define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) 0 -- cgit v1.2.3 From 5493a2ad0d20944b16aba7ed7a951a43ad1f5fba Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 27 Sep 2022 14:23:06 -0700 Subject: docs: netlink: clarify the historical baggage of Netlink flags nlmsg_flags are full of historical baggage, inconsistencies and strangeness. Try to document it more thoroughly. Explain the meaning of the ECHO flag (and while at it clarify the comment in the uAPI). Handwave a little about the NEW request flags and how they make sense on the surface but cater to really old paradigm before commands were a thing. I will add more notes on how to make use of ECHO and discouragement for reuse of flags to the kernel-side documentation. Link: https://lore.kernel.org/r/20220927212306.823862-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/netlink.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index e0689dbd2cde..e2ae82e3f9f7 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -62,7 +62,7 @@ struct nlmsghdr { #define NLM_F_REQUEST 0x01 /* It is request message. */ #define NLM_F_MULTI 0x02 /* Multipart message, terminated by NLMSG_DONE */ #define NLM_F_ACK 0x04 /* Reply with ack, with zero or error code */ -#define NLM_F_ECHO 0x08 /* Echo this request */ +#define NLM_F_ECHO 0x08 /* Receive resulting notifications */ #define NLM_F_DUMP_INTR 0x10 /* Dump was inconsistent due to sequence change */ #define NLM_F_DUMP_FILTERED 0x20 /* Dump was filtered as requested */ -- cgit v1.2.3 From dbae2b062824fc2d35ae2d5df2f500626c758e80 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 28 Sep 2022 10:43:09 +0200 Subject: net: skb: introduce and use a single page frag cache After commit 3226b158e67c ("net: avoid 32 x truesize under-estimation for tiny skbs") we are observing 10-20% regressions in performance tests with small packets. The perf trace points to high pressure on the slab allocator. This change tries to improve the allocation schema for small packets using an idea originally suggested by Eric: a new per CPU page frag is introduced and used in __napi_alloc_skb to cope with small allocation requests. To ensure that the above does not lead to excessive truesize underestimation, the frag size for small allocation is inflated to 1K and all the above is restricted to build with 4K page size. Note that we need to update accordingly the run-time check introduced with commit fd9ea57f4e95 ("net: add napi_get_frags_check() helper"). Alex suggested a smart page refcount schema to reduce the number of atomic operations and deal properly with pfmemalloc pages. Under small packet UDP flood, I measure a 15% peak tput increases. Suggested-by: Eric Dumazet Suggested-by: Alexander H Duyck Signed-off-by: Paolo Abeni Reviewed-by: Eric Dumazet Reviewed-by: Alexander Duyck Link: https://lore.kernel.org/r/6b6f65957c59f86a353fc09a5127e83a32ab5999.1664350652.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7a084a1c14e9..64c3a5864969 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3821,6 +3821,7 @@ void netif_receive_skb_list(struct list_head *head); gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb); void napi_gro_flush(struct napi_struct *napi, bool flush_old); struct sk_buff *napi_get_frags(struct napi_struct *napi); +void napi_get_frags_check(struct napi_struct *napi); gro_result_t napi_gro_frags(struct napi_struct *napi); struct packet_offload *gro_find_receive_by_type(__be16 type); struct packet_offload *gro_find_complete_by_type(__be16 type); -- cgit v1.2.3 From aac4daa8941ea6566563ac001e9e5d4e54a674e2 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 28 Sep 2022 12:51:57 +0300 Subject: net/sched: query offload capabilities through ndo_setup_tc() When adding optional new features to Qdisc offloads, existing drivers must reject the new configuration until they are coded up to act on it. Since modifying all drivers in lockstep with the changes in the Qdisc can create problems of its own, it would be nice if there existed an automatic opt-in mechanism for offloading optional features. Jakub proposes that we multiplex one more kind of call through ndo_setup_tc(): one where the driver populates a Qdisc-specific capability structure. First user will be taprio in further changes. Here we are introducing the definitions for the base functionality. Link: https://patchwork.kernel.org/project/netdevbpf/patch/20220923163310.3192733-3-vladimir.oltean@nxp.com/ Suggested-by: Jakub Kicinski Co-developed-by: Jakub Kicinski Signed-off-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 1 + include/net/pkt_sched.h | 5 +++++ include/net/sch_generic.h | 3 +++ 3 files changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 64c3a5864969..eddf8ee270e7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -940,6 +940,7 @@ struct net_device_path_ctx { }; enum tc_setup_type { + TC_QUERY_CAPS, TC_SETUP_QDISC_MQPRIO, TC_SETUP_CLSU32, TC_SETUP_CLSFLOWER, diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 2ff80cd04c5c..34600292fdfb 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -141,6 +141,11 @@ static inline struct net *qdisc_net(struct Qdisc *q) return dev_net(q->dev_queue->dev); } +struct tc_query_caps_base { + enum tc_setup_type type; + void *caps; +}; + struct tc_cbs_qopt_offload { u8 enable; s32 queue; diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 32819299937d..d5517719af4e 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -677,6 +677,9 @@ qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch, { } #endif +void qdisc_offload_query_caps(struct net_device *dev, + enum tc_setup_type type, + void *caps, size_t caps_len); struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, const struct Qdisc_ops *ops, struct netlink_ext_ack *extack); -- cgit v1.2.3 From a54fc09e4cba3004443aa05979f8c678196c8226 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 28 Sep 2022 12:51:58 +0300 Subject: net/sched: taprio: allow user input of per-tc max SDU IEEE 802.1Q clause 12.29.1.1 "The queueMaxSDUTable structure and data types" and 8.6.8.4 "Enhancements for scheduled traffic" talk about the existence of a per traffic class limitation of maximum frame sizes, with a fallback on the port-based MTU. As far as I am able to understand, the 802.1Q Service Data Unit (SDU) represents the MAC Service Data Unit (MSDU, i.e. L2 payload), excluding any number of prepended VLAN headers which may be otherwise present in the MSDU. Therefore, the queueMaxSDU is directly comparable to the device MTU (1500 means L2 payload sizes are accepted, or frame sizes of 1518 octets, or 1522 plus one VLAN header). Drivers which offload this are directly responsible of translating into other units of measurement. To keep the fast path checks optimized, we keep 2 arrays in the qdisc, one for max_sdu translated into frame length (so that it's comparable to skb->len), and another for offloading and for dumping back to the user. Signed-off-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- include/net/pkt_sched.h | 5 +++++ include/uapi/linux/pkt_sched.h | 11 +++++++++++ 2 files changed, 16 insertions(+) (limited to 'include') diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 34600292fdfb..38207873eda6 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -160,6 +160,10 @@ struct tc_etf_qopt_offload { s32 queue; }; +struct tc_taprio_caps { + bool supports_queue_max_sdu:1; +}; + struct tc_taprio_sched_entry { u8 command; /* TC_TAPRIO_CMD_* */ @@ -173,6 +177,7 @@ struct tc_taprio_qopt_offload { ktime_t base_time; u64 cycle_time; u64 cycle_time_extension; + u32 max_sdu[TC_MAX_QUEUE]; size_t num_entries; struct tc_taprio_sched_entry entries[]; diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index f292b467b27f..000eec106856 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -1232,6 +1232,16 @@ enum { #define TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST _BITUL(0) #define TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD _BITUL(1) +enum { + TCA_TAPRIO_TC_ENTRY_UNSPEC, + TCA_TAPRIO_TC_ENTRY_INDEX, /* u32 */ + TCA_TAPRIO_TC_ENTRY_MAX_SDU, /* u32 */ + + /* add new constants above here */ + __TCA_TAPRIO_TC_ENTRY_CNT, + TCA_TAPRIO_TC_ENTRY_MAX = (__TCA_TAPRIO_TC_ENTRY_CNT - 1) +}; + enum { TCA_TAPRIO_ATTR_UNSPEC, TCA_TAPRIO_ATTR_PRIOMAP, /* struct tc_mqprio_qopt */ @@ -1245,6 +1255,7 @@ enum { TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION, /* s64 */ TCA_TAPRIO_ATTR_FLAGS, /* u32 */ TCA_TAPRIO_ATTR_TXTIME_DELAY, /* u32 */ + TCA_TAPRIO_ATTR_TC_ENTRY, /* nest */ __TCA_TAPRIO_ATTR_MAX, }; -- cgit v1.2.3 From d427c8999b071af2003203b42a007c4a80156c18 Mon Sep 17 00:00:00 2001 From: Richard Gobert Date: Wed, 28 Sep 2022 14:55:31 +0200 Subject: net-next: skbuff: refactor pskb_pull pskb_may_pull already contains all of the checks performed by pskb_pull. Use pskb_may_pull for validation in pskb_pull, eliminating the duplication and making __pskb_pull obsolete. Replace __pskb_pull with pskb_pull where applicable. Signed-off-by: Richard Gobert Signed-off-by: David S. Miller --- include/linux/skbuff.h | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 920eb6413fee..9fcf534f2d92 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2616,20 +2616,6 @@ void *skb_pull_data(struct sk_buff *skb, size_t len); void *__pskb_pull_tail(struct sk_buff *skb, int delta); -static inline void *__pskb_pull(struct sk_buff *skb, unsigned int len) -{ - if (len > skb_headlen(skb) && - !__pskb_pull_tail(skb, len - skb_headlen(skb))) - return NULL; - skb->len -= len; - return skb->data += len; -} - -static inline void *pskb_pull(struct sk_buff *skb, unsigned int len) -{ - return unlikely(len > skb->len) ? NULL : __pskb_pull(skb, len); -} - static inline bool pskb_may_pull(struct sk_buff *skb, unsigned int len) { if (likely(len <= skb_headlen(skb))) @@ -2639,6 +2625,15 @@ static inline bool pskb_may_pull(struct sk_buff *skb, unsigned int len) return __pskb_pull_tail(skb, len - skb_headlen(skb)) != NULL; } +static inline void *pskb_pull(struct sk_buff *skb, unsigned int len) +{ + if (!pskb_may_pull(skb, len)) + return NULL; + + skb->len -= len; + return skb->data += len; +} + void skb_condense(struct sk_buff *skb); /** -- cgit v1.2.3 From f4ce91ce12a7c6ead19b128ffa8cff6e3ded2a14 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Wed, 28 Sep 2022 16:03:31 -0400 Subject: tcp: fix tcp_cwnd_validate() to not forget is_cwnd_limited This commit fixes a bug in the tracking of max_packets_out and is_cwnd_limited. This bug can cause the connection to fail to remember that is_cwnd_limited is true, causing the connection to fail to grow cwnd when it should, causing throughput to be lower than it should be. The following event sequence is an example that triggers the bug: (a) The connection is cwnd_limited, but packets_out is not at its peak due to TSO deferral deciding not to send another skb yet. In such cases the connection can advance max_packets_seq and set tp->is_cwnd_limited to true and max_packets_out to a small number. (b) Then later in the round trip the connection is pacing-limited (not cwnd-limited), and packets_out is larger. In such cases the connection would raise max_packets_out to a bigger number but (unexpectedly) flip tp->is_cwnd_limited from true to false. This commit fixes that bug. One straightforward fix would be to separately track (a) the next window after max_packets_out reaches a maximum, and (b) the next window after tp->is_cwnd_limited is set to true. But this would require consuming an extra u32 sequence number. Instead, to save space we track only the most important information. Specifically, we track the strongest available signal of the degree to which the cwnd is fully utilized: (1) If the connection is cwnd-limited then we remember that fact for the current window. (2) If the connection not cwnd-limited then we track the maximum number of outstanding packets in the current window. In particular, note that the new logic cannot trigger the buggy (a)/(b) sequence above because with the new logic a condition where tp->packets_out > tp->max_packets_out can only trigger an update of tp->is_cwnd_limited if tp->is_cwnd_limited is false. This first showed up in a testing of a BBRv2 dev branch, but this buggy behavior highlighted a general issue with the tcp_cwnd_validate() logic that can cause cwnd to fail to increase at the proper rate for any TCP congestion control, including Reno or CUBIC. Fixes: ca8a22634381 ("tcp: make cwnd-limited checks measurement-based, and gentler") Signed-off-by: Neal Cardwell Signed-off-by: Kevin(Yudong) Yang Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 2 +- include/net/tcp.h | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index a9fbe22732c3..4791fd801945 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -295,7 +295,7 @@ struct tcp_sock { u32 packets_out; /* Packets which are "in flight" */ u32 retrans_out; /* Retransmitted packets out */ u32 max_packets_out; /* max packets_out in last window */ - u32 max_packets_seq; /* right edge of max_packets_out flight */ + u32 cwnd_usage_seq; /* right edge of cwnd usage tracking flight */ u16 urg_data; /* Saved octet of OOB data and control flags */ u8 ecn_flags; /* ECN status bits. */ diff --git a/include/net/tcp.h b/include/net/tcp.h index d10962b9f0d0..95c1d51393ac 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1295,11 +1295,14 @@ static inline bool tcp_is_cwnd_limited(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); + if (tp->is_cwnd_limited) + return true; + /* If in slow start, ensure cwnd grows to twice what was ACKed. */ if (tcp_in_slow_start(tp)) return tcp_snd_cwnd(tp) < 2 * tp->max_packets_out; - return tp->is_cwnd_limited; + return false; } /* BBR congestion control needs pacing. -- cgit v1.2.3 From 9ca66afe73da16cd2c529e6770cc5aea0c4454df Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Thu, 29 Sep 2022 00:21:41 -0700 Subject: xsk: Expose min chunk size to drivers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drivers should be aware of the range of valid UMEM chunk sizes to be able to allocate their internal structures of an appropriate size. It will be used by mlx5e in the following patches. Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan CC: "Björn Töpel" CC: Magnus Karlsson CC: Maciej Fijalkowski Signed-off-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- include/net/xdp_sock_drv.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 0e58c38ce0c1..6406faa3d57d 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -9,6 +9,9 @@ #include #include +#define XDP_UMEM_MIN_CHUNK_SHIFT 11 +#define XDP_UMEM_MIN_CHUNK_SIZE (1 << XDP_UMEM_MIN_CHUNK_SHIFT) + #ifdef CONFIG_XDP_SOCKETS void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries); -- cgit v1.2.3 From e5a3cc83d54019a90119ce0cf17b5e21729b79bf Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Thu, 29 Sep 2022 00:21:42 -0700 Subject: net/mlx5e: Use runtime page_shift for striding RQ This commit allows striding RQ to determine MTT page size at runtime, instead of sticking to the compile-time PAGE_SIZE. This functionality will be used by a following commit that adjusts the MTT page size to the XSK frame size. Stick with PAGE_SIZE for XSK on legacy RQ, as frag_stride is not used in data path, it only helps calculate how pages are partitioned into fragments, and PAGE_SIZE will ensure each fragment starts at the beginning of a new allocation unit (XSK frame). Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- include/linux/mlx5/qp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index be640c749d5e..afac93f9552c 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -162,6 +162,8 @@ enum { MLX5_SEND_WQE_MAX_WQEBBS = 16, }; +#define MLX5_SEND_WQE_MAX_SIZE (MLX5_SEND_WQE_MAX_WQEBBS * MLX5_SEND_WQE_BB) + enum { MLX5_WQE_FMR_PERM_LOCAL_READ = 1 << 27, MLX5_WQE_FMR_PERM_LOCAL_WRITE = 1 << 28, -- cgit v1.2.3 From 6470d2e7e8ed8e9dd560d8dc3e09d1100a17ee26 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Thu, 29 Sep 2022 00:21:46 -0700 Subject: net/mlx5e: xsk: Use KSM for unaligned XSK UMR MTTs used in striding RQ have certain alignment requirements. While it's guaranteed to work when UMR pages are aligned to the UMR page size, in practice it works then UMR pages are aligned to 8 bytes. However, it's still not enough flexibility for the unaligned mode of XSK. This patch leverages KSM to map UMR pages without alignment requirements, when unaligned XSK is active. The downside is that KSM entries are twice as big as MTTs, which limits the maximum WQE size, so regular RQs and aligned XSK continue using MTTs. Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- include/linux/mlx5/qp.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index afac93f9552c..4657d5c54abe 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -478,6 +478,12 @@ struct mlx5_klm { __be64 va; }; +struct mlx5_ksm { + __be32 reserved; + __be32 key; + __be64 va; +}; + struct mlx5_stride_block_entry { __be16 stride; __be16 bcount; -- cgit v1.2.3 From f2f1675836015e79ba9720d4f7f02441fd0bb5e5 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Thu, 29 Sep 2022 00:21:47 -0700 Subject: xsk: Remove unused xsk_buff_discard MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous commit removed the last usage of xsk_buff_discard in mlx5e, so the function that is no longer used can be removed. Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan CC: "Björn Töpel" CC: Magnus Karlsson CC: Maciej Fijalkowski Signed-off-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- include/net/xdp_sock_drv.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include') diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 6406faa3d57d..9c0d860609ba 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -107,13 +107,6 @@ static inline void xsk_buff_free(struct xdp_buff *xdp) xp_free(xskb); } -static inline void xsk_buff_discard(struct xdp_buff *xdp) -{ - struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); - - xp_release(xskb); -} - static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size) { xdp->data = xdp->data_hard_start + XDP_PACKET_HEADROOM; -- cgit v1.2.3 From 081adcfe930e4b01a55eaa329b2e453a442f35a9 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 29 Sep 2022 09:28:57 +0200 Subject: net: devlink: introduce a flag to indicate devlink port being registered Instead of relying on devlink pointer not being initialized, introduce an extra flag to indicate if devlink port is registered. This is needed as later on devlink pointer is going to be initialized even in case devlink port is not registered yet. Signed-off-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 264aa98e6da6..bcacd8dab297 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -129,7 +129,8 @@ struct devlink_port { void *type_dev; struct devlink_port_attrs attrs; u8 attrs_set:1, - switch_port:1; + switch_port:1, + registered:1; struct delayed_work type_warn_dw; struct list_head reporter_list; struct mutex reporters_lock; /* Protects reporter_list */ -- cgit v1.2.3 From ae3bbc04d4bfef5d0332cd4edda3ac8f714cea23 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 29 Sep 2022 09:28:58 +0200 Subject: net: devlink: add port_init/fini() helpers to allow pre-register/post-unregister functions Lifetime of some of the devlink objects, like regions, is currently forced to be different for devlink instance and devlink port instance (per-port regions). The reason is that for devlink ports, the internal structures initialization happens only after devlink_port_register() is called. To resolve this inconsistency, introduce new set of helpers to allow driver to initialize devlink pointer and region list before devlink_register() is called. That allows port regions to be created before devlink port registration and destroyed after devlink port unregistration. Signed-off-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index bcacd8dab297..ba6b8b094943 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -130,7 +130,8 @@ struct devlink_port { struct devlink_port_attrs attrs; u8 attrs_set:1, switch_port:1, - registered:1; + registered:1, + initialized:1; struct delayed_work type_warn_dw; struct list_head reporter_list; struct mutex reporters_lock; /* Protects reporter_list */ @@ -1563,6 +1564,9 @@ void devlink_set_features(struct devlink *devlink, u64 features); void devlink_register(struct devlink *devlink); void devlink_unregister(struct devlink *devlink); void devlink_free(struct devlink *devlink); +void devlink_port_init(struct devlink *devlink, + struct devlink_port *devlink_port); +void devlink_port_fini(struct devlink_port *devlink_port); int devl_port_register(struct devlink *devlink, struct devlink_port *devlink_port, unsigned int port_index); -- cgit v1.2.3 From 61e4a51621587c939672d6a9354f6d0aa3d4e131 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 29 Sep 2022 09:29:02 +0200 Subject: net: dsa: remove bool devlink_port_setup Since dsa_port_devlink_setup() and dsa_port_devlink_teardown() are already called from code paths which only execute once per port (due to the existing bool dp->setup), keeping another dp->devlink_port_setup is redundant, because we can already manage to balance the calls properly (and not call teardown when setup was never called, or call setup twice, or things like that). Signed-off-by: Vladimir Oltean Signed-off-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index d777eac5694f..ee369670e20e 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -294,8 +294,6 @@ struct dsa_port { u8 lag_tx_enabled:1; - u8 devlink_port_setup:1; - /* Master state bits, valid only on CPU ports */ u8 master_admin_up:1; u8 master_oper_up:1; -- cgit v1.2.3 From 402963e34a707e4a8f1854ed86437bc375d65766 Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Tue, 27 Sep 2022 20:48:54 +0800 Subject: net: sched: cls_api: introduce tc_cls_bind_class() helper All the bind_class callback duplicate the same logic, this patch introduces tc_cls_bind_class() helper for common usage. Signed-off-by: Zhengchao Shao Signed-off-by: David S. Miller --- include/net/pkt_cls.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index d376c995d906..4cabb32a2ad9 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -210,6 +210,18 @@ tcf_unbind_filter(struct tcf_proto *tp, struct tcf_result *r) __tcf_unbind_filter(q, r); } +static inline void tc_cls_bind_class(u32 classid, unsigned long cl, + void *q, struct tcf_result *res, + unsigned long base) +{ + if (res->classid == classid) { + if (cl) + __tcf_bind_filter(q, res, base); + else + __tcf_unbind_filter(q, res); + } +} + struct tcf_exts { #ifdef CONFIG_NET_CLS_ACT __u32 type; /* for backward compat(TCA_OLD_COMPAT) */ -- cgit v1.2.3 From 537dd2d9fb9f4aa7939fb4fcf552ebe4f497bd7e Mon Sep 17 00:00:00 2001 From: Liu Jian Date: Thu, 29 Sep 2022 21:52:02 +0800 Subject: net: Add helper function to parse netlink msg of ip_tunnel_encap Add ip_tunnel_netlink_encap_parms to parse netlink msg of ip_tunnel_encap. Reduces duplicate code, no actual functional changes. Signed-off-by: Liu Jian Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index ced80e2f8b58..51da2957cf48 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -302,6 +302,9 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm *p, __u32 fwmark); void ip_tunnel_setup(struct net_device *dev, unsigned int net_id); +bool ip_tunnel_netlink_encap_parms(struct nlattr *data[], + struct ip_tunnel_encap *encap); + extern const struct header_ops ip_tunnel_header_ops; __be16 ip_tunnel_parse_protocol(const struct sk_buff *skb); -- cgit v1.2.3 From b86fca800a6a3d439c454b462f7f067a18234e60 Mon Sep 17 00:00:00 2001 From: Liu Jian Date: Thu, 29 Sep 2022 21:52:03 +0800 Subject: net: Add helper function to parse netlink msg of ip_tunnel_parm Add ip_tunnel_netlink_parms to parse netlink msg of ip_tunnel_parm. Reduces duplicate code, no actual functional changes. Signed-off-by: Liu Jian Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 51da2957cf48..fca357679816 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -305,6 +305,9 @@ void ip_tunnel_setup(struct net_device *dev, unsigned int net_id); bool ip_tunnel_netlink_encap_parms(struct nlattr *data[], struct ip_tunnel_encap *encap); +void ip_tunnel_netlink_parms(struct nlattr *data[], + struct ip_tunnel_parm *parms); + extern const struct header_ops ip_tunnel_header_ops; __be16 ip_tunnel_parse_protocol(const struct sk_buff *skb); -- cgit v1.2.3 From fd580c9830316edad6f8b1d9f542563730658efe Mon Sep 17 00:00:00 2001 From: Russell King Date: Fri, 30 Sep 2022 16:21:00 +0200 Subject: net: sfp: augment SFP parsing with phy_interface_t bitmap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We currently parse the SFP EEPROM to a bitmap of ethtool link modes, and then attempt to convert the link modes to a PHY interface mode. While this works at present, there are cases where this is sub-optimal. For example, where a module can operate with several different PHY interface modes. To start addressing this, arrange for the SFP EEPROM parsing to also provide a bitmap of the possible PHY interface modes. Signed-off-by: Russell King Signed-off-by: Marek Behún Signed-off-by: David S. Miller --- include/linux/sfp.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/sfp.h b/include/linux/sfp.h index 302094b855fb..d1f343853b6c 100644 --- a/include/linux/sfp.h +++ b/include/linux/sfp.h @@ -535,7 +535,7 @@ int sfp_parse_port(struct sfp_bus *bus, const struct sfp_eeprom_id *id, unsigned long *support); bool sfp_may_have_phy(struct sfp_bus *bus, const struct sfp_eeprom_id *id); void sfp_parse_support(struct sfp_bus *bus, const struct sfp_eeprom_id *id, - unsigned long *support); + unsigned long *support, unsigned long *interfaces); phy_interface_t sfp_select_interface(struct sfp_bus *bus, unsigned long *link_modes); @@ -568,7 +568,8 @@ static inline bool sfp_may_have_phy(struct sfp_bus *bus, static inline void sfp_parse_support(struct sfp_bus *bus, const struct sfp_eeprom_id *id, - unsigned long *support) + unsigned long *support, + unsigned long *interfaces) { } -- cgit v1.2.3 From eca68a3c7d05b38b4e728cead0c49718f2bc1d5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Fri, 30 Sep 2022 16:21:03 +0200 Subject: net: phylink: pass supported host PHY interface modes to phylib for SFP's PHYs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pass the supported PHY interface types to phylib if the PHY we are connecting is inside a SFP, so that the PHY driver can select an appropriate host configuration mode for their interface according to the host capabilities. For example the Marvell 88X3310 PHY inside RollBall SFP modules defaults to 10gbase-r mode on host's side, and the marvell10g driver currently does not change this setting. But a host may not support 10gbase-r. For example Turris Omnia only supports sgmii, 1000base-x and 2500base-x modes. The PHY can be configured to use those modes, but in order for the PHY driver to do that, it needs to know which modes are supported. Signed-off-by: Marek Behún Signed-off-by: David S. Miller --- include/linux/phy.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 9c66f357f489..d65fc76fe0ae 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -571,6 +571,7 @@ struct macsec_ops; * @advertising: Currently advertised linkmodes * @adv_old: Saved advertised while power saving for WoL * @lp_advertising: Current link partner advertised linkmodes + * @host_interfaces: PHY interface modes supported by host * @eee_broken_modes: Energy efficient ethernet modes which should be prohibited * @autoneg: Flag autoneg being used * @rate_matching: Current rate matching mode @@ -670,6 +671,9 @@ struct phy_device { /* used with phy_speed_down */ __ETHTOOL_DECLARE_LINK_MODE_MASK(adv_old); + /* Host supported PHY interface types. Should be ignored if empty. */ + DECLARE_PHY_INTERFACE_MASK(host_interfaces); + /* Energy efficient ethernet modes which should be prohibited */ u32 eee_broken_modes; -- cgit v1.2.3 From e85b1347ace677c3822c12d9332dfaaffe594da6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Fri, 30 Sep 2022 16:21:08 +0200 Subject: net: sfp: create/destroy I2C mdiobus before PHY probe/after PHY release MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of configuring the I2C mdiobus when SFP driver is probed, create/destroy the mdiobus before the PHY is probed for/after it is released. This way we can tell the mdio-i2c code which protocol to use for each SFP transceiver. Move the code that determines MDIO I2C protocol from sfp_sm_probe_for_phy() to sfp_sm_mod_probe(), where most of the SFP ID parsing is done. Don't allocate I2C bus if no PHY is expected. Signed-off-by: Marek Behún Signed-off-by: David S. Miller --- include/linux/mdio/mdio-i2c.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/mdio/mdio-i2c.h b/include/linux/mdio/mdio-i2c.h index b1d27f7cd23f..3bde1a555a49 100644 --- a/include/linux/mdio/mdio-i2c.h +++ b/include/linux/mdio/mdio-i2c.h @@ -11,6 +11,12 @@ struct device; struct i2c_adapter; struct mii_bus; +enum mdio_i2c_proto { + MDIO_I2C_NONE, + MDIO_I2C_MARVELL_C22, + MDIO_I2C_C45, +}; + struct mii_bus *mdio_i2c_alloc(struct device *parent, struct i2c_adapter *i2c); #endif -- cgit v1.2.3 From 09bbedac72d5a9267088c15d1a71c8c3a8fb47e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Fri, 30 Sep 2022 16:21:09 +0200 Subject: net: phy: mdio-i2c: support I2C MDIO protocol for RollBall SFP modules MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some multigig SFPs from RollBall and Hilink do not expose functional MDIO access to the internal PHY of the SFP via I2C address 0x56 (although there seems to be read-only clause 22 access on this address). Instead these SFPs PHY can be accessed via I2C via the SFP Enhanced Digital Diagnostic Interface - I2C address 0x51. The SFP_PAGE has to be selected to 3 and the password must be filled with 0xff bytes for this PHY communication to work. This extends the mdio-i2c driver to support this protocol by adding a special parameter to mdio_i2c_alloc function via which this RollBall protocol can be selected. Signed-off-by: Marek Behún Cc: Andrew Lunn Cc: Russell King Signed-off-by: David S. Miller --- include/linux/mdio/mdio-i2c.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mdio/mdio-i2c.h b/include/linux/mdio/mdio-i2c.h index 3bde1a555a49..65b550a6fc32 100644 --- a/include/linux/mdio/mdio-i2c.h +++ b/include/linux/mdio/mdio-i2c.h @@ -15,8 +15,10 @@ enum mdio_i2c_proto { MDIO_I2C_NONE, MDIO_I2C_MARVELL_C22, MDIO_I2C_C45, + MDIO_I2C_ROLLBALL, }; -struct mii_bus *mdio_i2c_alloc(struct device *parent, struct i2c_adapter *i2c); +struct mii_bus *mdio_i2c_alloc(struct device *parent, struct i2c_adapter *i2c, + enum mdio_i2c_proto protocol); #endif -- cgit v1.2.3 From 9bc61c04ff6cce6a3756b86e6b34914f7b39d734 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 30 Sep 2022 16:37:30 +0200 Subject: net: Remove DECnet leftovers from flow.h. DECnet was removed by commit 1202cdd66531 ("Remove DECnet support from kernel"). Let's also revome its flow structure. Compile-tested only (allmodconfig). Signed-off-by: Guillaume Nault Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/net/flow.h | 26 -------------------------- 1 file changed, 26 deletions(-) (limited to 'include') diff --git a/include/net/flow.h b/include/net/flow.h index 987bd511d652..2f0da4f0318b 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -54,11 +54,6 @@ union flowi_uli { __u8 code; } icmpt; - struct { - __le16 dport; - __le16 sport; - } dnports; - __be32 gre_key; struct { @@ -156,27 +151,11 @@ struct flowi6 { __u32 mp_hash; } __attribute__((__aligned__(BITS_PER_LONG/8))); -struct flowidn { - struct flowi_common __fl_common; -#define flowidn_oif __fl_common.flowic_oif -#define flowidn_iif __fl_common.flowic_iif -#define flowidn_mark __fl_common.flowic_mark -#define flowidn_scope __fl_common.flowic_scope -#define flowidn_proto __fl_common.flowic_proto -#define flowidn_flags __fl_common.flowic_flags - __le16 daddr; - __le16 saddr; - union flowi_uli uli; -#define fld_sport uli.ports.sport -#define fld_dport uli.ports.dport -} __attribute__((__aligned__(BITS_PER_LONG/8))); - struct flowi { union { struct flowi_common __fl_common; struct flowi4 ip4; struct flowi6 ip6; - struct flowidn dn; } u; #define flowi_oif u.__fl_common.flowic_oif #define flowi_iif u.__fl_common.flowic_iif @@ -211,11 +190,6 @@ static inline struct flowi_common *flowi6_to_flowi_common(struct flowi6 *fl6) return &(fl6->__fl_common); } -static inline struct flowi *flowidn_to_flowi(struct flowidn *fldn) -{ - return container_of(fldn, struct flowi, u.dn); -} - __u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys); #endif -- cgit v1.2.3 From 62c07983bef9d3e78e71189441e1a470f0d1e653 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 1 Oct 2022 13:51:02 -0700 Subject: once: add DO_ONCE_SLOW() for sleepable contexts Christophe Leroy reported a ~80ms latency spike happening at first TCP connect() time. This is because __inet_hash_connect() uses get_random_once() to populate a perturbation table which became quite big after commit 4c2c8f03a5ab ("tcp: increase source port perturb table to 2^16") get_random_once() uses DO_ONCE(), which block hard irqs for the duration of the operation. This patch adds DO_ONCE_SLOW() which uses a mutex instead of a spinlock for operations where we prefer to stay in process context. Then __inet_hash_connect() can use get_random_slow_once() to populate its perturbation table. Fixes: 4c2c8f03a5ab ("tcp: increase source port perturb table to 2^16") Fixes: 190cc82489f4 ("tcp: change source port randomizarion at connect() time") Reported-by: Christophe Leroy Link: https://lore.kernel.org/netdev/CANn89iLAEYBaoYajy0Y9UmGFff5GPxDUoG-ErVB2jDdRNQ5Tug@mail.gmail.com/T/#t Signed-off-by: Eric Dumazet Cc: Willy Tarreau Tested-by: Christophe Leroy Signed-off-by: David S. Miller --- include/linux/once.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include') diff --git a/include/linux/once.h b/include/linux/once.h index b14d8b309d52..176ab75b42df 100644 --- a/include/linux/once.h +++ b/include/linux/once.h @@ -5,10 +5,18 @@ #include #include +/* Helpers used from arbitrary contexts. + * Hard irqs are blocked, be cautious. + */ bool __do_once_start(bool *done, unsigned long *flags); void __do_once_done(bool *done, struct static_key_true *once_key, unsigned long *flags, struct module *mod); +/* Variant for process contexts only. */ +bool __do_once_slow_start(bool *done); +void __do_once_slow_done(bool *done, struct static_key_true *once_key, + struct module *mod); + /* Call a function exactly once. The idea of DO_ONCE() is to perform * a function call such as initialization of random seeds, etc, only * once, where DO_ONCE() can live in the fast-path. After @func has @@ -52,7 +60,27 @@ void __do_once_done(bool *done, struct static_key_true *once_key, ___ret; \ }) +/* Variant of DO_ONCE() for process/sleepable contexts. */ +#define DO_ONCE_SLOW(func, ...) \ + ({ \ + bool ___ret = false; \ + static bool __section(".data.once") ___done = false; \ + static DEFINE_STATIC_KEY_TRUE(___once_key); \ + if (static_branch_unlikely(&___once_key)) { \ + ___ret = __do_once_slow_start(&___done); \ + if (unlikely(___ret)) { \ + func(__VA_ARGS__); \ + __do_once_slow_done(&___done, &___once_key, \ + THIS_MODULE); \ + } \ + } \ + ___ret; \ + }) + #define get_random_once(buf, nbytes) \ DO_ONCE(get_random_bytes, (buf), (nbytes)) +#define get_random_slow_once(buf, nbytes) \ + DO_ONCE_SLOW(get_random_bytes, (buf), (nbytes)) + #endif /* _LINUX_ONCE_H */ -- cgit v1.2.3 From 820dc0523e05c12810bb6bf4e56ce26e4c1948a2 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Fri, 30 Sep 2022 00:38:43 +0200 Subject: net: netfilter: move bpf_ct_set_nat_info kfunc in nf_nat_bpf.c Remove circular dependency between nf_nat module and nf_conntrack one moving bpf_ct_set_nat_info kfunc in nf_nat_bpf.c Fixes: 0fabd2aa199f ("net: netfilter: add bpf_ct_set_nat_info kfunc helper") Suggested-by: Kumar Kartikeya Dwivedi Tested-by: Nathan Chancellor Tested-by: Yauheni Kaliuta Signed-off-by: Lorenzo Bianconi Acked-by: John Fastabend Link: https://lore.kernel.org/r/51a65513d2cda3eeb0754842e8025ab3966068d8.1664490511.git.lorenzo@kernel.org Signed-off-by: Alexei Starovoitov --- include/net/netfilter/nf_conntrack_bpf.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_bpf.h b/include/net/netfilter/nf_conntrack_bpf.h index c8b80add1142..2d0da478c8e0 100644 --- a/include/net/netfilter/nf_conntrack_bpf.h +++ b/include/net/netfilter/nf_conntrack_bpf.h @@ -4,6 +4,11 @@ #define _NF_CONNTRACK_BPF_H #include +#include + +struct nf_conn___init { + struct nf_conn ct; +}; #if (IS_BUILTIN(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \ (IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) @@ -24,4 +29,18 @@ static inline void cleanup_nf_conntrack_bpf(void) #endif +#if (IS_BUILTIN(CONFIG_NF_NAT) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \ + (IS_MODULE(CONFIG_NF_NAT) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) + +extern int register_nf_nat_bpf(void); + +#else + +static inline int register_nf_nat_bpf(void) +{ + return 0; +} + +#endif + #endif /* _NF_CONNTRACK_BPF_H */ -- cgit v1.2.3 From 168723c1f8d6e1e823d4c6ad3cf64478cf58330a Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Sat, 1 Oct 2022 21:56:22 -0700 Subject: net/mlx5e: xsk: Use umr_mode to calculate striding RQ parameters Instead of passing the unaligned flag, pass an enum that indicates the UMR mode. The next commit will add the third mode (KLM for certain configurations of XSK), which will be added to this enum instead of adding another bool flag everywhere. Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- include/linux/mlx5/driver.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index f8ecb33105d3..285f301a6390 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1288,4 +1288,8 @@ static inline bool mlx5_get_roce_state(struct mlx5_core_dev *dev) return mlx5_is_roce_on(dev); } +enum { + MLX5_OCTWORD = 16, +}; + #endif /* MLX5_DRIVER_H */ -- cgit v1.2.3 From 16ab85e78439bab1201ff26ba430231d1574b4ae Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Sat, 1 Oct 2022 21:56:27 -0700 Subject: net/mlx5e: Expose rx_oversize_pkts_buffer counter Add the rx_oversize_pkts_buffer counter to ethtool statistics. This counter exposes the number of dropped received packets due to length which arrived to RQ and exceed software buffer size allocated by the device for incoming traffic. It might imply that the device MTU is larger than the software buffers size. Signed-off-by: Gal Pressman Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- include/linux/mlx5/mlx5_ifc.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 1ad762e22d86..06574d430ff5 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1491,7 +1491,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_120[0xa]; u8 log_max_ra_req_dc[0x6]; - u8 reserved_at_130[0x9]; + u8 reserved_at_130[0x2]; + u8 eth_wqe_too_small[0x1]; + u8 reserved_at_133[0x6]; u8 vnic_env_cq_overrun[0x1]; u8 log_max_ra_res_dc[0x6]; @@ -3537,7 +3539,9 @@ struct mlx5_ifc_vnic_diagnostic_statistics_bits { u8 cq_overrun[0x20]; - u8 reserved_at_220[0xde0]; + u8 eth_wqe_too_small[0x20]; + + u8 reserved_at_220[0xdc0]; }; struct mlx5_ifc_traffic_counter_bits { -- cgit v1.2.3 From 9b98d395b85dd042fe83fb696b1ac02e6c93a520 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Sat, 1 Oct 2022 21:56:28 -0700 Subject: net/mlx5: Start health poll at earlier stage of driver load Start health poll at earlier stage, so if fw fatal issue occurred before or during initialization commands such as init_hca or set_hca_cap the poll health can detect and indicate that the driver is already in error state. Signed-off-by: Moshe Shemesh Signed-off-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- include/linux/mlx5/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 285f301a6390..a12929bc31b2 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1017,6 +1017,7 @@ void mlx5_health_cleanup(struct mlx5_core_dev *dev); int mlx5_health_init(struct mlx5_core_dev *dev); void mlx5_start_health_poll(struct mlx5_core_dev *dev); void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health); +void mlx5_start_health_fw_log_up(struct mlx5_core_dev *dev); void mlx5_drain_health_wq(struct mlx5_core_dev *dev); void mlx5_trigger_health_work(struct mlx5_core_dev *dev); int mlx5_frag_buf_alloc_node(struct mlx5_core_dev *dev, int size, -- cgit v1.2.3 From 3114b075eb2531dea31a961944309485d6a53040 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 3 Oct 2022 08:51:57 +0200 Subject: net: add framework to support Ethernet PSE and PDs devices This framework was create with intention to provide support for Ethernet PSE (Power Sourcing Equipment) and PDs (Powered Device). At current step this patch implements generic PSE support for PoDL (Power over Data Lines 802.3bu) specification with reserving name space for PD devices as well. This framework can be extended to support 802.3af and 802.3at "Power via the Media Dependent Interface" (or PoE/Power over Ethernet) Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski --- include/linux/pse-pd/pse.h | 67 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 include/linux/pse-pd/pse.h (limited to 'include') diff --git a/include/linux/pse-pd/pse.h b/include/linux/pse-pd/pse.h new file mode 100644 index 000000000000..3ba787a48b15 --- /dev/null +++ b/include/linux/pse-pd/pse.h @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* +// Copyright (c) 2022 Pengutronix, Oleksij Rempel + */ +#ifndef _LINUX_PSE_CONTROLLER_H +#define _LINUX_PSE_CONTROLLER_H + +#include +#include +#include + +struct module; +struct device_node; +struct of_phandle_args; +struct pse_control; + +/** + * struct pse_controller_dev - PSE controller entity that might + * provide multiple PSE controls + * @ops: a pointer to device specific struct pse_controller_ops + * @owner: kernel module of the PSE controller driver + * @list: internal list of PSE controller devices + * @pse_control_head: head of internal list of requested PSE controls + * @dev: corresponding driver model device struct + * @of_pse_n_cells: number of cells in PSE line specifiers + * @of_xlate: translation function to translate from specifier as found in the + * device tree to id as given to the PSE control ops + * @nr_lines: number of PSE controls in this controller device + * @lock: Mutex for serialization access to the PSE controller + */ +struct pse_controller_dev { + const struct pse_controller_ops *ops; + struct module *owner; + struct list_head list; + struct list_head pse_control_head; + struct device *dev; + int of_pse_n_cells; + int (*of_xlate)(struct pse_controller_dev *pcdev, + const struct of_phandle_args *pse_spec); + unsigned int nr_lines; + struct mutex lock; +}; + +#if IS_ENABLED(CONFIG_PSE_CONTROLLER) +int pse_controller_register(struct pse_controller_dev *pcdev); +void pse_controller_unregister(struct pse_controller_dev *pcdev); +struct device; +int devm_pse_controller_register(struct device *dev, + struct pse_controller_dev *pcdev); + +struct pse_control *of_pse_control_get(struct device_node *node); +void pse_control_put(struct pse_control *psec); + +#else + +static inline struct pse_control *of_pse_control_get(struct device_node *node) +{ + return ERR_PTR(-ENOENT); +} + +static inline void pse_control_put(struct pse_control *psec) +{ +} + +#endif + +#endif -- cgit v1.2.3 From 5e82147de1cbd758bb280908daa39d95ed467538 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 3 Oct 2022 08:51:59 +0200 Subject: net: mdiobus: search for PSE nodes by parsing PHY nodes. Some PHYs can be linked with PSE (Power Sourcing Equipment), so search for related nodes and attach it to the phydev. Signed-off-by: Oleksij Rempel Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index d65fc76fe0ae..ddf66198f751 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -597,6 +597,7 @@ struct macsec_ops; * @master_slave_get: Current master/slave advertisement * @master_slave_state: Current master/slave configuration * @mii_ts: Pointer to time stamper callbacks + * @psec: Pointer to Power Sourcing Equipment control struct * @lock: Mutex for serialization access to PHY * @state_queue: Work queue for state machine * @shared: Pointer to private data shared by phys in one package @@ -715,6 +716,7 @@ struct phy_device { struct phylink *phylink; struct net_device *attached_dev; struct mii_timestamper *mii_ts; + struct pse_control *psec; u8 mdix; u8 mdix_ctrl; -- cgit v1.2.3 From 18ff0bcda6d1dd3d53b4ce3f03e61bf1a648f960 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 3 Oct 2022 08:52:00 +0200 Subject: ethtool: add interface to interact with Ethernet Power Equipment Add interface to support Power Sourcing Equipment. At current step it provides generic way to address all variants of PSE devices as defined in IEEE 802.3-2018 but support only objects specified for IEEE 802.3-2018 104.4 PoDL Power Sourcing Equipment (PSE). Currently supported and mandatory objects are: IEEE 802.3-2018 30.15.1.1.3 aPoDLPSEPowerDetectionStatus IEEE 802.3-2018 30.15.1.1.2 aPoDLPSEAdminState IEEE 802.3-2018 30.15.1.2.1 acPoDLPSEAdminControl This is minimal interface needed to control PSE on each separate ethernet port but it provides not all mandatory objects specified in IEEE 802.3-2018. Since "PoDL PSE" and "PSE" have similar names, but some different values I decide to not merge them and keep separate naming schema. This should allow as to be as close to IEEE 802.3 spec as possible and avoid name conflicts in the future. This implementation is connected to PHYs instead of MACs because PSE auto classification can potentially interfere with PHY auto negotiation. So, may be some extra PHY related initialization will be needed. With WIP version of ethtools interaction with PSE capable link looks as following: $ ip l ... 5: t1l1@eth0: .. ... $ ethtool --show-pse t1l1 PSE attributs for t1l1: PoDL PSE Admin State: disabled PoDL PSE Power Detection Status: disabled $ ethtool --set-pse t1l1 podl-pse-admin-control enable $ ethtool --show-pse t1l1 PSE attributs for t1l1: PoDL PSE Admin State: enabled PoDL PSE Power Detection Status: delivering power Signed-off-by: kernel test robot Signed-off-by: Oleksij Rempel Reviewed-by: Bagas Sanjaya Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski --- include/linux/pse-pd/pse.h | 62 ++++++++++++++++++++++++++++++++++++ include/uapi/linux/ethtool.h | 45 ++++++++++++++++++++++++++ include/uapi/linux/ethtool_netlink.h | 16 ++++++++++ 3 files changed, 123 insertions(+) (limited to 'include') diff --git a/include/linux/pse-pd/pse.h b/include/linux/pse-pd/pse.h index 3ba787a48b15..fd1a916eeeba 100644 --- a/include/linux/pse-pd/pse.h +++ b/include/linux/pse-pd/pse.h @@ -9,6 +9,47 @@ #include #include +struct phy_device; +struct pse_controller_dev; + +/** + * struct pse_control_config - PSE control/channel configuration. + * + * @admin_cotrol: set PoDL PSE admin control as described in + * IEEE 802.3-2018 30.15.1.2.1 acPoDLPSEAdminControl + */ +struct pse_control_config { + enum ethtool_podl_pse_admin_state admin_cotrol; +}; + +/** + * struct pse_control_status - PSE control/channel status. + * + * @podl_admin_state: operational state of the PoDL PSE + * functions. IEEE 802.3-2018 30.15.1.1.2 aPoDLPSEAdminState + * @podl_pw_status: power detection status of the PoDL PSE. + * IEEE 802.3-2018 30.15.1.1.3 aPoDLPSEPowerDetectionStatus: + */ +struct pse_control_status { + enum ethtool_podl_pse_admin_state podl_admin_state; + enum ethtool_podl_pse_pw_d_status podl_pw_status; +}; + +/** + * struct pse_controller_ops - PSE controller driver callbacks + * + * @ethtool_get_status: get PSE control status for ethtool interface + * @ethtool_set_config: set PSE control configuration over ethtool interface + */ +struct pse_controller_ops { + int (*ethtool_get_status)(struct pse_controller_dev *pcdev, + unsigned long id, struct netlink_ext_ack *extack, + struct pse_control_status *status); + int (*ethtool_set_config)(struct pse_controller_dev *pcdev, + unsigned long id, struct netlink_ext_ack *extack, + const struct pse_control_config *config); +}; + struct module; struct device_node; struct of_phandle_args; @@ -51,6 +92,13 @@ int devm_pse_controller_register(struct device *dev, struct pse_control *of_pse_control_get(struct device_node *node); void pse_control_put(struct pse_control *psec); +int pse_ethtool_get_status(struct pse_control *psec, + struct netlink_ext_ack *extack, + struct pse_control_status *status); +int pse_ethtool_set_config(struct pse_control *psec, + struct netlink_ext_ack *extack, + const struct pse_control_config *config); + #else static inline struct pse_control *of_pse_control_get(struct device_node *node) @@ -62,6 +110,20 @@ static inline void pse_control_put(struct pse_control *psec) { } +int pse_ethtool_get_status(struct pse_control *psec, + struct netlink_ext_ack *extack, + struct pse_control_status *status) +{ + return -ENOTSUPP; +} + +int pse_ethtool_set_config(struct pse_control *psec, + struct netlink_ext_ack *extack, + const struct pse_control_config *config) +{ + return -ENOTSUPP; +} + #endif #endif diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index fe9893d1485d..dc2aa3d75b39 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -736,6 +736,51 @@ enum ethtool_module_power_mode { ETHTOOL_MODULE_POWER_MODE_HIGH, }; +/** + * enum ethtool_podl_pse_admin_state - operational state of the PoDL PSE + * functions. IEEE 802.3-2018 30.15.1.1.2 aPoDLPSEAdminState + * @ETHTOOL_PODL_PSE_ADMIN_STATE_UNKNOWN: state of PoDL PSE functions are + * unknown + * @ETHTOOL_PODL_PSE_ADMIN_STATE_DISABLED: PoDL PSE functions are disabled + * @ETHTOOL_PODL_PSE_ADMIN_STATE_ENABLED: PoDL PSE functions are enabled + */ +enum ethtool_podl_pse_admin_state { + ETHTOOL_PODL_PSE_ADMIN_STATE_UNKNOWN = 1, + ETHTOOL_PODL_PSE_ADMIN_STATE_DISABLED, + ETHTOOL_PODL_PSE_ADMIN_STATE_ENABLED, +}; + +/** + * enum ethtool_podl_pse_pw_d_status - power detection status of the PoDL PSE. + * IEEE 802.3-2018 30.15.1.1.3 aPoDLPSEPowerDetectionStatus: + * @ETHTOOL_PODL_PSE_PW_D_STATUS_UNKNOWN: PoDL PSE + * @ETHTOOL_PODL_PSE_PW_D_STATUS_DISABLED: "The enumeration “disabled” is + * asserted true when the PoDL PSE state diagram variable mr_pse_enable is + * false" + * @ETHTOOL_PODL_PSE_PW_D_STATUS_SEARCHING: "The enumeration “searching” is + * asserted true when either of the PSE state diagram variables + * pi_detecting or pi_classifying is true." + * @ETHTOOL_PODL_PSE_PW_D_STATUS_DELIVERING: "The enumeration “deliveringPower” + * is asserted true when the PoDL PSE state diagram variable pi_powered is + * true." + * @ETHTOOL_PODL_PSE_PW_D_STATUS_SLEEP: "The enumeration “sleep” is asserted + * true when the PoDL PSE state diagram variable pi_sleeping is true." + * @ETHTOOL_PODL_PSE_PW_D_STATUS_IDLE: "The enumeration “idle” is asserted true + * when the logical combination of the PoDL PSE state diagram variables + * pi_prebiased*!pi_sleeping is true." + * @ETHTOOL_PODL_PSE_PW_D_STATUS_ERROR: "The enumeration “error” is asserted + * true when the PoDL PSE state diagram variable overload_held is true." + */ +enum ethtool_podl_pse_pw_d_status { + ETHTOOL_PODL_PSE_PW_D_STATUS_UNKNOWN = 1, + ETHTOOL_PODL_PSE_PW_D_STATUS_DISABLED, + ETHTOOL_PODL_PSE_PW_D_STATUS_SEARCHING, + ETHTOOL_PODL_PSE_PW_D_STATUS_DELIVERING, + ETHTOOL_PODL_PSE_PW_D_STATUS_SLEEP, + ETHTOOL_PODL_PSE_PW_D_STATUS_IDLE, + ETHTOOL_PODL_PSE_PW_D_STATUS_ERROR, +}; + /** * struct ethtool_gstrings - string set for data tagging * @cmd: Command number = %ETHTOOL_GSTRINGS diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 408a664fad59..bb57084ac524 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -49,6 +49,8 @@ enum { ETHTOOL_MSG_PHC_VCLOCKS_GET, ETHTOOL_MSG_MODULE_GET, ETHTOOL_MSG_MODULE_SET, + ETHTOOL_MSG_PSE_GET, + ETHTOOL_MSG_PSE_SET, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, @@ -94,6 +96,7 @@ enum { ETHTOOL_MSG_PHC_VCLOCKS_GET_REPLY, ETHTOOL_MSG_MODULE_GET_REPLY, ETHTOOL_MSG_MODULE_NTF, + ETHTOOL_MSG_PSE_GET_REPLY, /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, @@ -863,6 +866,19 @@ enum { ETHTOOL_A_MODULE_MAX = (__ETHTOOL_A_MODULE_CNT - 1) }; +/* Power Sourcing Equipment */ +enum { + ETHTOOL_A_PSE_UNSPEC, + ETHTOOL_A_PSE_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_PODL_PSE_ADMIN_STATE, /* u32 */ + ETHTOOL_A_PODL_PSE_ADMIN_CONTROL, /* u32 */ + ETHTOOL_A_PODL_PSE_PW_D_STATUS, /* u32 */ + + /* add new constants above here */ + __ETHTOOL_A_PSE_CNT, + ETHTOOL_A_PSE_MAX = (__ETHTOOL_A_PSE_CNT - 1) +}; + /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 -- cgit v1.2.3 From 2a4187f4406ec3236f8b9d0d5150d2bf8d021b68 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 3 Oct 2022 20:14:13 +0200 Subject: once: rename _SLOW to _SLEEPABLE The _SLOW designation wasn't really descriptive of anything. This is meant to be called from process context when it's possible to sleep. So name this more aptly _SLEEPABLE, which better fits its intended use. Fixes: 62c07983bef9 ("once: add DO_ONCE_SLOW() for sleepable contexts") Cc: Christophe Leroy Signed-off-by: Jason A. Donenfeld Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20221003181413.1221968-1-Jason@zx2c4.com Signed-off-by: Jakub Kicinski --- include/linux/once.h | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/once.h b/include/linux/once.h index 176ab75b42df..bc714d414448 100644 --- a/include/linux/once.h +++ b/include/linux/once.h @@ -13,9 +13,9 @@ void __do_once_done(bool *done, struct static_key_true *once_key, unsigned long *flags, struct module *mod); /* Variant for process contexts only. */ -bool __do_once_slow_start(bool *done); -void __do_once_slow_done(bool *done, struct static_key_true *once_key, - struct module *mod); +bool __do_once_sleepable_start(bool *done); +void __do_once_sleepable_done(bool *done, struct static_key_true *once_key, + struct module *mod); /* Call a function exactly once. The idea of DO_ONCE() is to perform * a function call such as initialization of random seeds, etc, only @@ -61,26 +61,26 @@ void __do_once_slow_done(bool *done, struct static_key_true *once_key, }) /* Variant of DO_ONCE() for process/sleepable contexts. */ -#define DO_ONCE_SLOW(func, ...) \ - ({ \ - bool ___ret = false; \ - static bool __section(".data.once") ___done = false; \ - static DEFINE_STATIC_KEY_TRUE(___once_key); \ - if (static_branch_unlikely(&___once_key)) { \ - ___ret = __do_once_slow_start(&___done); \ - if (unlikely(___ret)) { \ - func(__VA_ARGS__); \ - __do_once_slow_done(&___done, &___once_key, \ - THIS_MODULE); \ - } \ - } \ - ___ret; \ +#define DO_ONCE_SLEEPABLE(func, ...) \ + ({ \ + bool ___ret = false; \ + static bool __section(".data.once") ___done = false; \ + static DEFINE_STATIC_KEY_TRUE(___once_key); \ + if (static_branch_unlikely(&___once_key)) { \ + ___ret = __do_once_sleepable_start(&___done); \ + if (unlikely(___ret)) { \ + func(__VA_ARGS__); \ + __do_once_sleepable_done(&___done, &___once_key,\ + THIS_MODULE); \ + } \ + } \ + ___ret; \ }) #define get_random_once(buf, nbytes) \ DO_ONCE(get_random_bytes, (buf), (nbytes)) -#define get_random_slow_once(buf, nbytes) \ - DO_ONCE_SLOW(get_random_bytes, (buf), (nbytes)) +#define get_random_sleepable_once(buf, nbytes) \ + DO_ONCE_SLEEPABLE(get_random_bytes, (buf), (nbytes)) #endif /* _LINUX_ONCE_H */ -- cgit v1.2.3 From 681bf011b9b5989c6e9db6beb64494918aab9a43 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 3 Oct 2022 21:03:27 -0700 Subject: eth: pse: add missing static inlines build bot reports missing 'static inline' qualifiers in the header. Reported-by: kernel test robot Fixes: 18ff0bcda6d1 ("ethtool: add interface to interact with Ethernet Power Equipment") Reviewed-by: Oleksij Rempel Link: https://lore.kernel.org/r/20221004040327.2034878-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/pse-pd/pse.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/pse-pd/pse.h b/include/linux/pse-pd/pse.h index fd1a916eeeba..fb724c65c77b 100644 --- a/include/linux/pse-pd/pse.h +++ b/include/linux/pse-pd/pse.h @@ -110,16 +110,16 @@ static inline void pse_control_put(struct pse_control *psec) { } -int pse_ethtool_get_status(struct pse_control *psec, - struct netlink_ext_ack *extack, - struct pse_control_status *status) +static inline int pse_ethtool_get_status(struct pse_control *psec, + struct netlink_ext_ack *extack, + struct pse_control_status *status) { return -ENOTSUPP; } -int pse_ethtool_set_config(struct pse_control *psec, - struct netlink_ext_ack *extack, - const struct pse_control_config *config) +static inline int pse_ethtool_set_config(struct pse_control *psec, + struct netlink_ext_ack *extack, + const struct pse_control_config *config) { return -ENOTSUPP; } -- cgit v1.2.3