From f64c4acea51fbe2c08c0b0f48b7f5d1657d7a5e4 Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Fri, 10 Sep 2021 01:04:08 +0300 Subject: bpf: Add hardware timestamp field to __sk_buff BPF programs may want to know hardware timestamps if NIC supports such timestamping. Expose this data as hwtstamp field of __sk_buff the same way as gso_segs/gso_size. This field could be accessed from the same programs as tstamp field, but it's read-only field. Explicit test to deny access to padding data is added to bpf_skb_is_valid_access. Also update BPF_PROG_TEST_RUN tests of the feature. Signed-off-by: Vadim Fedorenko Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210909220409.8804-2-vfedorenko@novek.ru --- include/uapi/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 791f31dd0abe..51cfd91cc387 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5284,6 +5284,8 @@ struct __sk_buff { __u32 gso_segs; __bpf_md_ptr(struct bpf_sock *, sk); __u32 gso_size; + __u32 :32; /* Padding, future use. */ + __u64 hwtstamp; }; struct bpf_tunnel_key { -- cgit v1.2.3 From 856c02dbce4f8d6a5644083db22c11750aa11481 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 10 Sep 2021 11:33:51 -0700 Subject: bpf: Introduce helper bpf_get_branch_snapshot Introduce bpf_get_branch_snapshot(), which allows tracing pogram to get branch trace from hardware (e.g. Intel LBR). To use the feature, the user need to create perf_event with proper branch_record filtering on each cpu, and then calls bpf_get_branch_snapshot in the bpf function. On Intel CPUs, VLBR event (raw event 0x1b00) can be use for this. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210910183352.3151445-3-songliubraving@fb.com --- include/uapi/linux/bpf.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 51cfd91cc387..d21326558d42 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4877,6 +4877,27 @@ union bpf_attr { * Get the struct pt_regs associated with **task**. * Return * A pointer to struct pt_regs. + * + * long bpf_get_branch_snapshot(void *entries, u32 size, u64 flags) + * Description + * Get branch trace from hardware engines like Intel LBR. The + * hardware engine is stopped shortly after the helper is + * called. Therefore, the user need to filter branch entries + * based on the actual use case. To capture branch trace + * before the trigger point of the BPF program, the helper + * should be called at the beginning of the BPF program. + * + * The data is stored as struct perf_branch_entry into output + * buffer *entries*. *size* is the size of *entries* in bytes. + * *flags* is reserved for now and must be zero. + * + * Return + * On success, number of bytes written to *buf*. On error, a + * negative value. + * + * **-EINVAL** if *flags* is not zero. + * + * **-ENOENT** if architecture does not support branch records. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5055,6 +5076,7 @@ union bpf_attr { FN(get_func_ip), \ FN(get_attach_cookie), \ FN(task_pt_regs), \ + FN(get_branch_snapshot), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From fa0866625543b4d8b3d026e4e0ef5ec25a453920 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Tue, 14 Sep 2021 10:35:05 +0200 Subject: net/smc: add support for user defined EIDs SMC-Dv2 allows users to define EIDs which allows to create separate name spaces enabling users to cluster their SMC-Dv2 connections. Add support for user defined EIDs and extent the generic netlink interface so users can add, remove and dump EIDs. Signed-off-by: Karsten Graul Reviewed-by: Guvenc Gulce Signed-off-by: Guvenc Gulce Signed-off-by: David S. Miller --- include/uapi/linux/smc.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/smc.h b/include/uapi/linux/smc.h index 0f7f87c70baf..e3728af2832b 100644 --- a/include/uapi/linux/smc.h +++ b/include/uapi/linux/smc.h @@ -38,6 +38,9 @@ enum { /* SMC PNET Table commands */ #define SMC_GENL_FAMILY_VERSION 1 #define SMC_PCI_ID_STR_LEN 16 /* Max length of pci id string */ +#define SMC_MAX_HOSTNAME_LEN 32 /* Max length of the hostname */ +#define SMC_MAX_UEID 4 /* Max number of user EIDs */ +#define SMC_MAX_EID_LEN 32 /* Max length of an EID */ /* SMC_GENL_FAMILY commands */ enum { @@ -49,6 +52,10 @@ enum { SMC_NETLINK_GET_DEV_SMCR, SMC_NETLINK_GET_STATS, SMC_NETLINK_GET_FBACK_STATS, + SMC_NETLINK_DUMP_UEID, + SMC_NETLINK_ADD_UEID, + SMC_NETLINK_REMOVE_UEID, + SMC_NETLINK_FLUSH_UEID, }; /* SMC_GENL_FAMILY top level attributes */ @@ -242,4 +249,12 @@ enum { __SMC_NLA_FBACK_STATS_MAX, SMC_NLA_FBACK_STATS_MAX = __SMC_NLA_FBACK_STATS_MAX - 1 }; + +/* SMC_NETLINK_UEID attributes */ +enum { + SMC_NLA_EID_TABLE_UNSPEC, + SMC_NLA_EID_TABLE_ENTRY, /* string */ + __SMC_NLA_EID_TABLE_MAX, + SMC_NLA_EID_TABLE_MAX = __SMC_NLA_EID_TABLE_MAX - 1 +}; #endif /* _UAPI_LINUX_SMC_H */ -- cgit v1.2.3 From 3c572145c24e21c24e1cd0fd168011eaba85da8e Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Tue, 14 Sep 2021 10:35:07 +0200 Subject: net/smc: add generic netlink support for system EID With SMC-Dv2 users can configure if the static system EID should be used during CLC handshake, or if only user EIDs are allowed. Add generic netlink support to enable and disable the system EID, and to retrieve the system EID and its current enabled state. Signed-off-by: Karsten Graul Reviewed-by: Guvenc Gulce Signed-off-by: Guvenc Gulce Signed-off-by: David S. Miller --- include/uapi/linux/smc.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/smc.h b/include/uapi/linux/smc.h index e3728af2832b..b175bd0165a1 100644 --- a/include/uapi/linux/smc.h +++ b/include/uapi/linux/smc.h @@ -56,6 +56,9 @@ enum { SMC_NETLINK_ADD_UEID, SMC_NETLINK_REMOVE_UEID, SMC_NETLINK_FLUSH_UEID, + SMC_NETLINK_DUMP_SEID, + SMC_NETLINK_ENABLE_SEID, + SMC_NETLINK_DISABLE_SEID, }; /* SMC_GENL_FAMILY top level attributes */ @@ -257,4 +260,13 @@ enum { __SMC_NLA_EID_TABLE_MAX, SMC_NLA_EID_TABLE_MAX = __SMC_NLA_EID_TABLE_MAX - 1 }; + +/* SMC_NETLINK_SEID attributes */ +enum { + SMC_NLA_SEID_UNSPEC, + SMC_NLA_SEID_ENTRY, /* string */ + SMC_NLA_SEID_ENABLED, /* u8 */ + __SMC_NLA_SEID_TABLE_MAX, + SMC_NLA_SEID_TABLE_MAX = __SMC_NLA_SEID_TABLE_MAX - 1 +}; #endif /* _UAPI_LINUX_SMC_H */ -- cgit v1.2.3 From 41ced4cd88020c9d4b71ff7c50d020f081efa4a0 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 14 Sep 2021 15:30:09 -0700 Subject: btf: Change BTF_KIND_* macros to enums Change BTF_KIND_* macros to enums so they are encoded in dwarf and appear in vmlinux.h. This will make it easier for bpf programs to use these constants without macro definitions. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210914223009.245307-1-yhs@fb.com --- include/uapi/linux/btf.h | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index d27b1708efe9..10e401073dd1 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -56,25 +56,28 @@ struct btf_type { #define BTF_INFO_VLEN(info) ((info) & 0xffff) #define BTF_INFO_KFLAG(info) ((info) >> 31) -#define BTF_KIND_UNKN 0 /* Unknown */ -#define BTF_KIND_INT 1 /* Integer */ -#define BTF_KIND_PTR 2 /* Pointer */ -#define BTF_KIND_ARRAY 3 /* Array */ -#define BTF_KIND_STRUCT 4 /* Struct */ -#define BTF_KIND_UNION 5 /* Union */ -#define BTF_KIND_ENUM 6 /* Enumeration */ -#define BTF_KIND_FWD 7 /* Forward */ -#define BTF_KIND_TYPEDEF 8 /* Typedef */ -#define BTF_KIND_VOLATILE 9 /* Volatile */ -#define BTF_KIND_CONST 10 /* Const */ -#define BTF_KIND_RESTRICT 11 /* Restrict */ -#define BTF_KIND_FUNC 12 /* Function */ -#define BTF_KIND_FUNC_PROTO 13 /* Function Proto */ -#define BTF_KIND_VAR 14 /* Variable */ -#define BTF_KIND_DATASEC 15 /* Section */ -#define BTF_KIND_FLOAT 16 /* Floating point */ -#define BTF_KIND_MAX BTF_KIND_FLOAT -#define NR_BTF_KINDS (BTF_KIND_MAX + 1) +enum { + BTF_KIND_UNKN = 0, /* Unknown */ + BTF_KIND_INT = 1, /* Integer */ + BTF_KIND_PTR = 2, /* Pointer */ + BTF_KIND_ARRAY = 3, /* Array */ + BTF_KIND_STRUCT = 4, /* Struct */ + BTF_KIND_UNION = 5, /* Union */ + BTF_KIND_ENUM = 6, /* Enumeration */ + BTF_KIND_FWD = 7, /* Forward */ + BTF_KIND_TYPEDEF = 8, /* Typedef */ + BTF_KIND_VOLATILE = 9, /* Volatile */ + BTF_KIND_CONST = 10, /* Const */ + BTF_KIND_RESTRICT = 11, /* Restrict */ + BTF_KIND_FUNC = 12, /* Function */ + BTF_KIND_FUNC_PROTO = 13, /* Function Proto */ + BTF_KIND_VAR = 14, /* Variable */ + BTF_KIND_DATASEC = 15, /* Section */ + BTF_KIND_FLOAT = 16, /* Floating point */ + + NR_BTF_KINDS, + BTF_KIND_MAX = NR_BTF_KINDS - 1, +}; /* For some specific BTF_KIND, "struct btf_type" is immediately * followed by extra data. -- cgit v1.2.3 From b5ea834dde6b6e7f75e51d5f66dac8cd7c97b5ef Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 14 Sep 2021 15:30:15 -0700 Subject: bpf: Support for new btf kind BTF_KIND_TAG LLVM14 added support for a new C attribute ([1]) __attribute__((btf_tag("arbitrary_str"))) This attribute will be emitted to dwarf ([2]) and pahole will convert it to BTF. Or for bpf target, this attribute will be emitted to BTF directly ([3], [4]). The attribute is intended to provide additional information for - struct/union type or struct/union member - static/global variables - static/global function or function parameter. For linux kernel, the btf_tag can be applied in various places to specify user pointer, function pre- or post- condition, function allow/deny in certain context, etc. Such information will be encoded in vmlinux BTF and can be used by verifier. The btf_tag can also be applied to bpf programs to help global verifiable functions, e.g., specifying preconditions, etc. This patch added basic parsing and checking support in kernel for new BTF_KIND_TAG kind. [1] https://reviews.llvm.org/D106614 [2] https://reviews.llvm.org/D106621 [3] https://reviews.llvm.org/D106622 [4] https://reviews.llvm.org/D109560 Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210914223015.245546-1-yhs@fb.com --- include/uapi/linux/btf.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index 10e401073dd1..642b6ecb37d7 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -43,7 +43,7 @@ struct btf_type { * "size" tells the size of the type it is describing. * * "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT, - * FUNC, FUNC_PROTO and VAR. + * FUNC, FUNC_PROTO, VAR and TAG. * "type" is a type_id referring to another type. */ union { @@ -74,6 +74,7 @@ enum { BTF_KIND_VAR = 14, /* Variable */ BTF_KIND_DATASEC = 15, /* Section */ BTF_KIND_FLOAT = 16, /* Floating point */ + BTF_KIND_TAG = 17, /* Tag */ NR_BTF_KINDS, BTF_KIND_MAX = NR_BTF_KINDS - 1, @@ -173,4 +174,15 @@ struct btf_var_secinfo { __u32 size; }; +/* BTF_KIND_TAG is followed by a single "struct btf_tag" to describe + * additional information related to the tag applied location. + * If component_idx == -1, the tag is applied to a struct, union, + * variable or function. Otherwise, it is applied to a struct/union + * member or a func argument, and component_idx indicates which member + * or argument (0 ... vlen-1). + */ +struct btf_tag { + __s32 component_idx; +}; + #endif /* _UAPI__LINUX_BTF_H__ */ -- cgit v1.2.3 From 336562752acc1a723f9a24b5b8129ae22e0478c6 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Wed, 15 Sep 2021 01:54:00 +0200 Subject: bpf: Update bpf_get_smp_processor_id() documentation BPF programs run with migration disabled regardless of preemption, as they are protected by migrate_disable(). Update the uapi documentation accordingly. Signed-off-by: Matteo Croce Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210914235400.59427-1-mcroce@linux.microsoft.com --- include/uapi/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d21326558d42..3e9785f1064a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1629,7 +1629,7 @@ union bpf_attr { * u32 bpf_get_smp_processor_id(void) * Description * Get the SMP (symmetric multiprocessing) processor id. Note that - * all programs run with preemption disabled, which means that the + * all programs run with migration disabled, which means that the * SMP processor id is stable during all the execution of the * program. * Return -- cgit v1.2.3 From 227b9644ab16d2ecd98d593edbe15c32c0c9620a Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Thu, 16 Sep 2021 11:37:38 +0800 Subject: net/tls: support SM4 GCM/CCM algorithm The RFC8998 specification defines the use of the ShangMi algorithm cipher suites in TLS 1.3, and also supports the GCM/CCM mode using the SM4 algorithm. Signed-off-by: Tianjia Zhang Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/uapi/linux/tls.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/tls.h b/include/uapi/linux/tls.h index 0d54baea1d8d..5f38be0ec0f3 100644 --- a/include/uapi/linux/tls.h +++ b/include/uapi/linux/tls.h @@ -84,6 +84,20 @@ #define TLS_CIPHER_CHACHA20_POLY1305_TAG_SIZE 16 #define TLS_CIPHER_CHACHA20_POLY1305_REC_SEQ_SIZE 8 +#define TLS_CIPHER_SM4_GCM 55 +#define TLS_CIPHER_SM4_GCM_IV_SIZE 8 +#define TLS_CIPHER_SM4_GCM_KEY_SIZE 16 +#define TLS_CIPHER_SM4_GCM_SALT_SIZE 4 +#define TLS_CIPHER_SM4_GCM_TAG_SIZE 16 +#define TLS_CIPHER_SM4_GCM_REC_SEQ_SIZE 8 + +#define TLS_CIPHER_SM4_CCM 56 +#define TLS_CIPHER_SM4_CCM_IV_SIZE 8 +#define TLS_CIPHER_SM4_CCM_KEY_SIZE 16 +#define TLS_CIPHER_SM4_CCM_SALT_SIZE 4 +#define TLS_CIPHER_SM4_CCM_TAG_SIZE 16 +#define TLS_CIPHER_SM4_CCM_REC_SEQ_SIZE 8 + #define TLS_SET_RECORD_TYPE 1 #define TLS_GET_RECORD_TYPE 2 @@ -124,6 +138,22 @@ struct tls12_crypto_info_chacha20_poly1305 { unsigned char rec_seq[TLS_CIPHER_CHACHA20_POLY1305_REC_SEQ_SIZE]; }; +struct tls12_crypto_info_sm4_gcm { + struct tls_crypto_info info; + unsigned char iv[TLS_CIPHER_SM4_GCM_IV_SIZE]; + unsigned char key[TLS_CIPHER_SM4_GCM_KEY_SIZE]; + unsigned char salt[TLS_CIPHER_SM4_GCM_SALT_SIZE]; + unsigned char rec_seq[TLS_CIPHER_SM4_GCM_REC_SEQ_SIZE]; +}; + +struct tls12_crypto_info_sm4_ccm { + struct tls_crypto_info info; + unsigned char iv[TLS_CIPHER_SM4_CCM_IV_SIZE]; + unsigned char key[TLS_CIPHER_SM4_CCM_KEY_SIZE]; + unsigned char salt[TLS_CIPHER_SM4_CCM_SALT_SIZE]; + unsigned char rec_seq[TLS_CIPHER_SM4_CCM_REC_SEQ_SIZE]; +}; + enum { TLS_INFO_UNSPEC, TLS_INFO_VERSION, -- cgit v1.2.3 From 10aceb629e198429c849d5e995c3bb1ba7a9aaa3 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Fri, 17 Sep 2021 11:29:05 -0700 Subject: bpf: Add bpf_trace_vprintk helper This helper is meant to be "bpf_trace_printk, but with proper vararg support". Follow bpf_snprintf's example and take a u64 pseudo-vararg array. Write to /sys/kernel/debug/tracing/trace_pipe using the same mechanism as bpf_trace_printk. The functionality of this helper was requested in the libbpf issue tracker [0]. [0] Closes: https://github.com/libbpf/libbpf/issues/315 Signed-off-by: Dave Marchevsky Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210917182911.2426606-4-davemarchevsky@fb.com --- include/uapi/linux/bpf.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3e9785f1064a..98ca79a67937 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4898,6 +4898,16 @@ union bpf_attr { * **-EINVAL** if *flags* is not zero. * * **-ENOENT** if architecture does not support branch records. + * + * long bpf_trace_vprintk(const char *fmt, u32 fmt_size, const void *data, u32 data_len) + * Description + * Behaves like **bpf_trace_printk**\ () helper, but takes an array of u64 + * to format and can handle more format args as a result. + * + * Arguments are to be used as in **bpf_seq_printf**\ () helper. + * Return + * The number of bytes written to the buffer, or a negative error + * in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5077,6 +5087,7 @@ union bpf_attr { FN(get_attach_cookie), \ FN(task_pt_regs), \ FN(get_branch_snapshot), \ + FN(trace_vprintk), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From a42effb0b24fcaf49513c2d7d77ef6daa9e32a6f Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Fri, 17 Sep 2021 11:29:11 -0700 Subject: bpf: Clarify data_len param in bpf_snprintf and bpf_seq_printf comments Since the data_len in these two functions is a byte len of the preceding u64 *data array, it must always be a multiple of 8. If this isn't the case both helpers error out, so let's make the requirement explicit so users don't need to infer it. Signed-off-by: Dave Marchevsky Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210917182911.2426606-10-davemarchevsky@fb.com --- include/uapi/linux/bpf.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 98ca79a67937..6fc59d61937a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4046,7 +4046,7 @@ union bpf_attr { * arguments. The *data* are a **u64** array and corresponding format string * values are stored in the array. For strings and pointers where pointees * are accessed, only the pointer values are stored in the *data* array. - * The *data_len* is the size of *data* in bytes. + * The *data_len* is the size of *data* in bytes - must be a multiple of 8. * * Formats **%s**, **%p{i,I}{4,6}** requires to read kernel memory. * Reading kernel memory may fail due to either invalid address or @@ -4751,7 +4751,8 @@ union bpf_attr { * Each format specifier in **fmt** corresponds to one u64 element * in the **data** array. For strings and pointers where pointees * are accessed, only the pointer values are stored in the *data* - * array. The *data_len* is the size of *data* in bytes. + * array. The *data_len* is the size of *data* in bytes - must be + * a multiple of 8. * * Formats **%s** and **%p{i,I}{4,6}** require to read kernel * memory. Reading kernel memory may fail due to either invalid -- cgit v1.2.3 From 55c42fa7fa331f98062c32799456420930b8bf8c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 17 Sep 2021 16:33:19 -0700 Subject: mptcp: add MPTCP_INFO getsockopt Its not compatible with multipath-tcp.org kernel one. 1. The out-of-tree implementation defines a different 'struct mptcp_info', with embedded __user addresses for additional data such as endpoint addresses. 2. Mat Martineau points out that embedded __user addresses doesn't work with BPF_CGROUP_RUN_PROG_GETSOCKOPT() which assumes that copying in optsize bytes from optval provides all data that got copied to userspace. This provides mptcp_info data for the given mptcp socket. Userspace sets optlen to the size of the structure it expects. The kernel updates it to contain the number of bytes that it copied. This allows to append more information to the structure later. Signed-off-by: Florian Westphal Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/uapi/linux/mptcp.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index f66038b9551f..3e9caeddda7e 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -193,4 +193,7 @@ enum mptcp_event_attr { #define MPTCP_RST_EBADPERF 5 #define MPTCP_RST_EMIDDLEBOX 6 +/* MPTCP socket options */ +#define MPTCP_INFO 1 + #endif /* _UAPI_MPTCP_H */ -- cgit v1.2.3 From 06f15cee369535a383c9c82ed37a25f0a413f6f1 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 17 Sep 2021 16:33:20 -0700 Subject: mptcp: add MPTCP_TCPINFO getsockopt support Allow users to retrieve TCP_INFO data of all subflows. Users need to pre-initialize a meta header that has to be prepended to the data buffer that will be filled with the tcp info data. The meta header looks like this: struct mptcp_subflow_data { __u32 size_subflow_data;/* size of this structure in userspace */ __u32 num_subflows; /* must be 0, set by kernel */ __u32 size_kernel; /* must be 0, set by kernel */ __u32 size_user; /* size of one element in data[] */ } __attribute__((aligned(8))); size_subflow_data has to be set to 'sizeof(struct mptcp_subflow_data)'. This allows to extend mptcp_subflow_data structure later on without breaking backwards compatibility. If the structure is extended later on, kernel knows where the userspace-provided meta header ends, even if userspace uses an older (smaller) version of the structure. num_subflows must be set to 0. If the getsockopt request succeeds (return value is 0), it will be updated to contain the number of active subflows for the given logical connection. size_kernel must be set to 0. If the getsockopt request is successful, it will contain the size of the 'struct tcp_info' as known by the kernel. This is informational only. size_user must be set to 'sizeof(struct tcp_info)'. This allows the kernel to only fill in the space reserved/expected by userspace. Example: struct my_tcp_info { struct mptcp_subflow_data d; struct tcp_info ti[2]; }; struct my_tcp_info ti; socklen_t olen; memset(&ti, 0, sizeof(ti)); ti.d.size_subflow_data = sizeof(struct mptcp_subflow_data); ti.d.size_user = sizeof(struct tcp_info); olen = sizeof(ti); ret = getsockopt(fd, SOL_MPTCP, MPTCP_TCPINFO, &ti, &olen); if (ret < 0) die_perror("getsockopt MPTCP_TCPINFO"); mptcp_subflow_data.num_subflows is populated with the number of subflows that exist on the kernel side for the logical mptcp connection. This allows userspace to re-try with a larger tcp_info array if the number of subflows was larger than the available space in the ti[] array. olen has to be set to the number of bytes that userspace has allocated to receive the kernel data. It will be updated to contain the real number bytes that have been copied to by the kernel. In the above example, if the number if subflows was 1, olen is equal to 'sizeof(struct mptcp_subflow_data) + sizeof(struct tcp_info). For 2 or more subflows olen is equal to 'sizeof(struct my_tcp_info)'. If there was more data that could not be copied due to lack of space in the option buffer, userspace can detect this by checking mptcp_subflow_data->num_subflows. Signed-off-by: Florian Westphal Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/uapi/linux/mptcp.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 3e9caeddda7e..3f013a513770 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -193,7 +193,15 @@ enum mptcp_event_attr { #define MPTCP_RST_EBADPERF 5 #define MPTCP_RST_EMIDDLEBOX 6 +struct mptcp_subflow_data { + __u32 size_subflow_data; /* size of this structure in userspace */ + __u32 num_subflows; /* must be 0, set by kernel */ + __u32 size_kernel; /* must be 0, set by kernel */ + __u32 size_user; /* size of one element in data[] */ +} __attribute__((aligned(8))); + /* MPTCP socket options */ -#define MPTCP_INFO 1 +#define MPTCP_INFO 1 +#define MPTCP_TCPINFO 2 #endif /* _UAPI_MPTCP_H */ -- cgit v1.2.3 From c11c5906bc0aba62a78da69035f6b30c6da6d13b Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 17 Sep 2021 16:33:21 -0700 Subject: mptcp: add MPTCP_SUBFLOW_ADDRS getsockopt support This retrieves the address pairs of all subflows currently active for a given mptcp connection. It re-uses the same meta-header as for MPTCP_TCPINFO. A new structure is provided to hold the subflow address data: struct mptcp_subflow_addrs { union { __kernel_sa_family_t sa_family; struct sockaddr sa_local; struct sockaddr_in sin_local; struct sockaddr_in6 sin6_local; struct sockaddr_storage ss_local; }; union { struct sockaddr sa_remote; struct sockaddr_in sin_remote; struct sockaddr_in6 sin6_remote; struct sockaddr_storage ss_remote; }; }; Usage of the new getsockopt is very similar to MPTCP_TCPINFO one. Userspace allocates a 'struct mptcp_subflow_data', followed by one or more 'struct mptcp_subflow_addrs', then inits the mptcp_subflow_data structure as follows: struct mptcp_subflow_addrs *sf_addr; struct mptcp_subflow_data *addr; socklen_t olen = sizeof(*addr) + (8 * sizeof(*sf_addr)); addr = malloc(olen); addr->size_subflow_data = sizeof(*addr); addr->num_subflows = 0; addr->size_kernel = 0; addr->size_user = sizeof(struct mptcp_subflow_addrs); sf_addr = (struct mptcp_subflow_addrs *)(addr + 1); and then retrieves the endpoint addresses via: ret = getsockopt(fd, SOL_MPTCP, MPTCP_SUBFLOW_ADDRS, addr, &olen); If the call succeeds, kernel will have added up to 8 endpoint addresses after the 'mptcp_subflow_data' header. Userspace needs to re-check 'olen' value to detect how many bytes have been filled in by the kernel. Userspace can check addr->num_subflows to discover when there were more subflows that available data space. Co-developed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: Florian Westphal Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/uapi/linux/mptcp.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 3f013a513770..c8cc46f80a16 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -4,6 +4,13 @@ #include #include +#include /* for sockaddr_in */ +#include /* for sockaddr_in6 */ +#include /* for sockaddr_storage and sa_family */ + +#ifndef __KERNEL__ +#include /* for struct sockaddr */ +#endif #define MPTCP_SUBFLOW_FLAG_MCAP_REM _BITUL(0) #define MPTCP_SUBFLOW_FLAG_MCAP_LOC _BITUL(1) @@ -200,8 +207,25 @@ struct mptcp_subflow_data { __u32 size_user; /* size of one element in data[] */ } __attribute__((aligned(8))); +struct mptcp_subflow_addrs { + union { + __kernel_sa_family_t sa_family; + struct sockaddr sa_local; + struct sockaddr_in sin_local; + struct sockaddr_in6 sin6_local; + struct __kernel_sockaddr_storage ss_local; + }; + union { + struct sockaddr sa_remote; + struct sockaddr_in sin_remote; + struct sockaddr_in6 sin6_remote; + struct __kernel_sockaddr_storage ss_remote; + }; +}; + /* MPTCP socket options */ #define MPTCP_INFO 1 #define MPTCP_TCPINFO 2 +#define MPTCP_SUBFLOW_ADDRS 3 #endif /* _UAPI_MPTCP_H */ -- cgit v1.2.3 From e306784a8de08868d0ecbf78dd42a0051d0e14ce Mon Sep 17 00:00:00 2001 From: Subrat Mishra Date: Wed, 15 Sep 2021 11:22:23 +0530 Subject: cfg80211: AP mode driver offload for FILS association crypto Add a driver FILS crypto offload extended capability flag to indicate that the driver running in AP mode is capable of handling encryption and decryption of (Re)Association request and response frames. Add a command to set FILS AAD data to driver. This feature is supported on drivers running in AP mode only. This extended capability is exchanged with hostapd during cfg80211 init. If the driver indicates this capability, then before sending the Authentication response frame, hostapd sets FILS AAD data to the driver. This allows the driver to decrypt (Re)Association Request frame and encrypt (Re)Association Response frame. FILS Key derivation will still be done in hostapd. Signed-off-by: Subrat Mishra Link: https://lore.kernel.org/r/1631685143-13530-1-git-send-email-subratm@codeaurora.org [fix whitespace] Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index c2efea98e060..e89bbf856228 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -300,6 +300,29 @@ * the interface goes down. */ +/** + * DOC: FILS shared key crypto offload + * + * This feature is applicable to drivers running in AP mode. + * + * FILS shared key crypto offload can be advertised by drivers by setting + * @NL80211_EXT_FEATURE_FILS_CRYPTO_OFFLOAD flag. The drivers that support + * FILS shared key crypto offload should be able to encrypt and decrypt + * association frames for FILS shared key authentication as per IEEE 802.11ai. + * With this capability, for FILS key derivation, drivers depend on userspace. + * + * After FILS key derivation, userspace shares the FILS AAD details with the + * driver and the driver stores the same to use in decryption of association + * request and in encryption of association response. The below parameters + * should be given to the driver in %NL80211_CMD_SET_FILS_AAD. + * %NL80211_ATTR_MAC - STA MAC address, used for storing FILS AAD per STA + * %NL80211_ATTR_FILS_KEK - Used for encryption or decryption + * %NL80211_ATTR_FILS_NONCES - Used for encryption or decryption + * (STA Nonce 16 bytes followed by AP Nonce 16 bytes) + * + * Once the association is done, the driver cleans the FILS AAD data. + */ + /** * enum nl80211_commands - supported nl80211 commands * @@ -1200,6 +1223,12 @@ * @NL80211_CMD_COLOR_CHANGE_COMPLETED: Notify userland that the color change * has completed * + * @NL80211_CMD_SET_FILS_AAD: Set FILS AAD data to the driver using - + * &NL80211_ATTR_MAC - for STA MAC address + * &NL80211_ATTR_FILS_KEK - for KEK + * &NL80211_ATTR_FILS_NONCES - for FILS Nonces + * (STA Nonce 16 bytes followed by AP Nonce 16 bytes) + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -1440,6 +1469,8 @@ enum nl80211_commands { NL80211_CMD_COLOR_CHANGE_ABORTED, NL80211_CMD_COLOR_CHANGE_COMPLETED, + NL80211_CMD_SET_FILS_AAD, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -5995,6 +6026,11 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_BSS_COLOR: The driver supports BSS color collision * detection and change announcemnts. * + * @NL80211_EXT_FEATURE_FILS_CRYPTO_OFFLOAD: Driver running in AP mode supports + * FILS encryption and decryption for (Re)Association Request and Response + * frames. Userspace has to share FILS AAD details to the driver by using + * @NL80211_CMD_SET_FILS_AAD. + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -6060,6 +6096,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_SECURE_RTT, NL80211_EXT_FEATURE_PROT_RANGE_NEGO_AND_MEASURE, NL80211_EXT_FEATURE_BSS_COLOR, + NL80211_EXT_FEATURE_FILS_CRYPTO_OFFLOAD, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, -- cgit v1.2.3 From dc1e3cb8da8b414b37208b2fb6755fef8122504b Mon Sep 17 00:00:00 2001 From: John Crispin Date: Wed, 15 Sep 2021 19:54:34 -0700 Subject: nl80211: MBSSID and EMA support in AP mode Add new attributes to configure support for multiple BSSID and advanced multi-BSSID advertisements (EMA) in AP mode. - NL80211_ATTR_MBSSID_CONFIG used for per interface configuration. - NL80211_ATTR_MBSSID_ELEMS used to MBSSID elements for beacons. Memory for the elements is allocated dynamically. This change frees the memory in existing functions which call nl80211_parse_beacon(), a comment is added to indicate the new references to do the same. Signed-off-by: John Crispin Co-developed-by: Aloka Dixit Signed-off-by: Aloka Dixit Link: https://lore.kernel.org/r/20210916025437.29138-2-alokad@codeaurora.org [don't leave ERR_PTR hanging around] Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 76 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 75 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index e89bbf856228..eda608b1eb09 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -360,7 +360,10 @@ * @NL80211_CMD_DEL_INTERFACE: Virtual interface was deleted, has attributes * %NL80211_ATTR_IFINDEX and %NL80211_ATTR_WIPHY. Can also be sent from * userspace to request deletion of a virtual interface, then requires - * attribute %NL80211_ATTR_IFINDEX. + * attribute %NL80211_ATTR_IFINDEX. If multiple BSSID advertisements are + * enabled using %NL80211_ATTR_MBSSID_CONFIG, %NL80211_ATTR_MBSSID_ELEMS, + * and if this command is used for the transmitting interface, then all + * the non-transmitting interfaces are deleted as well. * * @NL80211_CMD_GET_KEY: Get sequence counter information for a key specified * by %NL80211_ATTR_KEY_IDX and/or %NL80211_ATTR_MAC. @@ -2624,6 +2627,18 @@ enum nl80211_commands { * @NL80211_ATTR_COLOR_CHANGE_ELEMS: Nested set of attributes containing the IE * information for the time while performing a color switch. * + * @NL80211_ATTR_MBSSID_CONFIG: Nested attribute for multiple BSSID + * advertisements (MBSSID) parameters in AP mode. + * Kernel uses this attribute to indicate the driver's support for MBSSID + * and enhanced multi-BSSID advertisements (EMA AP) to the userspace. + * Userspace should use this attribute to configure per interface MBSSID + * parameters. + * See &enum nl80211_mbssid_config_attributes for details. + * + * @NL80211_ATTR_MBSSID_ELEMS: Nested parameter to pass multiple BSSID elements. + * Mandatory parameter for the transmitting interface to enable MBSSID. + * Optional for the non-transmitting interfaces. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3127,6 +3142,9 @@ enum nl80211_attrs { NL80211_ATTR_COLOR_CHANGE_COLOR, NL80211_ATTR_COLOR_CHANGE_ELEMS, + NL80211_ATTR_MBSSID_CONFIG, + NL80211_ATTR_MBSSID_ELEMS, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -7386,4 +7404,60 @@ enum nl80211_sar_specs_attrs { NL80211_SAR_ATTR_SPECS_MAX = __NL80211_SAR_ATTR_SPECS_LAST - 1, }; +/** + * enum nl80211_mbssid_config_attributes - multiple BSSID (MBSSID) and enhanced + * multi-BSSID advertisements (EMA) in AP mode. + * Kernel uses some of these attributes to advertise driver's support for + * MBSSID and EMA. + * Remaining attributes should be used by the userspace to configure the + * features. + * + * @__NL80211_MBSSID_CONFIG_ATTR_INVALID: Invalid + * + * @NL80211_MBSSID_CONFIG_ATTR_MAX_INTERFACES: Used by the kernel to advertise + * the maximum number of MBSSID interfaces supported by the driver. + * Driver should indicate MBSSID support by setting + * wiphy->mbssid_max_interfaces to a value more than or equal to 2. + * + * @NL80211_MBSSID_CONFIG_ATTR_MAX_EMA_PROFILE_PERIODICITY: Used by the kernel + * to advertise the maximum profile periodicity supported by the driver + * if EMA is enabled. Driver should indicate EMA support to the userspace + * by setting wiphy->mbssid_max_ema_profile_periodicity to + * a non-zero value. + * + * @NL80211_MBSSID_CONFIG_ATTR_INDEX: Mandatory parameter to pass the index of + * this BSS (u8) in the multiple BSSID set. + * Value must be set to 0 for the transmitting interface and non-zero for + * all non-transmitting interfaces. The userspace will be responsible + * for using unique indices for the interfaces. + * Range: 0 to wiphy->mbssid_max_interfaces-1. + * + * @NL80211_MBSSID_CONFIG_ATTR_TX_IFINDEX: Mandatory parameter for + * a non-transmitted profile which provides the interface index (u32) of + * the transmitted profile. The value must match one of the interface + * indices advertised by the kernel. Optional if the interface being set up + * is the transmitting one, however, if provided then the value must match + * the interface index of the same. + * + * @NL80211_MBSSID_CONFIG_ATTR_EMA: Flag used to enable EMA AP feature. + * Setting this flag is permitted only if the driver advertises EMA support + * by setting wiphy->mbssid_max_ema_profile_periodicity to non-zero. + * + * @__NL80211_MBSSID_CONFIG_ATTR_LAST: Internal + * @NL80211_MBSSID_CONFIG_ATTR_MAX: highest attribute + */ +enum nl80211_mbssid_config_attributes { + __NL80211_MBSSID_CONFIG_ATTR_INVALID, + + NL80211_MBSSID_CONFIG_ATTR_MAX_INTERFACES, + NL80211_MBSSID_CONFIG_ATTR_MAX_EMA_PROFILE_PERIODICITY, + NL80211_MBSSID_CONFIG_ATTR_INDEX, + NL80211_MBSSID_CONFIG_ATTR_TX_IFINDEX, + NL80211_MBSSID_CONFIG_ATTR_EMA, + + /* keep last */ + __NL80211_MBSSID_CONFIG_ATTR_LAST, + NL80211_MBSSID_CONFIG_ATTR_MAX = __NL80211_MBSSID_CONFIG_ATTR_LAST - 1, +}; + #endif /* __LINUX_NL80211_H */ -- cgit v1.2.3 From a70e3f024d5f4ec7edb17ab5d927eb55397f1d15 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Thu, 30 Sep 2021 14:21:04 -0700 Subject: devlink: report maximum number of snapshots with regions Each region has an independently configurable number of maximum snapshots. This information is not reported to userspace, making it not very discoverable. Fix this by adding a new DEVLINK_ATTR_REGION_MAX_SNAPSHOST attribute which is used to report this maximum. Ex: $devlink region pci/0000:af:00.0/nvm-flash: size 10485760 snapshot [] max 1 pci/0000:af:00.0/device-caps: size 4096 snapshot [] max 10 pci/0000:af:00.1/nvm-flash: size 10485760 snapshot [] max 1 pci/0000:af:00.1/device-caps: size 4096 snapshot [] max 10 This information enables users to understand why a new region command may fail due to having too many existing snapshots. Reported-by: Gurucharan G (A Contingent worker at Intel) Signed-off-by: Jacob Keller Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/uapi/linux/devlink.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 32f53a0069d6..b897b80770f6 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -551,6 +551,8 @@ enum devlink_attr { DEVLINK_ATTR_RATE_NODE_NAME, /* string */ DEVLINK_ATTR_RATE_PARENT_NODE_NAME, /* string */ + DEVLINK_ATTR_REGION_MAX_SNAPSHOTS, /* u32 */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, -- cgit v1.2.3 From 8cb3bf8bff3c47e171f6b66f9ccfc3f1451a11a2 Mon Sep 17 00:00:00 2001 From: Justin Iurman Date: Sun, 3 Oct 2021 20:45:38 +0200 Subject: ipv6: ioam: Add support for the ip6ip6 encapsulation This patch adds support for the ip6ip6 encapsulation by providing three encap modes: inline, encap and auto. Signed-off-by: Justin Iurman Signed-off-by: David S. Miller --- include/uapi/linux/ioam6_iptunnel.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ioam6_iptunnel.h b/include/uapi/linux/ioam6_iptunnel.h index bae14636a8c8..829ffdfcacca 100644 --- a/include/uapi/linux/ioam6_iptunnel.h +++ b/include/uapi/linux/ioam6_iptunnel.h @@ -9,9 +9,38 @@ #ifndef _UAPI_LINUX_IOAM6_IPTUNNEL_H #define _UAPI_LINUX_IOAM6_IPTUNNEL_H +/* Encap modes: + * - inline: direct insertion + * - encap: ip6ip6 encapsulation + * - auto: inline for local packets, encap for in-transit packets + */ +enum { + __IOAM6_IPTUNNEL_MODE_MIN, + + IOAM6_IPTUNNEL_MODE_INLINE, + IOAM6_IPTUNNEL_MODE_ENCAP, + IOAM6_IPTUNNEL_MODE_AUTO, + + __IOAM6_IPTUNNEL_MODE_MAX, +}; + +#define IOAM6_IPTUNNEL_MODE_MIN (__IOAM6_IPTUNNEL_MODE_MIN + 1) +#define IOAM6_IPTUNNEL_MODE_MAX (__IOAM6_IPTUNNEL_MODE_MAX - 1) + enum { IOAM6_IPTUNNEL_UNSPEC, + + /* Encap mode */ + IOAM6_IPTUNNEL_MODE, /* u8 */ + + /* Tunnel dst address. + * For encap,auto modes. + */ + IOAM6_IPTUNNEL_DST, /* struct in6_addr */ + + /* IOAM Trace Header */ IOAM6_IPTUNNEL_TRACE, /* struct ioam6_trace_hdr */ + __IOAM6_IPTUNNEL_MAX, }; -- cgit v1.2.3 From 353407d917b2d87cd8104a0453d012439c6ca4be Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 6 Oct 2021 13:46:42 +0300 Subject: ethtool: Add ability to control transceiver modules' power mode Add a pair of new ethtool messages, 'ETHTOOL_MSG_MODULE_SET' and 'ETHTOOL_MSG_MODULE_GET', that can be used to control transceiver modules parameters and retrieve their status. The first parameter to control is the power mode of the module. It is only relevant for paged memory modules, as flat memory modules always operate in low power mode. When a paged memory module is in low power mode, its power consumption is reduced to the minimum, the management interface towards the host is available and the data path is deactivated. User space can choose to put modules that are not currently in use in low power mode and transition them to high power mode before putting the associated ports administratively up. This is useful for user space that favors reduced power consumption and lower temperatures over reduced link up times. In QSFP-DD modules the transition from low power mode to high power mode can take a few seconds and this transition is only expected to get longer with future / more complex modules. User space can control the power mode of the module via the power mode policy attribute ('ETHTOOL_A_MODULE_POWER_MODE_POLICY'). Possible values: * high: Module is always in high power mode. * auto: Module is transitioned by the host to high power mode when the first port using it is put administratively up and to low power mode when the last port using it is put administratively down. The operational power mode of the module is available to user space via the 'ETHTOOL_A_MODULE_POWER_MODE' attribute. The attribute is not reported to user space when a module is not plugged-in. The user API is designed to be generic enough so that it could be used for modules with different memory maps (e.g., SFF-8636, CMIS). The only implementation of the device driver API in this series is for a MAC driver (mlxsw) where the module is controlled by the device's firmware, but it is designed to be generic enough so that it could also be used by implementations where the module is controlled by the CPU. CMIS testing ============ # ethtool -m swp11 Identifier : 0x18 (QSFP-DD Double Density 8X Pluggable Transceiver (INF-8628)) ... Module State : 0x03 (ModuleReady) LowPwrAllowRequestHW : Off LowPwrRequestSW : Off The module is not in low power mode, as it is not forced by hardware (LowPwrAllowRequestHW is off) or by software (LowPwrRequestSW is off). The power mode can be queried from the kernel. In case LowPwrAllowRequestHW was on, the kernel would need to take into account the state of the LowPwrRequestHW signal, which is not visible to user space. $ ethtool --show-module swp11 Module parameters for swp11: power-mode-policy high power-mode high Change the power mode policy to 'auto': # ethtool --set-module swp11 power-mode-policy auto Query the power mode again: $ ethtool --show-module swp11 Module parameters for swp11: power-mode-policy auto power-mode low Verify with the data read from the EEPROM: # ethtool -m swp11 Identifier : 0x18 (QSFP-DD Double Density 8X Pluggable Transceiver (INF-8628)) ... Module State : 0x01 (ModuleLowPwr) LowPwrAllowRequestHW : Off LowPwrRequestSW : On Put the associated port administratively up which will instruct the host to transition the module to high power mode: # ip link set dev swp11 up Query the power mode again: $ ethtool --show-module swp11 Module parameters for swp11: power-mode-policy auto power-mode high Verify with the data read from the EEPROM: # ethtool -m swp11 Identifier : 0x18 (QSFP-DD Double Density 8X Pluggable Transceiver (INF-8628)) ... Module State : 0x03 (ModuleReady) LowPwrAllowRequestHW : Off LowPwrRequestSW : Off Put the associated port administratively down which will instruct the host to transition the module to low power mode: # ip link set dev swp11 down Query the power mode again: $ ethtool --show-module swp11 Module parameters for swp11: power-mode-policy auto power-mode low Verify with the data read from the EEPROM: # ethtool -m swp11 Identifier : 0x18 (QSFP-DD Double Density 8X Pluggable Transceiver (INF-8628)) ... Module State : 0x01 (ModuleLowPwr) LowPwrAllowRequestHW : Off LowPwrRequestSW : On SFF-8636 testing ================ # ethtool -m swp13 Identifier : 0x11 (QSFP28) ... Extended identifier description : 5.0W max. Power consumption, High Power Class (> 3.5 W) enabled Power set : Off Power override : On ... Transmit avg optical power (Channel 1) : 0.7733 mW / -1.12 dBm Transmit avg optical power (Channel 2) : 0.7649 mW / -1.16 dBm Transmit avg optical power (Channel 3) : 0.7790 mW / -1.08 dBm Transmit avg optical power (Channel 4) : 0.7837 mW / -1.06 dBm Rcvr signal avg optical power(Channel 1) : 0.9302 mW / -0.31 dBm Rcvr signal avg optical power(Channel 2) : 0.9079 mW / -0.42 dBm Rcvr signal avg optical power(Channel 3) : 0.8993 mW / -0.46 dBm Rcvr signal avg optical power(Channel 4) : 0.8778 mW / -0.57 dBm The module is not in low power mode, as it is not forced by hardware (Power override is on) or by software (Power set is off). The power mode can be queried from the kernel. In case Power override was off, the kernel would need to take into account the state of the LPMode signal, which is not visible to user space. $ ethtool --show-module swp13 Module parameters for swp13: power-mode-policy high power-mode high Change the power mode policy to 'auto': # ethtool --set-module swp13 power-mode-policy auto Query the power mode again: $ ethtool --show-module swp13 Module parameters for swp13: power-mode-policy auto power-mode low Verify with the data read from the EEPROM: # ethtool -m swp13 Identifier : 0x11 (QSFP28) Extended identifier description : 5.0W max. Power consumption, High Power Class (> 3.5 W) not enabled Power set : On Power override : On ... Transmit avg optical power (Channel 1) : 0.0000 mW / -inf dBm Transmit avg optical power (Channel 2) : 0.0000 mW / -inf dBm Transmit avg optical power (Channel 3) : 0.0000 mW / -inf dBm Transmit avg optical power (Channel 4) : 0.0000 mW / -inf dBm Rcvr signal avg optical power(Channel 1) : 0.0000 mW / -inf dBm Rcvr signal avg optical power(Channel 2) : 0.0000 mW / -inf dBm Rcvr signal avg optical power(Channel 3) : 0.0000 mW / -inf dBm Rcvr signal avg optical power(Channel 4) : 0.0000 mW / -inf dBm Put the associated port administratively up which will instruct the host to transition the module to high power mode: # ip link set dev swp13 up Query the power mode again: $ ethtool --show-module swp13 Module parameters for swp13: power-mode-policy auto power-mode high Verify with the data read from the EEPROM: # ethtool -m swp13 Identifier : 0x11 (QSFP28) ... Extended identifier description : 5.0W max. Power consumption, High Power Class (> 3.5 W) enabled Power set : Off Power override : On ... Transmit avg optical power (Channel 1) : 0.7934 mW / -1.01 dBm Transmit avg optical power (Channel 2) : 0.7859 mW / -1.05 dBm Transmit avg optical power (Channel 3) : 0.7885 mW / -1.03 dBm Transmit avg optical power (Channel 4) : 0.7985 mW / -0.98 dBm Rcvr signal avg optical power(Channel 1) : 0.9325 mW / -0.30 dBm Rcvr signal avg optical power(Channel 2) : 0.9034 mW / -0.44 dBm Rcvr signal avg optical power(Channel 3) : 0.9086 mW / -0.42 dBm Rcvr signal avg optical power(Channel 4) : 0.8885 mW / -0.51 dBm Put the associated port administratively down which will instruct the host to transition the module to low power mode: # ip link set dev swp13 down Query the power mode again: $ ethtool --show-module swp13 Module parameters for swp13: power-mode-policy auto power-mode low Verify with the data read from the EEPROM: # ethtool -m swp13 Identifier : 0x11 (QSFP28) ... Extended identifier description : 5.0W max. Power consumption, High Power Class (> 3.5 W) not enabled Power set : On Power override : On ... Transmit avg optical power (Channel 1) : 0.0000 mW / -inf dBm Transmit avg optical power (Channel 2) : 0.0000 mW / -inf dBm Transmit avg optical power (Channel 3) : 0.0000 mW / -inf dBm Transmit avg optical power (Channel 4) : 0.0000 mW / -inf dBm Rcvr signal avg optical power(Channel 1) : 0.0000 mW / -inf dBm Rcvr signal avg optical power(Channel 2) : 0.0000 mW / -inf dBm Rcvr signal avg optical power(Channel 3) : 0.0000 mW / -inf dBm Rcvr signal avg optical power(Channel 4) : 0.0000 mW / -inf dBm Signed-off-by: Ido Schimmel Signed-off-by: Jakub Kicinski --- include/uapi/linux/ethtool.h | 23 +++++++++++++++++++++++ include/uapi/linux/ethtool_netlink.h | 17 +++++++++++++++++ 2 files changed, 40 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index b6db6590baf0..6de61d53ca5d 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -706,6 +706,29 @@ enum ethtool_stringset { ETH_SS_COUNT }; +/** + * enum ethtool_module_power_mode_policy - plug-in module power mode policy + * @ETHTOOL_MODULE_POWER_MODE_POLICY_HIGH: Module is always in high power mode. + * @ETHTOOL_MODULE_POWER_MODE_POLICY_AUTO: Module is transitioned by the host + * to high power mode when the first port using it is put administratively + * up and to low power mode when the last port using it is put + * administratively down. + */ +enum ethtool_module_power_mode_policy { + ETHTOOL_MODULE_POWER_MODE_POLICY_HIGH = 1, + ETHTOOL_MODULE_POWER_MODE_POLICY_AUTO, +}; + +/** + * enum ethtool_module_power_mode - plug-in module power mode + * @ETHTOOL_MODULE_POWER_MODE_LOW: Module is in low power mode. + * @ETHTOOL_MODULE_POWER_MODE_HIGH: Module is in high power mode. + */ +enum ethtool_module_power_mode { + ETHTOOL_MODULE_POWER_MODE_LOW = 1, + ETHTOOL_MODULE_POWER_MODE_HIGH, +}; + /** * struct ethtool_gstrings - string set for data tagging * @cmd: Command number = %ETHTOOL_GSTRINGS diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 5545f1ca9237..ca5fbb59fa42 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -47,6 +47,8 @@ enum { ETHTOOL_MSG_MODULE_EEPROM_GET, ETHTOOL_MSG_STATS_GET, ETHTOOL_MSG_PHC_VCLOCKS_GET, + ETHTOOL_MSG_MODULE_GET, + ETHTOOL_MSG_MODULE_SET, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, @@ -90,6 +92,8 @@ enum { ETHTOOL_MSG_MODULE_EEPROM_GET_REPLY, ETHTOOL_MSG_STATS_GET_REPLY, ETHTOOL_MSG_PHC_VCLOCKS_GET_REPLY, + ETHTOOL_MSG_MODULE_GET_REPLY, + ETHTOOL_MSG_MODULE_NTF, /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, @@ -833,6 +837,19 @@ enum { ETHTOOL_A_STATS_RMON_MAX = (__ETHTOOL_A_STATS_RMON_CNT - 1) }; +/* MODULE */ + +enum { + ETHTOOL_A_MODULE_UNSPEC, + ETHTOOL_A_MODULE_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_MODULE_POWER_MODE_POLICY, /* u8 */ + ETHTOOL_A_MODULE_POWER_MODE, /* u8 */ + + /* add new constants above here */ + __ETHTOOL_A_MODULE_CNT, + ETHTOOL_A_MODULE_MAX = (__ETHTOOL_A_MODULE_CNT - 1) +}; + /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 -- cgit v1.2.3 From 3dfb51126064b594470b9c0b278188fbc9194709 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 6 Oct 2021 13:46:46 +0300 Subject: ethtool: Add transceiver module extended state Add an extended state and sub-state to describe link issues related to transceiver modules. The 'ETHTOOL_LINK_EXT_SUBSTATE_MODULE_CMIS_NOT_READY' extended sub-state tells user space that port is unable to gain a carrier because the CMIS Module State Machine did not reach the ModuleReady (Fully Operational) state. For example, if the module is stuck at ModuleLowPwr or ModuleFault state. In case of the latter, user space can read the fault reason from the module's EEPROM and potentially reset it. Signed-off-by: Ido Schimmel Signed-off-by: Jakub Kicinski --- include/uapi/linux/ethtool.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 6de61d53ca5d..a2223b685451 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -603,6 +603,7 @@ enum ethtool_link_ext_state { ETHTOOL_LINK_EXT_STATE_CALIBRATION_FAILURE, ETHTOOL_LINK_EXT_STATE_POWER_BUDGET_EXCEEDED, ETHTOOL_LINK_EXT_STATE_OVERHEAT, + ETHTOOL_LINK_EXT_STATE_MODULE, }; /* More information in addition to ETHTOOL_LINK_EXT_STATE_AUTONEG. */ @@ -649,6 +650,11 @@ enum ethtool_link_ext_substate_cable_issue { ETHTOOL_LINK_EXT_SUBSTATE_CI_CABLE_TEST_FAILURE, }; +/* More information in addition to ETHTOOL_LINK_EXT_STATE_MODULE. */ +enum ethtool_link_ext_substate_module { + ETHTOOL_LINK_EXT_SUBSTATE_MODULE_CMIS_NOT_READY = 1, +}; + #define ETH_GSTRING_LEN 32 /** -- cgit v1.2.3 From 4c1e34c0dbffb17accdfe16ac97ab432df9024ff Mon Sep 17 00:00:00 2001 From: Richard Palethorpe Date: Fri, 8 Oct 2021 11:00:53 +0100 Subject: vsock: Enable y2038 safe timeval for timeout Reuse the timeval compat code from core/sock to handle 32-bit and 64-bit timeval structures. Also introduce a new socket option define to allow using y2038 safe timeval under 32-bit. The existing behavior of sock_set_timeout and vsock's timeout setter differ when the time value is out of bounds. vsocks current behavior is retained at the expense of not being able to share the full implementation. This allows the LTP test vsock01 to pass under 32-bit compat mode. Fixes: fe0c72f3db11 ("socket: move compat timeout handling into sock.c") Signed-off-by: Richard Palethorpe Cc: Richard Palethorpe Reviewed-by: Arnd Bergmann Signed-off-by: David S. Miller --- include/uapi/linux/vm_sockets.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/vm_sockets.h b/include/uapi/linux/vm_sockets.h index 46918a1852d7..c60ca33eac59 100644 --- a/include/uapi/linux/vm_sockets.h +++ b/include/uapi/linux/vm_sockets.h @@ -64,7 +64,7 @@ * timeout for a STREAM socket. */ -#define SO_VM_SOCKETS_CONNECT_TIMEOUT 6 +#define SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD 6 /* Option name for using non-blocking send/receive. Use as the option name * for setsockopt(3) or getsockopt(3) to set or get the non-blocking @@ -81,6 +81,17 @@ #define SO_VM_SOCKETS_NONBLOCK_TXRX 7 +#define SO_VM_SOCKETS_CONNECT_TIMEOUT_NEW 8 + +#if !defined(__KERNEL__) +#if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__)) +#define SO_VM_SOCKETS_CONNECT_TIMEOUT SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD +#else +#define SO_VM_SOCKETS_CONNECT_TIMEOUT \ + (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD : SO_VM_SOCKETS_CONNECT_TIMEOUT_NEW) +#endif +#endif + /* The vSocket equivalent of INADDR_ANY. This works for the svm_cid field of * sockaddr_vm and indicates the context ID of the current endpoint. */ -- cgit v1.2.3 From 2c611ad97a82b51221bb0920cc6cac0b1d4c0e52 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Mon, 11 Oct 2021 14:12:37 +0200 Subject: net, neigh: Extend neigh->flags to 32 bit to allow for extensions Currently, all bits in struct ndmsg's ndm_flags are used up with the most recent addition of 435f2e7cc0b7 ("net: bridge: add support for sticky fdb entries"). This makes it impossible to extend the neighboring subsystem with new NTF_* flags: struct ndmsg { __u8 ndm_family; __u8 ndm_pad1; __u16 ndm_pad2; __s32 ndm_ifindex; __u16 ndm_state; __u8 ndm_flags; __u8 ndm_type; }; There are ndm_pad{1,2} attributes which are not used. However, due to uncareful design, the kernel does not enforce them to be zero upon new neighbor entry addition, and given they've been around forever, it is not possible to reuse them today due to risk of breakage. One option to overcome this limitation is to add a new NDA_FLAGS_EXT attribute for extended flags. In struct neighbour, there is a 3 byte hole between protocol and ha_lock, which allows neigh->flags to be extended from 8 to 32 bits while still being on the same cacheline as before. This also allows for all future NTF_* flags being in neigh->flags rather than yet another flags field. Unknown flags in NDA_FLAGS_EXT will be rejected by the kernel. Co-developed-by: Daniel Borkmann Signed-off-by: Daniel Borkmann Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/uapi/linux/neighbour.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h index 00a60695fa53..a80cca141855 100644 --- a/include/uapi/linux/neighbour.h +++ b/include/uapi/linux/neighbour.h @@ -31,6 +31,7 @@ enum { NDA_PROTOCOL, /* Originator of entry */ NDA_NH_ID, NDA_FDB_EXT_ATTRS, + NDA_FLAGS_EXT, __NDA_MAX }; -- cgit v1.2.3 From 7482e3841d520a368426ac196720601687e2dc47 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 11 Oct 2021 14:12:38 +0200 Subject: net, neigh: Add NTF_MANAGED flag for managed neighbor entries Allow a user space control plane to insert entries with a new NTF_EXT_MANAGED flag. The flag then indicates to the kernel that the neighbor entry should be periodically probed for keeping the entry in NUD_REACHABLE state iff possible. The use case for this is targeting XDP or tc BPF load-balancers which use the bpf_fib_lookup() BPF helper in order to piggyback on neighbor resolution for their backends. Given they cannot be resolved in fast-path, a control plane inserts the L3 (without L2) entries manually into the neighbor table and lets the kernel do the neighbor resolution either on the gateway or on the backend directly in case the latter resides in the same L2. This avoids to deal with L2 in the control plane and to rebuild what the kernel already does best anyway. NTF_EXT_MANAGED can be combined with NTF_EXT_LEARNED in order to avoid GC eviction. The kernel then adds NTF_MANAGED flagged entries to a per-neighbor table which gets triggered by the system work queue to periodically call neigh_event_send() for performing the resolution. The implementation allows migration from/to NTF_MANAGED neighbor entries, so that already existing entries can be converted by the control plane if needed. Potentially, we could make the interval for periodically calling neigh_event_send() configurable; right now it's set to DELAY_PROBE_TIME which is also in line with mlxsw which has similar driver-internal infrastructure c723c735fa6b ("mlxsw: spectrum_router: Periodically update the kernel's neigh table"). In future, the latter could possibly reuse the NTF_MANAGED neighbors as well. Example: # ./ip/ip n replace 192.168.178.30 dev enp5s0 managed extern_learn # ./ip/ip n 192.168.178.30 dev enp5s0 lladdr f4:8c:50:5e:71:9a managed extern_learn REACHABLE [...] Signed-off-by: Daniel Borkmann Acked-by: Roopa Prabhu Link: https://linuxplumbersconf.org/event/11/contributions/953/ Signed-off-by: David S. Miller --- include/uapi/linux/neighbour.h | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h index a80cca141855..db05fb55055e 100644 --- a/include/uapi/linux/neighbour.h +++ b/include/uapi/linux/neighbour.h @@ -41,14 +41,16 @@ enum { * Neighbor Cache Entry Flags */ -#define NTF_USE 0x01 -#define NTF_SELF 0x02 -#define NTF_MASTER 0x04 -#define NTF_PROXY 0x08 /* == ATF_PUBL */ -#define NTF_EXT_LEARNED 0x10 -#define NTF_OFFLOADED 0x20 -#define NTF_STICKY 0x40 -#define NTF_ROUTER 0x80 +#define NTF_USE (1 << 0) +#define NTF_SELF (1 << 1) +#define NTF_MASTER (1 << 2) +#define NTF_PROXY (1 << 3) /* == ATF_PUBL */ +#define NTF_EXT_LEARNED (1 << 4) +#define NTF_OFFLOADED (1 << 5) +#define NTF_STICKY (1 << 6) +#define NTF_ROUTER (1 << 7) +/* Extended flags under NDA_FLAGS_EXT: */ +#define NTF_EXT_MANAGED (1 << 0) /* * Neighbor Cache Entry States. @@ -66,12 +68,22 @@ enum { #define NUD_PERMANENT 0x80 #define NUD_NONE 0x00 -/* NUD_NOARP & NUD_PERMANENT are pseudostates, they never change - * and make no address resolution or NUD. - * NUD_PERMANENT also cannot be deleted by garbage collectors. +/* NUD_NOARP & NUD_PERMANENT are pseudostates, they never change and make no + * address resolution or NUD. + * + * NUD_PERMANENT also cannot be deleted by garbage collectors. This holds true + * for dynamic entries with NTF_EXT_LEARNED flag as well. However, upon carrier + * down event, NUD_PERMANENT entries are not flushed whereas NTF_EXT_LEARNED + * flagged entries explicitly are (which is also consistent with the routing + * subsystem). + * * When NTF_EXT_LEARNED is set for a bridge fdb entry the different cache entry * states don't make sense and thus are ignored. Such entries don't age and * can roam. + * + * NTF_EXT_MANAGED flagged neigbor entries are managed by the kernel on behalf + * of a user space control plane, and automatically refreshed so that (if + * possible) they remain in NUD_REACHABLE state. */ struct nda_cacheinfo { -- cgit v1.2.3 From 42df6e1d221dddc0f2acf2be37e68d553ad65f96 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Fri, 8 Oct 2021 22:06:03 +0200 Subject: netfilter: Introduce egress hook MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Support classifying packets with netfilter on egress to satisfy user requirements such as: * outbound security policies for containers (Laura) * filtering and mangling intra-node Direct Server Return (DSR) traffic on a load balancer (Laura) * filtering locally generated traffic coming in through AF_PACKET, such as local ARP traffic generated for clustering purposes or DHCP (Laura; the AF_PACKET plumbing is contained in a follow-up commit) * L2 filtering from ingress and egress for AVB (Audio Video Bridging) and gPTP with nftables (Pablo) * in the future: in-kernel NAT64/NAT46 (Pablo) The egress hook introduced herein complements the ingress hook added by commit e687ad60af09 ("netfilter: add netfilter ingress hook after handle_ing() under unique static key"). A patch for nftables to hook up egress rules from user space has been submitted separately, so users may immediately take advantage of the feature. Alternatively or in addition to netfilter, packets can be classified with traffic control (tc). On ingress, packets are classified first by tc, then by netfilter. On egress, the order is reversed for symmetry. Conceptually, tc and netfilter can be thought of as layers, with netfilter layered above tc. Traffic control is capable of redirecting packets to another interface (man 8 tc-mirred). E.g., an ingress packet may be redirected from the host namespace to a container via a veth connection: tc ingress (host) -> tc egress (veth host) -> tc ingress (veth container) In this case, netfilter egress classifying is not performed when leaving the host namespace! That's because the packet is still on the tc layer. If tc redirects the packet to a physical interface in the host namespace such that it leaves the system, the packet is never subjected to netfilter egress classifying. That is only logical since it hasn't passed through netfilter ingress classifying either. Packets can alternatively be redirected at the netfilter layer using nft fwd. Such a packet *is* subjected to netfilter egress classifying since it has reached the netfilter layer. Internally, the skb->nf_skip_egress flag controls whether netfilter is invoked on egress by __dev_queue_xmit(). Because __dev_queue_xmit() may be called recursively by tunnel drivers such as vxlan, the flag is reverted to false after sch_handle_egress(). This ensures that netfilter is applied both on the overlay and underlying network. Interaction between tc and netfilter is possible by setting and querying skb->mark. If netfilter egress classifying is not enabled on any interface, it is patched out of the data path by way of a static_key and doesn't make a performance difference that is discernible from noise: Before: 1537 1538 1538 1537 1538 1537 Mb/sec After: 1536 1534 1539 1539 1539 1540 Mb/sec Before + tc accept: 1418 1418 1418 1419 1419 1418 Mb/sec After + tc accept: 1419 1424 1418 1419 1422 1420 Mb/sec Before + tc drop: 1620 1619 1619 1619 1620 1620 Mb/sec After + tc drop: 1616 1624 1625 1624 1622 1619 Mb/sec When netfilter egress classifying is enabled on at least one interface, a minimal performance penalty is incurred for every egress packet, even if the interface it's transmitted over doesn't have any netfilter egress rules configured. That is caused by checking dev->nf_hooks_egress against NULL. Measurements were performed on a Core i7-3615QM. Commands to reproduce: ip link add dev foo type dummy ip link set dev foo up modprobe pktgen echo "add_device foo" > /proc/net/pktgen/kpktgend_3 samples/pktgen/pktgen_bench_xmit_mode_queue_xmit.sh -i foo -n 400000000 -m "11:11:11:11:11:11" -d 1.1.1.1 Accept all traffic with tc: tc qdisc add dev foo clsact tc filter add dev foo egress bpf da bytecode '1,6 0 0 0,' Drop all traffic with tc: tc qdisc add dev foo clsact tc filter add dev foo egress bpf da bytecode '1,6 0 0 2,' Apply this patch when measuring packet drops to avoid errors in dmesg: https://lore.kernel.org/netdev/a73dda33-57f4-95d8-ea51-ed483abd6a7a@iogearbox.net/ Signed-off-by: Lukas Wunner Cc: Laura García Liébana Cc: John Fastabend Cc: Daniel Borkmann Cc: Alexei Starovoitov Cc: Eric Dumazet Cc: Thomas Graf Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter.h b/include/uapi/linux/netfilter.h index ef9a44286e23..53411ccc69db 100644 --- a/include/uapi/linux/netfilter.h +++ b/include/uapi/linux/netfilter.h @@ -51,6 +51,7 @@ enum nf_inet_hooks { enum nf_dev_hooks { NF_NETDEV_INGRESS, + NF_NETDEV_EGRESS, NF_NETDEV_NUMHOOKS }; -- cgit v1.2.3 From e72aeb9ee0e34c57dc90793d0bf82cab9624d64e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 14 Oct 2021 10:59:18 -0700 Subject: fq_codel: implement L4S style ce_threshold_ect1 marking Add TCA_FQ_CODEL_CE_THRESHOLD_ECT1 boolean option to select Low Latency, Low Loss, Scalable Throughput (L4S) style marking, along with ce_threshold. If enabled, only packets with ECT(1) can be transformed to CE if their sojourn time is above the ce_threshold. Note that this new option does not change rules for codel law. In particular, if TCA_FQ_CODEL_ECN is left enabled (this is the default when fq_codel qdisc is created), ECT(0) packets can still get CE if codel law (as governed by limit/target) decides so. Section 4.3.b of current draft [1] states: b. A scheduler with per-flow queues such as FQ-CoDel or FQ-PIE can be used for L4S. For instance within each queue of an FQ-CoDel system, as well as a CoDel AQM, there is typically also ECN marking at an immediate (unsmoothed) shallow threshold to support use in data centres (see Sec.5.2.7 of [RFC8290]). This can be modified so that the shallow threshold is solely applied to ECT(1) packets. Then if there is a flow of non-ECN or ECT(0) packets in the per-flow-queue, the Classic AQM (e.g. CoDel) is applied; while if there is a flow of ECT(1) packets in the queue, the shallower (typically sub-millisecond) threshold is applied. Tested: tc qd replace dev eth1 root fq_codel ce_threshold_ect1 50usec netperf ... -t TCP_STREAM -- K dctcp tc -s -d qd sh dev eth1 qdisc fq_codel 8022: root refcnt 32 limit 10240p flows 1024 quantum 9212 target 5ms ce_threshold_ect1 49us interval 100ms memory_limit 32Mb ecn drop_batch 64 Sent 14388596616 bytes 9543449 pkt (dropped 0, overlimits 0 requeues 152013) backlog 0b 0p requeues 152013 maxpacket 68130 drop_overlimit 0 new_flow_count 95678 ecn_mark 0 ce_mark 7639 new_flows_len 0 old_flows_len 0 [1] L4S current draft: https://datatracker.ietf.org/doc/html/draft-ietf-tsvwg-l4s-arch Signed-off-by: Eric Dumazet Cc: Neal Cardwell Cc: Ingemar Johansson S Cc: Tom Henderson Cc: Bob Briscoe Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index ec88590b3198..6be9a84cccfa 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -840,6 +840,7 @@ enum { TCA_FQ_CODEL_CE_THRESHOLD, TCA_FQ_CODEL_DROP_BATCH_SIZE, TCA_FQ_CODEL_MEMORY_LIMIT, + TCA_FQ_CODEL_CE_THRESHOLD_ECT1, __TCA_FQ_CODEL_MAX }; -- cgit v1.2.3 From b0539f5eddc2eefd24378bda3ee9cbbca916f58d Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Sat, 16 Oct 2021 11:37:51 +0200 Subject: net/smc: add netlink support for SMC-Rv2 Implement the netlink support for SMC-Rv2 related attributes that are provided to user space. Signed-off-by: Karsten Graul Signed-off-by: David S. Miller --- include/uapi/linux/smc.h | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/smc.h b/include/uapi/linux/smc.h index b175bd0165a1..20f33b27787f 100644 --- a/include/uapi/linux/smc.h +++ b/include/uapi/linux/smc.h @@ -84,17 +84,28 @@ enum { SMC_NLA_SYS_IS_ISM_V2, /* u8 */ SMC_NLA_SYS_LOCAL_HOST, /* string */ SMC_NLA_SYS_SEID, /* string */ + SMC_NLA_SYS_IS_SMCR_V2, /* u8 */ __SMC_NLA_SYS_MAX, SMC_NLA_SYS_MAX = __SMC_NLA_SYS_MAX - 1 }; -/* SMC_NLA_LGR_V2 nested attributes */ +/* SMC_NLA_LGR_D_V2_COMMON and SMC_NLA_LGR_R_V2_COMMON nested attributes */ enum { SMC_NLA_LGR_V2_VER, /* u8 */ SMC_NLA_LGR_V2_REL, /* u8 */ SMC_NLA_LGR_V2_OS, /* u8 */ SMC_NLA_LGR_V2_NEG_EID, /* string */ SMC_NLA_LGR_V2_PEER_HOST, /* string */ + __SMC_NLA_LGR_V2_MAX, + SMC_NLA_LGR_V2_MAX = __SMC_NLA_LGR_V2_MAX - 1 +}; + +/* SMC_NLA_LGR_R_V2 nested attributes */ +enum { + SMC_NLA_LGR_R_V2_UNSPEC, + SMC_NLA_LGR_R_V2_DIRECT, /* u8 */ + __SMC_NLA_LGR_R_V2_MAX, + SMC_NLA_LGR_R_V2_MAX = __SMC_NLA_LGR_R_V2_MAX - 1 }; /* SMC_GEN_LGR_SMCR attributes */ @@ -106,6 +117,8 @@ enum { SMC_NLA_LGR_R_PNETID, /* string */ SMC_NLA_LGR_R_VLAN_ID, /* u8 */ SMC_NLA_LGR_R_CONNS_NUM, /* u32 */ + SMC_NLA_LGR_R_V2_COMMON, /* nest */ + SMC_NLA_LGR_R_V2, /* nest */ __SMC_NLA_LGR_R_MAX, SMC_NLA_LGR_R_MAX = __SMC_NLA_LGR_R_MAX - 1 }; @@ -138,7 +151,7 @@ enum { SMC_NLA_LGR_D_PNETID, /* string */ SMC_NLA_LGR_D_CHID, /* u16 */ SMC_NLA_LGR_D_PAD, /* flag */ - SMC_NLA_LGR_V2, /* nest */ + SMC_NLA_LGR_D_V2_COMMON, /* nest */ __SMC_NLA_LGR_D_MAX, SMC_NLA_LGR_D_MAX = __SMC_NLA_LGR_D_MAX - 1 }; -- cgit v1.2.3 From 7bbbbfaa7a1b0b03890f25fba5f28bb8c7ef145a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alvin=20=C5=A0ipraga?= Date: Mon, 18 Oct 2021 11:37:56 +0200 Subject: ether: add EtherType for proprietary Realtek protocols MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new EtherType ETH_P_REALTEK to the if_ether.h uapi header. The EtherType 0x8899 is used in a number of different protocols from Realtek Semiconductor Corp [1], so no general assumptions should be made when trying to decode such packets. Observed protocols include: 0x1 - Realtek Remote Control protocol [2] 0x2 - Echo protocol [2] 0x3 - Loop detection protocol [2] 0x4 - RTL8365MB 4- and 8-byte switch CPU tag protocols [3] 0x9 - RTL8306 switch CPU tag protocol [4] 0xA - RTL8366RB switch CPU tag protocol [4] [1] https://lore.kernel.org/netdev/CACRpkdYQthFgjwVzHyK3DeYUOdcYyWmdjDPG=Rf9B3VrJ12Rzg@mail.gmail.com/ [2] https://www.wireshark.org/lists/ethereal-dev/200409/msg00090.html [3] https://lore.kernel.org/netdev/20210822193145.1312668-4-alvin@pqrs.dk/ [4] https://lore.kernel.org/netdev/20200708122537.1341307-2-linus.walleij@linaro.org/ Suggested-by: Andrew Lunn Signed-off-by: Alvin Šipraga Reviewed-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- include/uapi/linux/if_ether.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index 5f589c7a8382..5da4ee234e0b 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -86,6 +86,7 @@ * over Ethernet */ #define ETH_P_PAE 0x888E /* Port Access Entity (IEEE 802.1X) */ +#define ETH_P_REALTEK 0x8899 /* Multiple proprietary protocols */ #define ETH_P_AOE 0x88A2 /* ATA over Ethernet */ #define ETH_P_8021AD 0x88A8 /* 802.1ad Service VLAN */ #define ETH_P_802_EX1 0x88B5 /* 802.1 Local Experimental 1. */ -- cgit v1.2.3 From 223f903e9c832699f4e5f422281a60756c1c6cfe Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 12 Oct 2021 09:48:38 -0700 Subject: bpf: Rename BTF_KIND_TAG to BTF_KIND_DECL_TAG Patch set [1] introduced BTF_KIND_TAG to allow tagging declarations for struct/union, struct/union field, var, func and func arguments and these tags will be encoded into dwarf. They are also encoded to btf by llvm for the bpf target. After BTF_KIND_TAG is introduced, we intended to use it for kernel __user attributes. But kernel __user is actually a type attribute. Upstream and internal discussion showed it is not a good idea to mix declaration attribute and type attribute. So we proposed to introduce btf_type_tag as a type attribute and existing btf_tag renamed to btf_decl_tag ([2]). This patch renamed BTF_KIND_TAG to BTF_KIND_DECL_TAG and some other declarations with *_tag to *_decl_tag to make it clear the tag is for declaration. In the future, BTF_KIND_TYPE_TAG might be introduced per [3]. [1] https://lore.kernel.org/bpf/20210914223004.244411-1-yhs@fb.com/ [2] https://reviews.llvm.org/D111588 [3] https://reviews.llvm.org/D111199 Fixes: b5ea834dde6b ("bpf: Support for new btf kind BTF_KIND_TAG") Fixes: 5b84bd10363e ("libbpf: Add support for BTF_KIND_TAG") Fixes: 5c07f2fec003 ("bpftool: Add support for BTF_KIND_TAG") Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20211012164838.3345699-1-yhs@fb.com --- include/uapi/linux/btf.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index 642b6ecb37d7..deb12f755f0f 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -43,7 +43,7 @@ struct btf_type { * "size" tells the size of the type it is describing. * * "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT, - * FUNC, FUNC_PROTO, VAR and TAG. + * FUNC, FUNC_PROTO, VAR and DECL_TAG. * "type" is a type_id referring to another type. */ union { @@ -74,7 +74,7 @@ enum { BTF_KIND_VAR = 14, /* Variable */ BTF_KIND_DATASEC = 15, /* Section */ BTF_KIND_FLOAT = 16, /* Floating point */ - BTF_KIND_TAG = 17, /* Tag */ + BTF_KIND_DECL_TAG = 17, /* Decl Tag */ NR_BTF_KINDS, BTF_KIND_MAX = NR_BTF_KINDS - 1, @@ -174,14 +174,14 @@ struct btf_var_secinfo { __u32 size; }; -/* BTF_KIND_TAG is followed by a single "struct btf_tag" to describe +/* BTF_KIND_DECL_TAG is followed by a single "struct btf_decl_tag" to describe * additional information related to the tag applied location. * If component_idx == -1, the tag is applied to a struct, union, * variable or function. Otherwise, it is applied to a struct/union * member or a func argument, and component_idx indicates which member * or argument (0 ... vlen-1). */ -struct btf_tag { +struct btf_decl_tag { __s32 component_idx; }; -- cgit v1.2.3 From dfcb63ce1de6b10ba98dee928f9463f37e5a5512 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Tue, 19 Oct 2021 19:47:09 +0200 Subject: fq_codel: generalise ce_threshold marking for subset of traffic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit e72aeb9ee0e3 ("fq_codel: implement L4S style ce_threshold_ect1 marking") expanded the ce_threshold feature of FQ-CoDel so it can be applied to a subset of the traffic, using the ECT(1) bit of the ECN field as the classifier. However, hard-coding ECT(1) as the only classifier for this feature seems limiting, so let's expand it to be more general. To this end, change the parameter from a ce_threshold_ect1 boolean, to a one-byte selector/mask pair (ce_threshold_{selector,mask}) which is applied to the whole diffserv/ECN field in the IP header. This makes it possible to classify packets by any value in either the ECN field or the diffserv field. In particular, setting a selector of INET_ECN_ECT_1 and a mask of INET_ECN_MASK corresponds to the functionality before this patch, and a mask of ~INET_ECN_MASK allows using the selector as a straight-forward match against a diffserv code point: # apply ce_threshold to ECT(1) traffic tc qdisc replace dev eth0 root fq_codel ce_threshold 1ms ce_threshold_selector 0x1/0x3 # apply ce_threshold to ECN-capable traffic marked as diffserv AF22 tc qdisc replace dev eth0 root fq_codel ce_threshold 1ms ce_threshold_selector 0x50/0xfc Regardless of the selector chosen, the normal rules for ECN-marking of packets still apply, i.e., the flow must still declare itself ECN-capable by setting one of the bits in the ECN field to get marked at all. v2: - Add tc usage examples to patch description Signed-off-by: Toke Høiland-Jørgensen Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20211019174709.69081-1-toke@redhat.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/pkt_sched.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 6be9a84cccfa..f292b467b27f 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -840,7 +840,8 @@ enum { TCA_FQ_CODEL_CE_THRESHOLD, TCA_FQ_CODEL_DROP_BATCH_SIZE, TCA_FQ_CODEL_MEMORY_LIMIT, - TCA_FQ_CODEL_CE_THRESHOLD_ECT1, + TCA_FQ_CODEL_CE_THRESHOLD_SELECTOR, + TCA_FQ_CODEL_CE_THRESHOLD_MASK, __TCA_FQ_CODEL_MAX }; -- cgit v1.2.3 From 1add667da24273631d4b1f5529789ec5253227f6 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Wed, 20 Oct 2021 08:11:47 +0300 Subject: nl80211: vendor-cmd: intel: add more details for IWL_MVM_VENDOR_CMD_HOST_GET_OWNERSHIP Explain more the expected flow for this command. Signed-off-by: Emmanuel Grumbach Link: https://lore.kernel.org/r/20211020051147.29297-1-emmanuel.grumbach@intel.com Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211-vnd-intel.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211-vnd-intel.h b/include/uapi/linux/nl80211-vnd-intel.h index 0bf177b84fd9..4ed7d0b24512 100644 --- a/include/uapi/linux/nl80211-vnd-intel.h +++ b/include/uapi/linux/nl80211-vnd-intel.h @@ -13,6 +13,35 @@ * enum iwl_mvm_vendor_cmd - supported vendor commands * @IWL_MVM_VENDOR_CMD_GET_CSME_CONN_INFO: reports CSME connection info. * @IWL_MVM_VENDOR_CMD_HOST_GET_OWNERSHIP: asks for ownership on the device. + * This is useful when the CSME firmware owns the device and the kernel + * wants to use it. In case the CSME firmware has no connection active the + * kernel will manage on its own to get ownership of the device. + * When the CSME firmware has an active connection, the user space + * involvement is required. The kernel will assert the RFKILL signal with + * the "device not owned" reason so that nobody can touch the device. Then + * the user space can run the following flow to be able to get connected + * to the very same AP the CSME firmware is currently connected to: + * + * 1) The user space (NetworkManager) boots and sees that the device is + * in RFKILL because the host doesn't own the device + * 2) The user space asks the kernel what AP the CSME firmware is + * connected to (with %IWL_MVM_VENDOR_CMD_GET_CSME_CONN_INFO) + * 3) The user space checks if it has a profile that matches the reply + * from the CSME firmware + * 4) The user space installs a network to the wpa_supplicant with a + * specific BSSID and a specific frequency + * 5) The user space prevents any type of full scan + * 6) The user space asks iwlmei to request ownership on the device (with + * this command) + * 7) iwlmei requests ownership from the CSME firmware + * 8) The CSME firmware grants ownership + * 9) iwlmei tells iwlwifi to lift the RFKILL + * 10) RFKILL OFF is reported to user space + * 11) The host boots the device, loads the firwmare, and connects to a + * specific BSSID without scanning including IP as fast as it can + * 12) The host reports to the CSME firmware that there is a connection + * 13) The TCP connection is preserved and the host has connectivity + * * @IWL_MVM_VENDOR_CMD_ROAMING_FORBIDDEN_EVENT: notifies if roaming is allowed. * It contains a &IWL_MVM_VENDOR_ATTR_ROAMING_FORBIDDEN and a * &IWL_MVM_VENDOR_ATTR_VIF_ADDR attributes. -- cgit v1.2.3 From 63fa04266629b9559d66c4dc18b03e0f9fc04a02 Mon Sep 17 00:00:00 2001 From: Srinivasan Raju Date: Mon, 18 Oct 2021 11:00:54 +0100 Subject: nl80211: Add LC placeholder band definition to nl80211_band Define LC band which is a draft under IEEE 802.11bb. Current NL80211_BAND_LC is a placeholder band and will be more defined IEEE 802.11bb progresses. Signed-off-by: Srinivasan Raju Link: https://lore.kernel.org/r/20211018100143.7565-2-srini.raju@purelifi.com Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index eda608b1eb09..6b816ef0155f 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4978,6 +4978,7 @@ enum nl80211_txrate_gi { * @NL80211_BAND_60GHZ: around 60 GHz band (58.32 - 69.12 GHz) * @NL80211_BAND_6GHZ: around 6 GHz band (5.9 - 7.2 GHz) * @NL80211_BAND_S1GHZ: around 900MHz, supported by S1G PHYs + * @NL80211_BAND_LC: light communication band (placeholder) * @NUM_NL80211_BANDS: number of bands, avoid using this in userspace * since newer kernel versions may support more bands */ @@ -4987,6 +4988,7 @@ enum nl80211_band { NL80211_BAND_60GHZ, NL80211_BAND_6GHZ, NL80211_BAND_S1GHZ, + NL80211_BAND_LC, NUM_NL80211_BANDS, }; -- cgit v1.2.3 From f9d366d420af4ce8719c59e60853573c02831f61 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 21 Oct 2021 17:30:39 +0200 Subject: cfg80211: fix kernel-doc for MBSSID EMA The struct member ema_max_profile_periodicity was listed with the wrong name in the kernel-doc, fix that. Link: https://lore.kernel.org/r/20211021173038.18ec2030c66b.Iac731bb299525940948adad2c41f514b7dd81c47@changeid Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 6b816ef0155f..61cab81e920d 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -7424,7 +7424,7 @@ enum nl80211_sar_specs_attrs { * @NL80211_MBSSID_CONFIG_ATTR_MAX_EMA_PROFILE_PERIODICITY: Used by the kernel * to advertise the maximum profile periodicity supported by the driver * if EMA is enabled. Driver should indicate EMA support to the userspace - * by setting wiphy->mbssid_max_ema_profile_periodicity to + * by setting wiphy->ema_max_profile_periodicity to * a non-zero value. * * @NL80211_MBSSID_CONFIG_ATTR_INDEX: Mandatory parameter to pass the index of @@ -7443,7 +7443,7 @@ enum nl80211_sar_specs_attrs { * * @NL80211_MBSSID_CONFIG_ATTR_EMA: Flag used to enable EMA AP feature. * Setting this flag is permitted only if the driver advertises EMA support - * by setting wiphy->mbssid_max_ema_profile_periodicity to non-zero. + * by setting wiphy->ema_max_profile_periodicity to non-zero. * * @__NL80211_MBSSID_CONFIG_ATTR_LAST: Internal * @NL80211_MBSSID_CONFIG_ATTR_MAX: highest attribute -- cgit v1.2.3 From 9eeb3aa33ae005526f672b394c1791578463513f Mon Sep 17 00:00:00 2001 From: Hengqi Chen Date: Thu, 21 Oct 2021 21:47:51 +0800 Subject: bpf: Add bpf_skc_to_unix_sock() helper The helper is used in tracing programs to cast a socket pointer to a unix_sock pointer. The return value could be NULL if the casting is illegal. Suggested-by: Yonghong Song Signed-off-by: Hengqi Chen Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20211021134752.1223426-2-hengqi.chen@gmail.com --- include/uapi/linux/bpf.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6fc59d61937a..22e7a3f38b9f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4909,6 +4909,12 @@ union bpf_attr { * Return * The number of bytes written to the buffer, or a negative error * in case of failure. + * + * struct unix_sock *bpf_skc_to_unix_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *unix_sock* pointer. + * Return + * *sk* if casting is valid, or **NULL** otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5089,6 +5095,7 @@ union bpf_attr { FN(task_pt_regs), \ FN(get_branch_snapshot), \ FN(trace_vprintk), \ + FN(skc_to_unix_sock), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From aba64c7da98330141dcdadd5612f088043a83696 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Wed, 20 Oct 2021 00:48:17 -0700 Subject: bpf: Add verified_insns to bpf_prog_info and fdinfo This stat is currently printed in the verifier log and not stored anywhere. To ease consumption of this data, add a field to bpf_prog_aux so it can be exposed via BPF_OBJ_GET_INFO_BY_FD and fdinfo. Signed-off-by: Dave Marchevsky Signed-off-by: Andrii Nakryiko Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20211020074818.1017682-2-davemarchevsky@fb.com --- include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 22e7a3f38b9f..c10820037883 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5620,6 +5620,7 @@ struct bpf_prog_info { __u64 run_time_ns; __u64 run_cnt; __u64 recursion_misses; + __u32 verified_insns; } __attribute__((aligned(8))); struct bpf_map_info { -- cgit v1.2.3 From 63dfe0709643528290c8a6825f278eda0e3f3c2e Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Sat, 18 Sep 2021 18:56:32 +0900 Subject: can: bittiming: allow TDC{V,O} to be zero and add can_tdc_const::tdc{v,o,f}_min ISO 11898-1 specifies in section 11.3.3 "Transmitter delay compensation" that "the configuration range for [the] SSP position shall be at least 0 to 63 minimum time quanta." Because SSP = TDCV + TDCO, it means that we should allow both TDCV and TDCO to hold zero value in order to honor SSP's minimum possible value. However, current implementation assigned special meaning to TDCV and TDCO's zero values: * TDCV = 0 -> TDCV is automatically measured by the transceiver. * TDCO = 0 -> TDC is off. In order to allow for those values to really be zero and to maintain current features, we introduce two new flags: * CAN_CTRLMODE_TDC_AUTO indicates that the controller support automatic measurement of TDCV. * CAN_CTRLMODE_TDC_MANUAL indicates that the controller support manual configuration of TDCV. N.B.: current implementation failed to provide an option for the driver to indicate that only manual mode was supported. TDC is disabled if both CAN_CTRLMODE_TDC_AUTO and CAN_CTRLMODE_TDC_MANUAL flags are off, c.f. the helper function can_tdc_is_enabled() which is also introduced in this patch. Also, this patch adds three fields: tdcv_min, tdco_min and tdcf_min to struct can_tdc_const. While we are not convinced that those three fields could be anything else than zero, we can imagine that some controllers might specify a lower bound on these. Thus, those minimums are really added "just in case". Comments of struct can_tdc and can_tdc_const are updated accordingly. Finally, the changes are applied to the etas_es58x driver. Link: https://lore.kernel.org/all/20210918095637.20108-2-mailhol.vincent@wanadoo.fr Signed-off-by: Vincent Mailhol Signed-off-by: Marc Kleine-Budde --- include/uapi/linux/can/netlink.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/can/netlink.h b/include/uapi/linux/can/netlink.h index f730d443b918..004cd09a7d49 100644 --- a/include/uapi/linux/can/netlink.h +++ b/include/uapi/linux/can/netlink.h @@ -101,6 +101,8 @@ struct can_ctrlmode { #define CAN_CTRLMODE_PRESUME_ACK 0x40 /* Ignore missing CAN ACKs */ #define CAN_CTRLMODE_FD_NON_ISO 0x80 /* CAN FD in non-ISO mode */ #define CAN_CTRLMODE_CC_LEN8_DLC 0x100 /* Classic CAN DLC option */ +#define CAN_CTRLMODE_TDC_AUTO 0x200 /* CAN transiver automatically calculates TDCV */ +#define CAN_CTRLMODE_TDC_MANUAL 0x400 /* TDCV is manually set up by user */ /* * CAN device statistics -- cgit v1.2.3 From d99755f71a80df33b981484f0d3bb956ed15a247 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Sat, 18 Sep 2021 18:56:35 +0900 Subject: can: netlink: add interface for CAN-FD Transmitter Delay Compensation (TDC) Add the netlink interface for TDC parameters of struct can_tdc_const and can_tdc. Contrary to the can_bittiming(_const) structures for which there is just a single IFLA_CAN(_DATA)_BITTMING(_CONST) entry per structure, here, we create a nested entry IFLA_CAN_TDC. Within this nested entry, additional IFLA_CAN_TDC_TDC* entries are added for each of the TDC parameters of the newly introduced struct can_tdc_const and struct can_tdc. For struct can_tdc_const, these are: IFLA_CAN_TDC_TDCV_MIN IFLA_CAN_TDC_TDCV_MAX IFLA_CAN_TDC_TDCO_MIN IFLA_CAN_TDC_TDCO_MAX IFLA_CAN_TDC_TDCF_MIN IFLA_CAN_TDC_TDCF_MAX For struct can_tdc, these are: IFLA_CAN_TDC_TDCV IFLA_CAN_TDC_TDCO IFLA_CAN_TDC_TDCF This is done so that changes can be applied in the future to the structures without breaking the netlink interface. The TDC netlink logic works as follow: * CAN_CTRLMODE_FD is not provided: - if any TDC parameters are provided: error. - TDC parameters not provided: TDC parameters unchanged. * CAN_CTRLMODE_FD is provided and is false: - TDC is deactivated: both the structure and the CAN_CTRLMODE_TDC_{AUTO,MANUAL} flags are flushed. * CAN_CTRLMODE_FD provided and is true: - CAN_CTRLMODE_TDC_{AUTO,MANUAL} and tdc{v,o,f} not provided: call can_calc_tdco() to automatically decide whether TDC should be activated and, if so, set CAN_CTRLMODE_TDC_AUTO and uses the calculated tdco value. - CAN_CTRLMODE_TDC_AUTO and tdco provided: set CAN_CTRLMODE_TDC_AUTO and use the provided tdco value. Here, tdcv is illegal and tdcf is optional. - CAN_CTRLMODE_TDC_MANUAL and both of tdcv and tdco provided: set CAN_CTRLMODE_TDC_MANUAL and use the provided tdcv and tdco value. Here, tdcf is optional. - CAN_CTRLMODE_TDC_{AUTO,MANUAL} are mutually exclusive. Whenever one flag is turned on, the other will automatically be turned off. Providing both returns an error. - Combination other than the one listed above are illegal and will return an error. N.B. above rules mean that whenever CAN_CTRLMODE_FD is provided, the previous TDC values will be overwritten. The only option to reuse previous TDC value is to not provide CAN_CTRLMODE_FD. All the new parameters are defined as u32. This arbitrary choice is done to mimic the other bittiming values with are also all of type u32. An u16 would have been sufficient to hold the TDC values. This patch completes below series (c.f. [1]): - commit 289ea9e4ae59 ("can: add new CAN FD bittiming parameters: Transmitter Delay Compensation (TDC)") - commit c25cc7993243 ("can: bittiming: add calculation for CAN FD Transmitter Delay Compensation (TDC)") [1] https://lore.kernel.org/linux-can/20210224002008.4158-1-mailhol.vincent@wanadoo.fr/T/#t Link: https://lore.kernel.org/all/20210918095637.20108-5-mailhol.vincent@wanadoo.fr Signed-off-by: Vincent Mailhol Signed-off-by: Marc Kleine-Budde --- include/uapi/linux/can/netlink.h | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/can/netlink.h b/include/uapi/linux/can/netlink.h index 004cd09a7d49..75b85c60efb2 100644 --- a/include/uapi/linux/can/netlink.h +++ b/include/uapi/linux/can/netlink.h @@ -136,10 +136,35 @@ enum { IFLA_CAN_BITRATE_CONST, IFLA_CAN_DATA_BITRATE_CONST, IFLA_CAN_BITRATE_MAX, - __IFLA_CAN_MAX + IFLA_CAN_TDC, + + /* add new constants above here */ + __IFLA_CAN_MAX, + IFLA_CAN_MAX = __IFLA_CAN_MAX - 1 }; -#define IFLA_CAN_MAX (__IFLA_CAN_MAX - 1) +/* + * CAN FD Transmitter Delay Compensation (TDC) + * + * Please refer to struct can_tdc_const and can_tdc in + * include/linux/can/bittiming.h for further details. + */ +enum { + IFLA_CAN_TDC_UNSPEC, + IFLA_CAN_TDC_TDCV_MIN, /* u32 */ + IFLA_CAN_TDC_TDCV_MAX, /* u32 */ + IFLA_CAN_TDC_TDCO_MIN, /* u32 */ + IFLA_CAN_TDC_TDCO_MAX, /* u32 */ + IFLA_CAN_TDC_TDCF_MIN, /* u32 */ + IFLA_CAN_TDC_TDCF_MAX, /* u32 */ + IFLA_CAN_TDC_TDCV, /* u32 */ + IFLA_CAN_TDC_TDCO, /* u32 */ + IFLA_CAN_TDC_TDCF, /* u32 */ + + /* add new constants above here */ + __IFLA_CAN_TDC, + IFLA_CAN_TDC_MAX = __IFLA_CAN_TDC - 1 +}; /* u16 termination range: 1..65535 Ohms */ #define CAN_TERMINATION_DISABLED 0 -- cgit v1.2.3 From 1cf4e9a6fbdbc9850216a2a6d8ed52888679a077 Mon Sep 17 00:00:00 2001 From: Luo Jie Date: Sun, 24 Oct 2021 16:27:33 +0800 Subject: net: phy: add constants for fast retrain related register Add the constants for 2.5G fast retrain capability in 10G AN control register, fast retrain status and control register and THP bypass register into mdio.h. Signed-off-by: Luo Jie Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- include/uapi/linux/mdio.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/mdio.h b/include/uapi/linux/mdio.h index bdf77dffa5a4..c54e6eae5366 100644 --- a/include/uapi/linux/mdio.h +++ b/include/uapi/linux/mdio.h @@ -53,12 +53,14 @@ #define MDIO_AN_EEE_LPABLE 61 /* EEE link partner ability */ #define MDIO_AN_EEE_ADV2 62 /* EEE advertisement 2 */ #define MDIO_AN_EEE_LPABLE2 63 /* EEE link partner ability 2 */ +#define MDIO_AN_CTRL2 64 /* AN THP bypass request control */ /* Media-dependent registers. */ #define MDIO_PMA_10GBT_SWAPPOL 130 /* 10GBASE-T pair swap & polarity */ #define MDIO_PMA_10GBT_TXPWR 131 /* 10GBASE-T TX power control */ #define MDIO_PMA_10GBT_SNR 133 /* 10GBASE-T SNR margin, lane A. * Lanes B-D are numbered 134-136. */ +#define MDIO_PMA_10GBR_FSRT_CSR 147 /* 10GBASE-R fast retrain status and control */ #define MDIO_PMA_10GBR_FECABLE 170 /* 10GBASE-R FEC ability */ #define MDIO_PCS_10GBX_STAT1 24 /* 10GBASE-X PCS status 1 */ #define MDIO_PCS_10GBRT_STAT1 32 /* 10GBASE-R/-T PCS status 1 */ @@ -239,6 +241,9 @@ #define MDIO_PMA_10GBR_FECABLE_ABLE 0x0001 /* FEC ability */ #define MDIO_PMA_10GBR_FECABLE_ERRABLE 0x0002 /* FEC error indic. ability */ +/* PMA 10GBASE-R Fast Retrain status and control register. */ +#define MDIO_PMA_10GBR_FSRT_ENABLE 0x0001 /* Fast retrain enable */ + /* PCS 10GBASE-R/-T status register 1. */ #define MDIO_PCS_10GBRT_STAT1_BLKLK 0x0001 /* Block lock attained */ @@ -247,6 +252,7 @@ #define MDIO_PCS_10GBRT_STAT2_BER 0x3f00 /* AN 10GBASE-T control register. */ +#define MDIO_AN_10GBT_CTRL_ADVFSRT2_5G 0x0020 /* Advertise 2.5GBASE-T fast retrain */ #define MDIO_AN_10GBT_CTRL_ADV2_5G 0x0080 /* Advertise 2.5GBASE-T */ #define MDIO_AN_10GBT_CTRL_ADV5G 0x0100 /* Advertise 5GBASE-T */ #define MDIO_AN_10GBT_CTRL_ADV10G 0x1000 /* Advertise 10GBASE-T */ @@ -289,6 +295,9 @@ #define MDIO_EEE_2_5GT 0x0001 /* 2.5GT EEE cap */ #define MDIO_EEE_5GT 0x0002 /* 5GT EEE cap */ +/* AN MultiGBASE-T AN control 2 */ +#define MDIO_AN_THP_BP2_5GT 0x0008 /* 2.5GT THP bypass request */ + /* 2.5G/5G Extended abilities register. */ #define MDIO_PMA_NG_EXTABLE_2_5GBT 0x0001 /* 2.5GBASET ability */ #define MDIO_PMA_NG_EXTABLE_5GBT 0x0002 /* 5GBASET ability */ -- cgit v1.2.3 From 99ce45d5e7dbde399997a630f45ac9f654fa4bcc Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Tue, 26 Oct 2021 09:57:28 +0800 Subject: mctp: Implement extended addressing This change allows an extended address struct - struct sockaddr_mctp_ext - to be passed to sendmsg/recvmsg. This allows userspace to specify output ifindex and physical address information (for sendmsg) or receive the input ifindex/physaddr for incoming messages (for recvmsg). This is typically used by userspace for MCTP address discovery and assignment operations. The extended addressing facility is conditional on a new sockopt: MCTP_OPT_ADDR_EXT; userspace must explicitly enable addressing before the kernel will consume/populate the extended address data. Includes a fix for an uninitialised var: Reported-by: kernel test robot Signed-off-by: Jeremy Kerr Signed-off-by: David S. Miller --- include/uapi/linux/mctp.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/mctp.h b/include/uapi/linux/mctp.h index 6acd4ccafbf7..07b0318716fc 100644 --- a/include/uapi/linux/mctp.h +++ b/include/uapi/linux/mctp.h @@ -11,6 +11,7 @@ #include #include +#include typedef __u8 mctp_eid_t; @@ -28,6 +29,14 @@ struct sockaddr_mctp { __u8 __smctp_pad1; }; +struct sockaddr_mctp_ext { + struct sockaddr_mctp smctp_base; + int smctp_ifindex; + __u8 smctp_halen; + __u8 __smctp_pad0[3]; + __u8 smctp_haddr[MAX_ADDR_LEN]; +}; + #define MCTP_NET_ANY 0x0 #define MCTP_ADDR_NULL 0x00 @@ -36,4 +45,6 @@ struct sockaddr_mctp { #define MCTP_TAG_MASK 0x07 #define MCTP_TAG_OWNER 0x08 +#define MCTP_OPT_ADDR_EXT 1 + #endif /* __UAPI_MCTP_H */ -- cgit v1.2.3 From 9330986c03006ab1d33d243b7cfe598a7a3c1baa Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Wed, 27 Oct 2021 16:45:00 -0700 Subject: bpf: Add bloom filter map implementation This patch adds the kernel-side changes for the implementation of a bpf bloom filter map. The bloom filter map supports peek (determining whether an element is present in the map) and push (adding an element to the map) operations.These operations are exposed to userspace applications through the already existing syscalls in the following way: BPF_MAP_LOOKUP_ELEM -> peek BPF_MAP_UPDATE_ELEM -> push The bloom filter map does not have keys, only values. In light of this, the bloom filter map's API matches that of queue stack maps: user applications use BPF_MAP_LOOKUP_ELEM/BPF_MAP_UPDATE_ELEM which correspond internally to bpf_map_peek_elem/bpf_map_push_elem, and bpf programs must use the bpf_map_peek_elem and bpf_map_push_elem APIs to query or add an element to the bloom filter map. When the bloom filter map is created, it must be created with a key_size of 0. For updates, the user will pass in the element to add to the map as the value, with a NULL key. For lookups, the user will pass in the element to query in the map as the value, with a NULL key. In the verifier layer, this requires us to modify the argument type of a bloom filter's BPF_FUNC_map_peek_elem call to ARG_PTR_TO_MAP_VALUE; as well, in the syscall layer, we need to copy over the user value so that in bpf_map_peek_elem, we know which specific value to query. A few things to please take note of: * If there are any concurrent lookups + updates, the user is responsible for synchronizing this to ensure no false negative lookups occur. * The number of hashes to use for the bloom filter is configurable from userspace. If no number is specified, the default used will be 5 hash functions. The benchmarks later in this patchset can help compare the performance of using different number of hashes on different entry sizes. In general, using more hashes decreases both the false positive rate and the speed of a lookup. * Deleting an element in the bloom filter map is not supported. * The bloom filter map may be used as an inner map. * The "max_entries" size that is specified at map creation time is used to approximate a reasonable bitmap size for the bloom filter, and is not otherwise strictly enforced. If the user wishes to insert more entries into the bloom filter than "max_entries", they may do so but they should be aware that this may lead to a higher false positive rate. Signed-off-by: Joanne Koong Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20211027234504.30744-2-joannekoong@fb.com --- include/uapi/linux/bpf.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c10820037883..8bead4aa3ad0 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -906,6 +906,7 @@ enum bpf_map_type { BPF_MAP_TYPE_RINGBUF, BPF_MAP_TYPE_INODE_STORAGE, BPF_MAP_TYPE_TASK_STORAGE, + BPF_MAP_TYPE_BLOOM_FILTER, }; /* Note that tracing related programs such as @@ -1274,6 +1275,13 @@ union bpf_attr { * struct stored as the * map value */ + /* Any per-map-type extra fields + * + * BPF_MAP_TYPE_BLOOM_FILTER - the lowest 4 bits indicate the + * number of hash functions (if 0, the bloom filter will default + * to using 5 hash functions). + */ + __u64 map_extra; }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ @@ -5638,6 +5646,7 @@ struct bpf_map_info { __u32 btf_id; __u32 btf_key_type_id; __u32 btf_value_type_id; + __u64 map_extra; } __attribute__((aligned(8))); struct bpf_btf_info { -- cgit v1.2.3 From d6aef08a872b9e23eecc92d0e92393473b13c497 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 28 Oct 2021 12:04:54 +0530 Subject: bpf: Add bpf_kallsyms_lookup_name helper This helper allows us to get the address of a kernel symbol from inside a BPF_PROG_TYPE_SYSCALL prog (used by gen_loader), so that we can relocate typeless ksym vars. Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20211028063501.2239335-2-memxor@gmail.com --- include/uapi/linux/bpf.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 8bead4aa3ad0..bd0c9f0487f6 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4923,6 +4923,21 @@ union bpf_attr { * Dynamically cast a *sk* pointer to a *unix_sock* pointer. * Return * *sk* if casting is valid, or **NULL** otherwise. + * + * long bpf_kallsyms_lookup_name(const char *name, int name_sz, int flags, u64 *res) + * Description + * Get the address of a kernel symbol, returned in *res*. *res* is + * set to 0 if the symbol is not found. + * Return + * On success, zero. On error, a negative value. + * + * **-EINVAL** if *flags* is not zero. + * + * **-EINVAL** if string *name* is not the same size as *name_sz*. + * + * **-ENOENT** if symbol is not found. + * + * **-EPERM** if caller does not have permission to obtain kernel address. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5104,6 +5119,7 @@ union bpf_attr { FN(get_branch_snapshot), \ FN(trace_vprintk), \ FN(skc_to_unix_sock), \ + FN(kallsyms_lookup_name), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 56fa95014a0447f798444e626091cbeb3176af24 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 25 Oct 2021 15:43:29 +0200 Subject: netfilter: nft_meta: add NFT_META_IFTYPE Generalize NFT_META_IIFTYPE to NFT_META_IFTYPE which allows you to match on the interface type of the skb->dev field. This field is used by the netdev family to add an implicit dependency to skip non-ethernet packets when matching on layer 3 and 4 TCP/IP header fields. For backward compatibility, add the NFT_META_IIFTYPE alias to NFT_META_IFTYPE. Add __NFT_META_IIFTYPE, to be used by userspace in the future to match specifically on the iiftype. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index e94d1fa554cb..08db4ee06ab6 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -896,7 +896,8 @@ enum nft_meta_keys { NFT_META_OIF, NFT_META_IIFNAME, NFT_META_OIFNAME, - NFT_META_IIFTYPE, + NFT_META_IFTYPE, +#define NFT_META_IIFTYPE NFT_META_IFTYPE NFT_META_OIFTYPE, NFT_META_SKUID, NFT_META_SKGID, @@ -923,6 +924,7 @@ enum nft_meta_keys { NFT_META_TIME_HOUR, NFT_META_SDIF, NFT_META_SDIFNAME, + __NFT_META_IIFTYPE, }; /** -- cgit v1.2.3 From c46b38dc8743535e686b911d253a844f0bd50ead Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 28 Oct 2021 22:15:00 +0200 Subject: netfilter: nft_payload: support for inner header matching / mangling Allow to match and mangle on inner headers / payload data after the transport header. There is a new field in the pktinfo structure that stores the inner header offset which is calculated only when requested. Only TCP and UDP supported at this stage. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 08db4ee06ab6..466fd3f4447c 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -753,11 +753,13 @@ enum nft_dynset_attributes { * @NFT_PAYLOAD_LL_HEADER: link layer header * @NFT_PAYLOAD_NETWORK_HEADER: network header * @NFT_PAYLOAD_TRANSPORT_HEADER: transport header + * @NFT_PAYLOAD_INNER_HEADER: inner header / payload */ enum nft_payload_bases { NFT_PAYLOAD_LL_HEADER, NFT_PAYLOAD_NETWORK_HEADER, NFT_PAYLOAD_TRANSPORT_HEADER, + NFT_PAYLOAD_INNER_HEADER, }; /** -- cgit v1.2.3 From b9022b53adad88fd6cf2b9718c9e498504f3e1dd Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Sun, 31 Oct 2021 16:00:02 +0000 Subject: amt: add control plane of amt interface It adds definitions and control plane code for AMT. this is very similar to udp tunneling interfaces such as gtp, vxlan, etc. In the next patch, data plane code will be added. Signed-off-by: Taehee Yoo Signed-off-by: David S. Miller --- include/uapi/linux/amt.h | 62 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 include/uapi/linux/amt.h (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/amt.h b/include/uapi/linux/amt.h new file mode 100644 index 000000000000..2dccff417195 --- /dev/null +++ b/include/uapi/linux/amt.h @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */ +/* + * Copyright (c) 2021 Taehee Yoo + */ +#ifndef _UAPI_AMT_H_ +#define _UAPI_AMT_H_ + +enum ifla_amt_mode { + /* AMT interface works as Gateway mode. + * The Gateway mode encapsulates IGMP/MLD traffic and decapsulates + * multicast traffic. + */ + AMT_MODE_GATEWAY = 0, + /* AMT interface works as Relay mode. + * The Relay mode encapsulates multicast traffic and decapsulates + * IGMP/MLD traffic. + */ + AMT_MODE_RELAY, + __AMT_MODE_MAX, +}; + +#define AMT_MODE_MAX (__AMT_MODE_MAX - 1) + +enum { + IFLA_AMT_UNSPEC, + /* This attribute specify mode etier Gateway or Relay. */ + IFLA_AMT_MODE, + /* This attribute specify Relay port. + * AMT interface is created as Gateway mode, this attribute is used + * to specify relay(remote) port. + * AMT interface is created as Relay mode, this attribute is used + * as local port. + */ + IFLA_AMT_RELAY_PORT, + /* This attribute specify Gateway port. + * AMT interface is created as Gateway mode, this attribute is used + * as local port. + * AMT interface is created as Relay mode, this attribute is not used. + */ + IFLA_AMT_GATEWAY_PORT, + /* This attribute specify physical device */ + IFLA_AMT_LINK, + /* This attribute specify local ip address */ + IFLA_AMT_LOCAL_IP, + /* This attribute specify Relay ip address. + * So, this is not used by Relay. + */ + IFLA_AMT_REMOTE_IP, + /* This attribute specify Discovery ip address. + * When Gateway get started, it send discovery message to find the + * Relay's ip address. + * So, this is not used by Relay. + */ + IFLA_AMT_DISCOVERY_IP, + /* This attribute specify number of maximum tunnel. */ + IFLA_AMT_MAX_TUNNELS, + __IFLA_AMT_MAX, +}; + +#define IFLA_AMT_MAX (__IFLA_AMT_MAX - 1) + +#endif /* _UAPI_AMT_H_ */ -- cgit v1.2.3 From 8845b4681bf44b9d2d2badf2c67cf476e42a86bd Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Fri, 29 Oct 2021 15:49:08 -0700 Subject: bpf: Add alignment padding for "map_extra" + consolidate holes This patch makes 2 changes regarding alignment padding for the "map_extra" field. 1) In the kernel header, "map_extra" and "btf_value_type_id" are rearranged to consolidate the hole. Before: struct bpf_map { ... u32 max_entries; /* 36 4 */ u32 map_flags; /* 40 4 */ /* XXX 4 bytes hole, try to pack */ u64 map_extra; /* 48 8 */ int spin_lock_off; /* 56 4 */ int timer_off; /* 60 4 */ /* --- cacheline 1 boundary (64 bytes) --- */ u32 id; /* 64 4 */ int numa_node; /* 68 4 */ ... bool frozen; /* 117 1 */ /* XXX 10 bytes hole, try to pack */ /* --- cacheline 2 boundary (128 bytes) --- */ ... struct work_struct work; /* 144 72 */ /* --- cacheline 3 boundary (192 bytes) was 24 bytes ago --- */ struct mutex freeze_mutex; /* 216 144 */ /* --- cacheline 5 boundary (320 bytes) was 40 bytes ago --- */ u64 writecnt; /* 360 8 */ /* size: 384, cachelines: 6, members: 26 */ /* sum members: 354, holes: 2, sum holes: 14 */ /* padding: 16 */ /* forced alignments: 2, forced holes: 1, sum forced holes: 10 */ } __attribute__((__aligned__(64))); After: struct bpf_map { ... u32 max_entries; /* 36 4 */ u64 map_extra; /* 40 8 */ u32 map_flags; /* 48 4 */ int spin_lock_off; /* 52 4 */ int timer_off; /* 56 4 */ u32 id; /* 60 4 */ /* --- cacheline 1 boundary (64 bytes) --- */ int numa_node; /* 64 4 */ ... bool frozen /* 113 1 */ /* XXX 14 bytes hole, try to pack */ /* --- cacheline 2 boundary (128 bytes) --- */ ... struct work_struct work; /* 144 72 */ /* --- cacheline 3 boundary (192 bytes) was 24 bytes ago --- */ struct mutex freeze_mutex; /* 216 144 */ /* --- cacheline 5 boundary (320 bytes) was 40 bytes ago --- */ u64 writecnt; /* 360 8 */ /* size: 384, cachelines: 6, members: 26 */ /* sum members: 354, holes: 1, sum holes: 14 */ /* padding: 16 */ /* forced alignments: 2, forced holes: 1, sum forced holes: 14 */ } __attribute__((__aligned__(64))); 2) Add alignment padding to the bpf_map_info struct More details can be found in commit 36f9814a494a ("bpf: fix uapi hole for 32 bit compat applications") Signed-off-by: Joanne Koong Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20211029224909.1721024-3-joannekoong@fb.com --- include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bd0c9f0487f6..ba5af15e25f5 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5662,6 +5662,7 @@ struct bpf_map_info { __u32 btf_id; __u32 btf_key_type_id; __u32 btf_value_type_id; + __u32 :32; /* alignment pad */ __u64 map_extra; } __attribute__((aligned(8))); -- cgit v1.2.3 From fcdb44d08a95003c3d040aecdee286156ec6f34e Mon Sep 17 00:00:00 2001 From: James Prestwood Date: Mon, 1 Nov 2021 10:36:28 -0700 Subject: net: arp: introduce arp_evict_nocarrier sysctl parameter This change introduces a new sysctl parameter, arp_evict_nocarrier. When set (default) the ARP cache will be cleared on a NOCARRIER event. This new option has been defaulted to '1' which maintains existing behavior. Clearing the ARP cache on NOCARRIER is relatively new, introduced by: commit 859bd2ef1fc1110a8031b967ee656c53a6260a76 Author: David Ahern Date: Thu Oct 11 20:33:49 2018 -0700 net: Evict neighbor entries on carrier down The reason for this changes is to prevent the ARP cache from being cleared when a wireless device roams. Specifically for wireless roams the ARP cache should not be cleared because the underlying network has not changed. Clearing the ARP cache in this case can introduce significant delays sending out packets after a roam. A user reported such a situation here: https://lore.kernel.org/linux-wireless/CACsRnHWa47zpx3D1oDq9JYnZWniS8yBwW1h0WAVZ6vrbwL_S0w@mail.gmail.com/ After some investigation it was found that the kernel was holding onto packets until ARP finished which resulted in this 1 second delay. It was also found that the first ARP who-has was never responded to, which is actually what caues the delay. This change is more or less working around this behavior, but again, there is no reason to clear the cache on a roam anyways. As for the unanswered who-has, we know the packet made it OTA since it was seen while monitoring. Why it never received a response is unknown. In any case, since this is a problem on the AP side of things all that can be done is to work around it until it is solved. Some background on testing/reproducing the packet delay: Hardware: - 2 access points configured for Fast BSS Transition (Though I don't see why regular reassociation wouldn't have the same behavior) - Wireless station running IWD as supplicant - A device on network able to respond to pings (I used one of the APs) Procedure: - Connect to first AP - Ping once to establish an ARP entry - Start a tcpdump - Roam to second AP - Wait for operstate UP event, and note the timestamp - Start pinging Results: Below is the tcpdump after UP. It was recorded the interface went UP at 10:42:01.432875. 10:42:01.461871 ARP, Request who-has 192.168.254.1 tell 192.168.254.71, length 28 10:42:02.497976 ARP, Request who-has 192.168.254.1 tell 192.168.254.71, length 28 10:42:02.507162 ARP, Reply 192.168.254.1 is-at ac:86:74:55:b0:20, length 46 10:42:02.507185 IP 192.168.254.71 > 192.168.254.1: ICMP echo request, id 52792, seq 1, length 64 10:42:02.507205 IP 192.168.254.71 > 192.168.254.1: ICMP echo request, id 52792, seq 2, length 64 10:42:02.507212 IP 192.168.254.71 > 192.168.254.1: ICMP echo request, id 52792, seq 3, length 64 10:42:02.507219 IP 192.168.254.71 > 192.168.254.1: ICMP echo request, id 52792, seq 4, length 64 10:42:02.507225 IP 192.168.254.71 > 192.168.254.1: ICMP echo request, id 52792, seq 5, length 64 10:42:02.507232 IP 192.168.254.71 > 192.168.254.1: ICMP echo request, id 52792, seq 6, length 64 10:42:02.515373 IP 192.168.254.1 > 192.168.254.71: ICMP echo reply, id 52792, seq 1, length 64 10:42:02.521399 IP 192.168.254.1 > 192.168.254.71: ICMP echo reply, id 52792, seq 2, length 64 10:42:02.521612 IP 192.168.254.1 > 192.168.254.71: ICMP echo reply, id 52792, seq 3, length 64 10:42:02.521941 IP 192.168.254.1 > 192.168.254.71: ICMP echo reply, id 52792, seq 4, length 64 10:42:02.522419 IP 192.168.254.1 > 192.168.254.71: ICMP echo reply, id 52792, seq 5, length 64 10:42:02.523085 IP 192.168.254.1 > 192.168.254.71: ICMP echo reply, id 52792, seq 6, length 64 You can see the first ARP who-has went out very quickly after UP, but was never responded to. Nearly a second later the kernel retries and gets a response. Only then do the ping packets go out. If an ARP entry is manually added prior to UP (after the cache is cleared) it is seen that the first ping is never responded to, so its not only an issue with ARP but with data packets in general. As mentioned prior, the wireless interface was also monitored to verify the ping/ARP packet made it OTA which was observed to be true. Signed-off-by: James Prestwood Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- include/uapi/linux/ip.h | 1 + include/uapi/linux/sysctl.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ip.h b/include/uapi/linux/ip.h index e42d13b55cf3..e00bbb9c47bb 100644 --- a/include/uapi/linux/ip.h +++ b/include/uapi/linux/ip.h @@ -169,6 +169,7 @@ enum IPV4_DEVCONF_DROP_UNICAST_IN_L2_MULTICAST, IPV4_DEVCONF_DROP_GRATUITOUS_ARP, IPV4_DEVCONF_BC_FORWARDING, + IPV4_DEVCONF_ARP_EVICT_NOCARRIER, __IPV4_DEVCONF_MAX }; diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h index 1e05d3caa712..6a3b194c50fe 100644 --- a/include/uapi/linux/sysctl.h +++ b/include/uapi/linux/sysctl.h @@ -482,6 +482,7 @@ enum NET_IPV4_CONF_PROMOTE_SECONDARIES=20, NET_IPV4_CONF_ARP_ACCEPT=21, NET_IPV4_CONF_ARP_NOTIFY=22, + NET_IPV4_CONF_ARP_EVICT_NOCARRIER=23, }; /* /proc/sys/net/ipv4/netfilter */ -- cgit v1.2.3 From 18ac597af25e9760b76471524096f5b29eb820e6 Mon Sep 17 00:00:00 2001 From: James Prestwood Date: Mon, 1 Nov 2021 10:36:29 -0700 Subject: net: ndisc: introduce ndisc_evict_nocarrier sysctl parameter In most situations the neighbor discovery cache should be cleared on a NOCARRIER event which is currently done unconditionally. But for wireless roams the neighbor discovery cache can and should remain intact since the underlying network has not changed. This patch introduces a sysctl option ndisc_evict_nocarrier which can be disabled by a wireless supplicant during a roam. This allows packets to be sent after a roam immediately without having to wait for neighbor discovery. A user reported roughly a 1 second delay after a roam before packets could be sent out (note, on IPv4). This delay was due to the ARP cache being cleared. During testing of this same scenario using IPv6 no delay was noticed, but regardless there is no reason to clear the ndisc cache for wireless roams. Signed-off-by: James Prestwood Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- include/uapi/linux/ipv6.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index b243a53fa985..d4178dace0bf 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -193,6 +193,7 @@ enum { DEVCONF_IOAM6_ENABLED, DEVCONF_IOAM6_ID, DEVCONF_IOAM6_ID_WIDE, + DEVCONF_NDISC_EVICT_NOCARRIER, DEVCONF_MAX }; -- cgit v1.2.3