From 0f9f696a502e1b01fbb137a08f56f157da9d95eb Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 8 Jan 2021 16:47:59 -0800 Subject: mptcp: add set_flags command in PM netlink This patch added a new command MPTCP_PM_CMD_SET_FLAGS in PM netlink: In mptcp_nl_cmd_set_flags, parse the input address, get the backup value according to whether the address's FLAG_BACKUP flag is set from the user-space. Then check whether this address had been added in the local address list. If it had been, then call mptcp_nl_addr_backup to deal with this address. In mptcp_nl_addr_backup, traverse all the existing msk sockets to find the relevant sockets, and call mptcp_pm_nl_mp_prio_send_ack to send out a MP_PRIO ACK packet. Finally in mptcp_nl_cmd_set_flags, set or clear the address's FLAG_BACKUP flag. Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 9762660df741..3674a451a18c 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -82,6 +82,7 @@ enum { MPTCP_PM_CMD_FLUSH_ADDRS, MPTCP_PM_CMD_SET_LIMITS, MPTCP_PM_CMD_GET_LIMITS, + MPTCP_PM_CMD_SET_FLAGS, __MPTCP_PM_CMD_AFTER_LAST }; -- cgit v1.2.3 From ad0bfc233ae2e7ee3bcb9a6089e4aa54e2b44fa1 Mon Sep 17 00:00:00 2001 From: Danilo Carvalho Date: Fri, 8 Jan 2021 22:21:04 +0000 Subject: Fix whitespace in uapi/linux/tcp.h. List of things fixed: - Two of the socket options were idented with spaces instead of tabs. - Trailing whitespace in some lines. - Improper spacing around parenthesis caught by checkpatch.pl. - Mix of space and tabs in tcp_word_hdr union. Signed-off-by: Danilo Carvalho Link: https://lore.kernel.org/r/20210108222104.2079472-1-doak@google.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/tcp.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 13ceeb395eb8..768e93bd5b51 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -51,7 +51,7 @@ struct tcphdr { fin:1; #else #error "Adjust your defines" -#endif +#endif __be16 window; __sum16 check; __be16 urg_ptr; @@ -62,14 +62,14 @@ struct tcphdr { * (union is compatible to any of its members) * This means this part of the code is -fstrict-aliasing safe now. */ -union tcp_word_hdr { +union tcp_word_hdr { struct tcphdr hdr; - __be32 words[5]; -}; + __be32 words[5]; +}; -#define tcp_flag_word(tp) ( ((union tcp_word_hdr *)(tp))->words [3]) +#define tcp_flag_word(tp) (((union tcp_word_hdr *)(tp))->words[3]) -enum { +enum { TCP_FLAG_CWR = __constant_cpu_to_be32(0x00800000), TCP_FLAG_ECE = __constant_cpu_to_be32(0x00400000), TCP_FLAG_URG = __constant_cpu_to_be32(0x00200000), @@ -80,7 +80,7 @@ enum { TCP_FLAG_FIN = __constant_cpu_to_be32(0x00010000), TCP_RESERVED_BITS = __constant_cpu_to_be32(0x0F000000), TCP_DATA_OFFSET = __constant_cpu_to_be32(0xF0000000) -}; +}; /* * TCP general constants @@ -103,8 +103,8 @@ enum { #define TCP_QUICKACK 12 /* Block/reenable quick acks */ #define TCP_CONGESTION 13 /* Congestion control algorithm */ #define TCP_MD5SIG 14 /* TCP MD5 Signature (RFC2385) */ -#define TCP_THIN_LINEAR_TIMEOUTS 16 /* Use linear timeouts for thin streams*/ -#define TCP_THIN_DUPACK 17 /* Fast retrans. after 1 dupack */ +#define TCP_THIN_LINEAR_TIMEOUTS 16 /* Use linear timeouts for thin streams*/ +#define TCP_THIN_DUPACK 17 /* Fast retrans. after 1 dupack */ #define TCP_USER_TIMEOUT 18 /* How long for loss retry before timeout */ #define TCP_REPAIR 19 /* TCP sock is under repair right now */ #define TCP_REPAIR_QUEUE 20 -- cgit v1.2.3 From c6458e72f6fd6ac7e390da0d9abe8446084886e5 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Tue, 12 Jan 2021 12:34:22 +0000 Subject: bpf: Clarify return value of probe str helpers When the buffer is too small to contain the input string, these helpers return the length of the buffer, not the length of the original string. This tries to make the docs totally clear about that, since "the length of the [copied ]string" could also refer to the length of the input. Signed-off-by: Brendan Jackman Signed-off-by: Daniel Borkmann Acked-by: KP Singh Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210112123422.2011234-1-jackmanb@google.com --- include/uapi/linux/bpf.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 77d7c1bb2923..a1ad32456f89 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2993,10 +2993,10 @@ union bpf_attr { * string length is larger than *size*, just *size*-1 bytes are * copied and the last byte is set to NUL. * - * On success, the length of the copied string is returned. This - * makes this helper useful in tracing programs for reading - * strings, and more importantly to get its length at runtime. See - * the following snippet: + * On success, returns the number of bytes that were written, + * including the terminal NUL. This makes this helper useful in + * tracing programs for reading strings, and more importantly to + * get its length at runtime. See the following snippet: * * :: * @@ -3024,7 +3024,7 @@ union bpf_attr { * **->mm->env_start**: using this helper and the return value, * one can quickly iterate at the right offset of the memory area. * Return - * On success, the strictly positive length of the string, + * On success, the strictly positive length of the output string, * including the trailing NUL character. On error, a negative * value. * -- cgit v1.2.3 From 91c960b0056672e74627776655c926388350fa30 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 14 Jan 2021 18:17:44 +0000 Subject: bpf: Rename BPF_XADD and prepare to encode other atomics in .imm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A subsequent patch will add additional atomic operations. These new operations will use the same opcode field as the existing XADD, with the immediate discriminating different operations. In preparation, rename the instruction mode BPF_ATOMIC and start calling the zero immediate BPF_ADD. This is possible (doesn't break existing valid BPF progs) because the immediate field is currently reserved MBZ and BPF_ADD is zero. All uses are removed from the tree but the BPF_XADD definition is kept around to avoid breaking builds for people including kernel headers. Signed-off-by: Brendan Jackman Signed-off-by: Alexei Starovoitov Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20210114181751.768687-5-jackmanb@google.com --- include/uapi/linux/bpf.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a1ad32456f89..6b3996343e63 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -19,7 +19,8 @@ /* ld/ldx fields */ #define BPF_DW 0x18 /* double word (64-bit) */ -#define BPF_XADD 0xc0 /* exclusive add */ +#define BPF_ATOMIC 0xc0 /* atomic memory ops - op type in immediate */ +#define BPF_XADD 0xc0 /* exclusive add - legacy name */ /* alu/jmp fields */ #define BPF_MOV 0xb0 /* mov reg to reg */ @@ -2448,7 +2449,7 @@ union bpf_attr { * running simultaneously. * * A user should care about the synchronization by himself. - * For example, by using the **BPF_STX_XADD** instruction to alter + * For example, by using the **BPF_ATOMIC** instructions to alter * the shared data. * Return * A pointer to the local storage area. -- cgit v1.2.3 From 5ca419f2864a2c60940dcf4bbaeb69546200e36f Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 14 Jan 2021 18:17:46 +0000 Subject: bpf: Add BPF_FETCH field / create atomic_fetch_add instruction The BPF_FETCH field can be set in bpf_insn.imm, for BPF_ATOMIC instructions, in order to have the previous value of the atomically-modified memory location loaded into the src register after an atomic op is carried out. Suggested-by: Yonghong Song Signed-off-by: Brendan Jackman Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210114181751.768687-7-jackmanb@google.com --- include/uapi/linux/bpf.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6b3996343e63..ea262b009049 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -44,6 +44,9 @@ #define BPF_CALL 0x80 /* function call */ #define BPF_EXIT 0x90 /* function return */ +/* atomic op type fields (stored in immediate) */ +#define BPF_FETCH 0x01 /* fetch previous value into src reg */ + /* Register numbers */ enum { BPF_REG_0 = 0, -- cgit v1.2.3 From 5ffa25502b5ab3d639829a2d1e316cff7f59a41e Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 14 Jan 2021 18:17:47 +0000 Subject: bpf: Add instructions for atomic_[cmp]xchg This adds two atomic opcodes, both of which include the BPF_FETCH flag. XCHG without the BPF_FETCH flag would naturally encode atomic_set. This is not supported because it would be of limited value to userspace (it doesn't imply any barriers). CMPXCHG without BPF_FETCH woulud be an atomic compare-and-write. We don't have such an operation in the kernel so it isn't provided to BPF either. There are two significant design decisions made for the CMPXCHG instruction: - To solve the issue that this operation fundamentally has 3 operands, but we only have two register fields. Therefore the operand we compare against (the kernel's API calls it 'old') is hard-coded to be R0. x86 has similar design (and A64 doesn't have this problem). A potential alternative might be to encode the other operand's register number in the immediate field. - The kernel's atomic_cmpxchg returns the old value, while the C11 userspace APIs return a boolean indicating the comparison result. Which should BPF do? A64 returns the old value. x86 returns the old value in the hard-coded register (and also sets a flag). That means return-old-value is easier to JIT, so that's what we use. Signed-off-by: Brendan Jackman Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210114181751.768687-8-jackmanb@google.com --- include/uapi/linux/bpf.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ea262b009049..c001766adcbc 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -45,7 +45,9 @@ #define BPF_EXIT 0x90 /* function return */ /* atomic op type fields (stored in immediate) */ -#define BPF_FETCH 0x01 /* fetch previous value into src reg */ +#define BPF_FETCH 0x01 /* not an opcode on its own, used to build others */ +#define BPF_XCHG (0xe0 | BPF_FETCH) /* atomic exchange */ +#define BPF_CMPXCHG (0xf0 | BPF_FETCH) /* atomic compare-and-write */ /* Register numbers */ enum { -- cgit v1.2.3 From 88a16a1309333e43d328621ece3e9fa37027e8eb Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 14 Jan 2021 14:40:44 +0100 Subject: perf: Add build id data in mmap2 event Adding support to carry build id data in mmap2 event. The build id data replaces maj/min/ino/ino_generation fields, which are also used to identify map's binary, so it's ok to replace them with build id data: union { struct { u32 maj; u32 min; u64 ino; u64 ino_generation; }; struct { u8 build_id_size; u8 __reserved_1; u16 __reserved_2; u8 build_id[20]; }; }; Replaced maj/min/ino/ino_generation fields give us size of 24 bytes. We use 20 bytes for build id data, 1 byte for size and rest is unused. There's new misc bit for mmap2 to signal there's build id data in it: #define PERF_RECORD_MISC_MMAP_BUILD_ID (1 << 14) Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/bpf/20210114134044.1418404-4-jolsa@kernel.org --- include/uapi/linux/perf_event.h | 42 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 37 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index b15e3447cd9f..cb6f84103560 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -386,7 +386,8 @@ struct perf_event_attr { aux_output : 1, /* generate AUX records instead of events */ cgroup : 1, /* include cgroup events */ text_poke : 1, /* include text poke events */ - __reserved_1 : 30; + build_id : 1, /* use build id in mmap2 events */ + __reserved_1 : 29; union { __u32 wakeup_events; /* wakeup every n events */ @@ -659,6 +660,22 @@ struct perf_event_mmap_page { __u64 aux_size; }; +/* + * The current state of perf_event_header::misc bits usage: + * ('|' used bit, '-' unused bit) + * + * 012 CDEF + * |||---------|||| + * + * Where: + * 0-2 CPUMODE_MASK + * + * C PROC_MAP_PARSE_TIMEOUT + * D MMAP_DATA / COMM_EXEC / FORK_EXEC / SWITCH_OUT + * E MMAP_BUILD_ID / EXACT_IP / SCHED_OUT_PREEMPT + * F (reserved) + */ + #define PERF_RECORD_MISC_CPUMODE_MASK (7 << 0) #define PERF_RECORD_MISC_CPUMODE_UNKNOWN (0 << 0) #define PERF_RECORD_MISC_KERNEL (1 << 0) @@ -690,6 +707,7 @@ struct perf_event_mmap_page { * * PERF_RECORD_MISC_EXACT_IP - PERF_RECORD_SAMPLE of precise events * PERF_RECORD_MISC_SWITCH_OUT_PREEMPT - PERF_RECORD_SWITCH* events + * PERF_RECORD_MISC_MMAP_BUILD_ID - PERF_RECORD_MMAP2 event * * * PERF_RECORD_MISC_EXACT_IP: @@ -699,9 +717,13 @@ struct perf_event_mmap_page { * * PERF_RECORD_MISC_SWITCH_OUT_PREEMPT: * Indicates that thread was preempted in TASK_RUNNING state. + * + * PERF_RECORD_MISC_MMAP_BUILD_ID: + * Indicates that mmap2 event carries build id data. */ #define PERF_RECORD_MISC_EXACT_IP (1 << 14) #define PERF_RECORD_MISC_SWITCH_OUT_PREEMPT (1 << 14) +#define PERF_RECORD_MISC_MMAP_BUILD_ID (1 << 14) /* * Reserve the last bit to indicate some extended misc field */ @@ -915,10 +937,20 @@ enum perf_event_type { * u64 addr; * u64 len; * u64 pgoff; - * u32 maj; - * u32 min; - * u64 ino; - * u64 ino_generation; + * union { + * struct { + * u32 maj; + * u32 min; + * u64 ino; + * u64 ino_generation; + * }; + * struct { + * u8 build_id_size; + * u8 __reserved_1; + * u16 __reserved_2; + * u8 build_id[20]; + * }; + * }; * u32 prot, flags; * char filename[]; * struct sample_id sample_id; -- cgit v1.2.3 From 9ab7e76aefc97a9aa664accb59d6e8dc5e52514a Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Sat, 9 Jan 2021 23:00:21 -0800 Subject: GTP: add support for flow based tunneling API Following patch add support for flow based tunneling API to send and recv GTP tunnel packet over tunnel metadata API. This would allow this device integration with OVS or eBPF using flow based tunneling APIs. Signed-off-by: Pravin B Shelar Link: https://lore.kernel.org/r/20210110070021.26822-1-pbshelar@fb.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/gtp.h | 12 ++++++++++++ include/uapi/linux/if_link.h | 1 + include/uapi/linux/if_tunnel.h | 1 + 3 files changed, 14 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/gtp.h b/include/uapi/linux/gtp.h index 79f9191bbb24..62aff78b7c56 100644 --- a/include/uapi/linux/gtp.h +++ b/include/uapi/linux/gtp.h @@ -2,6 +2,8 @@ #ifndef _UAPI_LINUX_GTP_H_ #define _UAPI_LINUX_GTP_H_ +#include + #define GTP_GENL_MCGRP_NAME "gtp" enum gtp_genl_cmds { @@ -34,4 +36,14 @@ enum gtp_attrs { }; #define GTPA_MAX (__GTPA_MAX + 1) +enum { + GTP_METADATA_V1 +}; + +struct gtpu_metadata { + __u8 ver; + __u8 flags; + __u8 type; +}; + #endif /* _UAPI_LINUX_GTP_H_ */ diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 82708c6db432..2bd0d8bbcdb2 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -809,6 +809,7 @@ enum { IFLA_GTP_FD1, IFLA_GTP_PDP_HASHSIZE, IFLA_GTP_ROLE, + IFLA_GTP_COLLECT_METADATA, __IFLA_GTP_MAX, }; #define IFLA_GTP_MAX (__IFLA_GTP_MAX - 1) diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h index 7d9105533c7b..802da679fab1 100644 --- a/include/uapi/linux/if_tunnel.h +++ b/include/uapi/linux/if_tunnel.h @@ -176,6 +176,7 @@ enum { #define TUNNEL_VXLAN_OPT __cpu_to_be16(0x1000) #define TUNNEL_NOCACHE __cpu_to_be16(0x2000) #define TUNNEL_ERSPAN_OPT __cpu_to_be16(0x4000) +#define TUNNEL_GTPU_OPT __cpu_to_be16(0x8000) #define TUNNEL_OPTIONS_PRESENT \ (TUNNEL_GENEVE_OPT | TUNNEL_VXLAN_OPT | TUNNEL_ERSPAN_OPT) -- cgit v1.2.3 From 7b8fc0103bb51d1d3e1fb5fd67958612e709f883 Mon Sep 17 00:00:00 2001 From: Jarod Wilson Date: Mon, 18 Jan 2021 20:09:27 -0500 Subject: bonding: add a vlan+srcmac tx hashing option This comes from an end-user request, where they're running multiple VMs on hosts with bonded interfaces connected to some interest switch topologies, where 802.3ad isn't an option. They're currently running a proprietary solution that effectively achieves load-balancing of VMs and bandwidth utilization improvements with a similar form of transmission algorithm. Basically, each VM has it's own vlan, so it always sends its traffic out the same interface, unless that interface fails. Traffic gets split between the interfaces, maintaining a consistent path, with failover still available if an interface goes down. Unlike bond_eth_hash(), this hash function is using the full source MAC address instead of just the last byte, as there are so few components to the hash, and in the no-vlan case, we would be returning just the last byte of the source MAC as the hash value. It's entirely possible to have two NICs in a bond with the same last byte of their MAC, but not the same MAC, so this adjustment should guarantee distinct hashes in all cases. This has been rudimetarily tested to provide similar results to the proprietary solution it is aiming to replace. A patch for iproute2 is also posted, to properly support the new mode there as well. Cc: Jay Vosburgh Cc: Veaceslav Falico Cc: Andy Gospodarek Cc: Thomas Davis Signed-off-by: Jarod Wilson Link: https://lore.kernel.org/r/20210119010927.1191922-1-jarod@redhat.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/if_bonding.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_bonding.h b/include/uapi/linux/if_bonding.h index 45f3750aa861..e8eb4ad03cf1 100644 --- a/include/uapi/linux/if_bonding.h +++ b/include/uapi/linux/if_bonding.h @@ -94,6 +94,7 @@ #define BOND_XMIT_POLICY_LAYER23 2 /* layer 2+3 (IP ^ MAC) */ #define BOND_XMIT_POLICY_ENCAP23 3 /* encapsulated layer 2+3 */ #define BOND_XMIT_POLICY_ENCAP34 4 /* encapsulated layer 3+4 */ +#define BOND_XMIT_POLICY_VLAN_SRCMAC 5 /* vlan + source MAC */ /* 802.3ad port state definitions (43.4.2.2 in the 802.3ad standard) */ #define LACP_STATE_LACP_ACTIVITY 0x1 -- cgit v1.2.3 From 7baf2429a1a965369b0ce44efb6315cdd515aa9c Mon Sep 17 00:00:00 2001 From: wenxu Date: Tue, 19 Jan 2021 16:31:50 +0800 Subject: net/sched: cls_flower add CT_FLAGS_INVALID flag support This patch add the TCA_FLOWER_KEY_CT_FLAGS_INVALID flag to match the ct_state with invalid for conntrack. Signed-off-by: wenxu Acked-by: Marcelo Ricardo Leitner Link: https://lore.kernel.org/r/1611045110-682-1-git-send-email-wenxu@ucloud.cn Signed-off-by: Jakub Kicinski --- include/uapi/linux/pkt_cls.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index ee95f42fb0ec..709668e264b0 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -591,6 +591,7 @@ enum { TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED = 1 << 1, /* Part of an existing connection. */ TCA_FLOWER_KEY_CT_FLAGS_RELATED = 1 << 2, /* Related to an established connection. */ TCA_FLOWER_KEY_CT_FLAGS_TRACKED = 1 << 3, /* Conntrack has occurred. */ + TCA_FLOWER_KEY_CT_FLAGS_INVALID = 1 << 4, /* Conntrack is invalid. */ }; enum { -- cgit v1.2.3 From b8288837ef6bdaac331752b401f5ca3b59b37430 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Fri, 11 Dec 2020 22:12:13 -0800 Subject: devlink: Introduce PCI SF port flavour and port attribute A PCI sub-function (SF) represents a portion of the device similar to PCI VF. In an eswitch, PCI SF may have port which is normally represented using a representor netdevice. To have better visibility of eswitch port, its association with SF, and its representor netdevice, introduce a PCI SF port flavour. When devlink port flavour is PCI SF, fill up PCI SF attributes of the port. Extend port name creation using PCI PF and SF number scheme on best effort basis, so that vendor drivers can skip defining their own scheme. This is done as cApfNSfM, where A, N and M are controller, PCI PF and PCI SF number respectively. This is similar to existing naming for PCI PF and PCI VF ports. An example view of a PCI SF port: $ devlink port show pci/0000:06:00.0/32768 pci/0000:06:00.0/32768: type eth netdev ens2f0npf0sf88 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:88:88 state active opstate attached $ devlink port show pci/0000:06:00.0/32768 -jp { "port": { "pci/0000:06:00.0/32768": { "type": "eth", "netdev": "ens2f0npf0sf88", "flavour": "pcisf", "controller": 0, "pfnum": 0, "sfnum": 88, "splittable": false, "function": { "hw_addr": "00:00:00:00:88:88", "state": "active", "opstate": "attached" } } } } Signed-off-by: Parav Pandit Reviewed-by: Jiri Pirko Reviewed-by: Vu Pham Signed-off-by: Saeed Mahameed --- include/uapi/linux/devlink.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index cf89c318f2ac..1a241b09a7f8 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -200,6 +200,10 @@ enum devlink_port_flavour { DEVLINK_PORT_FLAVOUR_UNUSED, /* Port which exists in the switch, but * is not used in any way. */ + DEVLINK_PORT_FLAVOUR_PCI_SF, /* Represents eswitch port + * for the PCI SF. It is an internal + * port that faces the PCI SF. + */ }; enum devlink_param_cmode { @@ -529,6 +533,7 @@ enum devlink_attr { DEVLINK_ATTR_RELOAD_ACTION_INFO, /* nested */ DEVLINK_ATTR_RELOAD_ACTION_STATS, /* nested */ + DEVLINK_ATTR_PORT_PCI_SF_NUMBER, /* u32 */ /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, -- cgit v1.2.3 From a556dded9c23c51c82654f1ebe389cbc0bc22057 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Fri, 11 Dec 2020 22:12:15 -0800 Subject: devlink: Support get and set state of port function devlink port function can be in active or inactive state. Allow users to get and set port function's state. When the port function it activated, its operational state may change after a while when the device is created and driver binds to it. Similarly on deactivation flow. To clearly describe the state of the port function and its device's operational state in the host system, define state and opstate attributes. Example of a PCI SF port which supports a port function: $ devlink dev eswitch set pci/0000:06:00.0 mode switchdev $ devlink port show pci/0000:06:00.0/65535: type eth netdev ens2f0np0 flavour physical port 0 splittable false $ devlink port add pci/0000:06:00.0 flavour pcisf pfnum 0 sfnum 88 pci/0000:08:00.0/32768: type eth netdev eth6 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached $ devlink port show pci/0000:06:00.0/32768 pci/0000:06:00.0/32768: type eth netdev ens2f0npf0sf88 flavour pcisf controller 0 pfnum 0 sfnum 88 external false splittable false function: hw_addr 00:00:00:00:88:88 state inactive opstate detached $ devlink port function set pci/0000:06:00.0/32768 hw_addr 00:00:00:00:88:88 state active $ devlink port show pci/0000:06:00.0/32768 -jp { "port": { "pci/0000:06:00.0/32768": { "type": "eth", "netdev": "ens2f0npf0sf88", "flavour": "pcisf", "controller": 0, "pfnum": 0, "sfnum": 88, "external": false, "splittable": false, "function": { "hw_addr": "00:00:00:00:88:88", "state": "active", "opstate": "attached" } } } } Signed-off-by: Parav Pandit Reviewed-by: Jiri Pirko Reviewed-by: Vu Pham Signed-off-by: Saeed Mahameed --- include/uapi/linux/devlink.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 1a241b09a7f8..f6008b2fa60f 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -583,9 +583,29 @@ enum devlink_resource_unit { enum devlink_port_function_attr { DEVLINK_PORT_FUNCTION_ATTR_UNSPEC, DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR, /* binary */ + DEVLINK_PORT_FN_ATTR_STATE, /* u8 */ + DEVLINK_PORT_FN_ATTR_OPSTATE, /* u8 */ __DEVLINK_PORT_FUNCTION_ATTR_MAX, DEVLINK_PORT_FUNCTION_ATTR_MAX = __DEVLINK_PORT_FUNCTION_ATTR_MAX - 1 }; +enum devlink_port_fn_state { + DEVLINK_PORT_FN_STATE_INACTIVE, + DEVLINK_PORT_FN_STATE_ACTIVE, +}; + +/** + * enum devlink_port_fn_opstate - indicates operational state of the function + * @DEVLINK_PORT_FN_OPSTATE_ATTACHED: Driver is attached to the function. + * For graceful tear down of the function, after inactivation of the + * function, user should wait for operational state to turn DETACHED. + * @DEVLINK_PORT_FN_OPSTATE_DETACHED: Driver is detached from the function. + * It is safe to delete the port. + */ +enum devlink_port_fn_opstate { + DEVLINK_PORT_FN_OPSTATE_DETACHED, + DEVLINK_PORT_FN_OPSTATE_ATTACHED, +}; + #endif /* _UAPI_LINUX_DEVLINK_H_ */ -- cgit v1.2.3 From e7ed11ee945438b737e2ae2370e35591e16ec371 Mon Sep 17 00:00:00 2001 From: Yousuk Seung Date: Wed, 20 Jan 2021 12:41:55 -0800 Subject: tcp: add TTL to SCM_TIMESTAMPING_OPT_STATS This patch adds TCP_NLA_TTL to SCM_TIMESTAMPING_OPT_STATS that exports the time-to-live or hop limit of the latest incoming packet with SCM_TSTAMP_ACK. The value exported may not be from the packet that acks the sequence when incoming packets are aggregated. Exporting the time-to-live or hop limit value of incoming packets helps to estimate the hop count of the path of the flow that may change over time. Signed-off-by: Yousuk Seung Signed-off-by: Eric Dumazet Signed-off-by: Neal Cardwell Link: https://lore.kernel.org/r/20210120204155.552275-1-ysseung@google.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/tcp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 768e93bd5b51..16dfa40bdac3 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -314,6 +314,7 @@ enum { TCP_NLA_TIMEOUT_REHASH, /* Timeout-triggered rehash attempts */ TCP_NLA_BYTES_NOTSENT, /* Bytes in write queue not yet sent */ TCP_NLA_EDT, /* Earliest departure time (CLOCK_MONOTONIC) */ + TCP_NLA_TTL, /* TTL or hop limit of a packet received */ }; /* for TCP_MD5SIG socket option */ -- cgit v1.2.3 From 7eeba1706eba6def15f6cb2fc7b3c3b9a2651edc Mon Sep 17 00:00:00 2001 From: Arjun Roy Date: Wed, 20 Jan 2021 16:41:48 -0800 Subject: tcp: Add receive timestamp support for receive zerocopy. tcp_recvmsg() uses the CMSG mechanism to receive control information like packet receive timestamps. This patch adds CMSG fields to struct tcp_zerocopy_receive, and provides receive timestamps if available to the user. Signed-off-by: Arjun Roy Signed-off-by: Jakub Kicinski --- include/uapi/linux/tcp.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 16dfa40bdac3..42fc5a640df4 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -354,5 +354,9 @@ struct tcp_zerocopy_receive { __u64 copybuf_address; /* in: copybuf address (small reads) */ __s32 copybuf_len; /* in/out: copybuf bytes avail/used or error */ __u32 flags; /* in: flags */ + __u64 msg_control; /* ancillary data */ + __u64 msg_controllen; + __u32 msg_flags; + /* __u32 hole; Next we must add >1 u32 otherwise length checks fail. */ }; #endif /* _UAPI_LINUX_TCP_H */ -- cgit v1.2.3 From d03b195b5aa015f6c11988b86a3625f8d5dbac52 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Tue, 19 Jan 2021 14:08:13 +0200 Subject: sch_htb: Hierarchical QoS hardware offload HTB doesn't scale well because of contention on a single lock, and it also consumes CPU. This patch adds support for offloading HTB to hardware that supports hierarchical rate limiting. In the offload mode, HTB passes control commands to the driver using ndo_setup_tc. The driver has to replicate the whole hierarchy of classes and their settings (rate, ceil) in the NIC. Every modification of the HTB tree caused by the admin results in ndo_setup_tc being called. After this setup, the HTB algorithm is done completely in the NIC. An SQ (send queue) is created for every leaf class and attached to the hierarchy, so that the NIC can calculate and obey aggregated rate limits, too. In the future, it can be changed, so that multiple SQs will back a single leaf class. ndo_select_queue is responsible for selecting the right queue that serves the traffic class of each packet. The data path works as follows: a packet is classified by clsact, the driver selects a hardware queue according to its class, and the packet is enqueued into this queue's qdisc. This solution addresses two main problems of scaling HTB: 1. Contention by flow classification. Currently the filters are attached to the HTB instance as follows: # tc filter add dev eth0 parent 1:0 protocol ip flower dst_port 80 classid 1:10 It's possible to move classification to clsact egress hook, which is thread-safe and lock-free: # tc filter add dev eth0 egress protocol ip flower dst_port 80 action skbedit priority 1:10 This way classification still happens in software, but the lock contention is eliminated, and it happens before selecting the TX queue, allowing the driver to translate the class to the corresponding hardware queue in ndo_select_queue. Note that this is already compatible with non-offloaded HTB and doesn't require changes to the kernel nor iproute2. 2. Contention by handling packets. HTB is not multi-queue, it attaches to a whole net device, and handling of all packets takes the same lock. When HTB is offloaded, it registers itself as a multi-queue qdisc, similarly to mq: HTB is attached to the netdev, and each queue has its own qdisc. Some features of HTB may be not supported by some particular hardware, for example, the maximum number of classes may be limited, the granularity of rate and ceil parameters may be different, etc. - so, the offload is not enabled by default, a new parameter is used to enable it: # tc qdisc replace dev eth0 root handle 1: htb offload Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Signed-off-by: Jakub Kicinski --- include/uapi/linux/pkt_sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 9e7c2c607845..79a699f106b1 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -434,6 +434,7 @@ enum { TCA_HTB_RATE64, TCA_HTB_CEIL64, TCA_HTB_PAD, + TCA_HTB_OFFLOAD, __TCA_HTB_MAX, }; -- cgit v1.2.3 From 6b2e04bc240fe9be9e690059f710e9f95346d34d Mon Sep 17 00:00:00 2001 From: Praveen Chaudhary Date: Mon, 25 Jan 2021 13:44:30 -0800 Subject: net: allow user to set metric on default route learned via Router Advertisement For IPv4, default route is learned via DHCPv4 and user is allowed to change metric using config etc/network/interfaces. But for IPv6, default route can be learned via RA, for which, currently a fixed metric value 1024 is used. Ideally, user should be able to configure metric on default route for IPv6 similar to IPv4. This patch adds sysctl for the same. Logs: For IPv4: Config in etc/network/interfaces: auto eth0 iface eth0 inet dhcp metric 4261413864 IPv4 Kernel Route Table: $ ip route list default via 172.21.47.1 dev eth0 metric 4261413864 FRR Table, if a static route is configured: [In real scenario, it is useful to prefer BGP learned default route over DHCPv4 default route.] Codes: K - kernel route, C - connected, S - static, R - RIP, O - OSPF, I - IS-IS, B - BGP, P - PIM, E - EIGRP, N - NHRP, T - Table, v - VNC, V - VNC-Direct, A - Babel, D - SHARP, > - selected route, * - FIB route S>* 0.0.0.0/0 [20/0] is directly connected, eth0, 00:00:03 K 0.0.0.0/0 [254/1000] via 172.21.47.1, eth0, 6d08h51m i.e. User can prefer Default Router learned via Routing Protocol in IPv4. Similar behavior is not possible for IPv6, without this fix. After fix [for IPv6]: sudo sysctl -w net.ipv6.conf.eth0.net.ipv6.conf.eth0.ra_defrtr_metric=1996489705 IP monitor: [When IPv6 RA is received] default via fe80::xx16:xxxx:feb3:ce8e dev eth0 proto ra metric 1996489705 pref high Kernel IPv6 routing table $ ip -6 route list default via fe80::be16:65ff:feb3:ce8e dev eth0 proto ra metric 1996489705 expires 21sec hoplimit 64 pref high FRR Table, if a static route is configured: [In real scenario, it is useful to prefer BGP learned default route over IPv6 RA default route.] Codes: K - kernel route, C - connected, S - static, R - RIPng, O - OSPFv3, I - IS-IS, B - BGP, N - NHRP, T - Table, v - VNC, V - VNC-Direct, A - Babel, D - SHARP, > - selected route, * - FIB route S>* ::/0 [20/0] is directly connected, eth0, 00:00:06 K ::/0 [119/1001] via fe80::xx16:xxxx:feb3:ce8e, eth0, 6d07h43m If the metric is changed later, the effect will be seen only when next IPv6 RA is received, because the default route must be fully controlled by RA msg. Below metric is changed from 1996489705 to 1996489704. $ sudo sysctl -w net.ipv6.conf.eth0.ra_defrtr_metric=1996489704 net.ipv6.conf.eth0.ra_defrtr_metric = 1996489704 IP monitor: [On next IPv6 RA msg, Kernel deletes prev route and installs new route with updated metric] Deleted default via fe80::xx16:xxxx:feb3:ce8e dev eth0 proto ra metric 1996489705 expires 3sec hoplimit 64 pref high default via fe80::xx16:xxxx:feb3:ce8e dev eth0 proto ra metric 1996489704 pref high Signed-off-by: Praveen Chaudhary Signed-off-by: Zhenggen Xu Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20210125214430.24079-1-pchaudhary@linkedin.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/ipv6.h | 1 + include/uapi/linux/sysctl.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index 13e8751bf24a..70603775fe91 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -189,6 +189,7 @@ enum { DEVCONF_ACCEPT_RA_RT_INFO_MIN_PLEN, DEVCONF_NDISC_TCLASS, DEVCONF_RPL_SEG_ENABLED, + DEVCONF_RA_DEFRTR_METRIC, DEVCONF_MAX }; diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h index 458179df9b27..1e05d3caa712 100644 --- a/include/uapi/linux/sysctl.h +++ b/include/uapi/linux/sysctl.h @@ -571,6 +571,7 @@ enum { NET_IPV6_ACCEPT_SOURCE_ROUTE=25, NET_IPV6_ACCEPT_RA_FROM_LOCAL=26, NET_IPV6_ACCEPT_RA_RT_INFO_MIN_PLEN=27, + NET_IPV6_RA_DEFRTR_METRIC=28, __NET_IPV6_MAX }; -- cgit v1.2.3 From 2dba407f994e5b0eb3b70a8cb280e014ec4a7ff3 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 26 Jan 2021 11:35:33 +0200 Subject: net: bridge: multicast: make tracked EHT hosts limit configurable Add two new port attributes which make EHT hosts limit configurable and export the current number of tracked EHT hosts: - IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT: configure/retrieve current limit - IFLA_BRPORT_MCAST_EHT_HOSTS_CNT: current number of tracked hosts Setting IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT to 0 is currently not allowed. Note that we have to increase RTNL_SLAVE_MAX_TYPE to 38 minimum, I've increased it to 40 to have space for two more future entries. v2: move br_multicast_eht_set_hosts_limit() to br_multicast_eht.c, no functional change Signed-off-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- include/uapi/linux/if_link.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 2bd0d8bbcdb2..eb8018c3a737 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -525,6 +525,8 @@ enum { IFLA_BRPORT_BACKUP_PORT, IFLA_BRPORT_MRP_RING_OPEN, IFLA_BRPORT_MRP_IN_OPEN, + IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT, + IFLA_BRPORT_MCAST_EHT_HOSTS_CNT, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) -- cgit v1.2.3 From 8c85d18ce647ac2517a1a1bb01b02648e23700e6 Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Wed, 27 Jan 2021 16:32:45 +0200 Subject: net/sched: cls_flower: Add match on the ct_state reply flag Add match on the ct_state reply flag. Example: $ tc filter add dev ens1f0_0 ingress prio 1 chain 1 proto ip flower \ ct_state +trk+est+rpl \ action mirred egress redirect dev ens1f0_1 $ tc filter add dev ens1f0_1 ingress prio 1 chain 1 proto ip flower \ ct_state +trk+est-rpl \ action mirred egress redirect dev ens1f0_0 Signed-off-by: Paul Blakey Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/uapi/linux/pkt_cls.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 709668e264b0..afe6836e44b1 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -592,6 +592,7 @@ enum { TCA_FLOWER_KEY_CT_FLAGS_RELATED = 1 << 2, /* Related to an established connection. */ TCA_FLOWER_KEY_CT_FLAGS_TRACKED = 1 << 3, /* Conntrack has occurred. */ TCA_FLOWER_KEY_CT_FLAGS_INVALID = 1 << 4, /* Conntrack is invalid. */ + TCA_FLOWER_KEY_CT_FLAGS_REPLY = 1 << 5, /* Packet is in the reply direction. */ }; enum { -- cgit v1.2.3 From 012ce4dd3102a0f4d80167de343e9d44b257c1b8 Mon Sep 17 00:00:00 2001 From: Danielle Ratson Date: Tue, 2 Feb 2021 20:06:06 +0200 Subject: ethtool: Extend link modes settings uAPI with lanes Currently, when auto negotiation is on, the user can advertise all the linkmodes which correspond to a specific speed, but does not have a similar selector for the number of lanes. This is significant when a specific speed can be achieved using different number of lanes. For example, 2x50 or 4x25. Add 'ETHTOOL_A_LINKMODES_LANES' attribute and expand 'struct ethtool_link_settings' with lanes field in order to implement a new lanes-selector that will enable the user to advertise a specific number of lanes as well. When auto negotiation is off, lanes parameter can be forced only if the driver supports it. Add a capability bit in 'struct ethtool_ops' that allows ethtool know if the driver can handle the lanes parameter when auto negotiation is off, so if it does not, an error message will be returned when trying to set lanes. Example: $ ethtool -s swp1 lanes 4 $ ethtool swp1 Settings for swp1: Supported ports: [ FIBRE ] Supported link modes: 1000baseKX/Full 10000baseKR/Full 40000baseCR4/Full 40000baseSR4/Full 40000baseLR4/Full 25000baseCR/Full 25000baseSR/Full 50000baseCR2/Full 100000baseSR4/Full 100000baseCR4/Full Supported pause frame use: Symmetric Receive-only Supports auto-negotiation: Yes Supported FEC modes: Not reported Advertised link modes: 40000baseCR4/Full 40000baseSR4/Full 40000baseLR4/Full 100000baseSR4/Full 100000baseCR4/Full Advertised pause frame use: No Advertised auto-negotiation: Yes Advertised FEC modes: Not reported Speed: Unknown! Duplex: Unknown! (255) Auto-negotiation: on Port: Direct Attach Copper PHYAD: 0 Transceiver: internal Link detected: no Signed-off-by: Danielle Ratson Signed-off-by: Jakub Kicinski --- include/uapi/linux/ethtool_netlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index e2bf36e6964b..a286635ac9b8 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -227,6 +227,7 @@ enum { ETHTOOL_A_LINKMODES_DUPLEX, /* u8 */ ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG, /* u8 */ ETHTOOL_A_LINKMODES_MASTER_SLAVE_STATE, /* u8 */ + ETHTOOL_A_LINKMODES_LANES, /* u32 */ /* add new constants above here */ __ETHTOOL_A_LINKMODES_CNT, -- cgit v1.2.3 From 49ecc587dca2754571791bebd36e9e36e2a0d973 Mon Sep 17 00:00:00 2001 From: Jonas Bonn Date: Wed, 3 Feb 2021 08:07:59 +0100 Subject: Revert "GTP: add support for flow based tunneling API" This reverts commit 9ab7e76aefc97a9aa664accb59d6e8dc5e52514a. This patch was committed without maintainer approval and despite a number of unaddressed concerns from review. There are several issues that impede the acceptance of this patch and that make a reversion of this particular instance of these changes the best way forward: i) the patch contains several logically separate changes that would be better served as smaller patches (for review purposes) ii) functionality like the handling of end markers has been introduced without further explanation iii) symmetry between the handling of GTPv0 and GTPv1 has been unnecessarily broken iv) the patchset produces 'broken' packets when extension headers are included v) there are no available userspace tools to allow for testing this functionality vi) there is an unaddressed Coverity report against the patch concering memory leakage vii) most importantly, the patch contains a large amount of superfluous churn that impedes other ongoing work with this driver This patch will be reworked into a series that aligns with other ongoing work and facilitates review. Signed-off-by: Jonas Bonn Acked-by: Harald Welte Acked-by: Pravin B Shelar Signed-off-by: Jakub Kicinski --- include/uapi/linux/gtp.h | 12 ------------ include/uapi/linux/if_link.h | 1 - include/uapi/linux/if_tunnel.h | 1 - 3 files changed, 14 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/gtp.h b/include/uapi/linux/gtp.h index 62aff78b7c56..79f9191bbb24 100644 --- a/include/uapi/linux/gtp.h +++ b/include/uapi/linux/gtp.h @@ -2,8 +2,6 @@ #ifndef _UAPI_LINUX_GTP_H_ #define _UAPI_LINUX_GTP_H_ -#include - #define GTP_GENL_MCGRP_NAME "gtp" enum gtp_genl_cmds { @@ -36,14 +34,4 @@ enum gtp_attrs { }; #define GTPA_MAX (__GTPA_MAX + 1) -enum { - GTP_METADATA_V1 -}; - -struct gtpu_metadata { - __u8 ver; - __u8 flags; - __u8 type; -}; - #endif /* _UAPI_LINUX_GTP_H_ */ diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index eb8018c3a737..91c8dda6d95d 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -811,7 +811,6 @@ enum { IFLA_GTP_FD1, IFLA_GTP_PDP_HASHSIZE, IFLA_GTP_ROLE, - IFLA_GTP_COLLECT_METADATA, __IFLA_GTP_MAX, }; #define IFLA_GTP_MAX (__IFLA_GTP_MAX - 1) diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h index 802da679fab1..7d9105533c7b 100644 --- a/include/uapi/linux/if_tunnel.h +++ b/include/uapi/linux/if_tunnel.h @@ -176,7 +176,6 @@ enum { #define TUNNEL_VXLAN_OPT __cpu_to_be16(0x1000) #define TUNNEL_NOCACHE __cpu_to_be16(0x2000) #define TUNNEL_ERSPAN_OPT __cpu_to_be16(0x4000) -#define TUNNEL_GTPU_OPT __cpu_to_be16(0x8000) #define TUNNEL_OPTIONS_PRESENT \ (TUNNEL_GENEVE_OPT | TUNNEL_VXLAN_OPT | TUNNEL_ERSPAN_OPT) -- cgit v1.2.3 From cfa55c6d47b1e75ccc4b950616e881f3fd07712e Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Fri, 1 Jan 2021 00:00:01 +0100 Subject: batman-adv: Drop publication years from copyright info The batman-adv source code was using the year of publication (to net-next) as "last" year for the copyright statement. The whole source code mentioned in the MAINTAINERS "BATMAN ADVANCED" section was handled as a single entity regarding the publishing year. This avoided having outdated (in sense of year information - not copyright holder) publishing information inside several files. But since the simple "update copyright year" commit (without other changes) in the file was not well received in the upstream kernel, the option to not have a copyright year (for initial and last publication) in the files are chosen instead. More detailed information about the years can still be retrieved from the SCM system. Signed-off-by: Sven Eckelmann Acked-by: Marek Lindner Signed-off-by: Simon Wunderlich --- include/uapi/linux/batadv_packet.h | 2 +- include/uapi/linux/batman_adv.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/batadv_packet.h b/include/uapi/linux/batadv_packet.h index 9c8604c5b5f6..ea4692c339ce 100644 --- a/include/uapi/linux/batadv_packet.h +++ b/include/uapi/linux/batadv_packet.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) */ -/* Copyright (C) 2007-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index bdb317faa1dc..35dc016c9bb4 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: MIT */ -/* Copyright (C) 2016-2020 B.A.T.M.A.N. contributors: +/* Copyright (C) B.A.T.M.A.N. contributors: * * Matthias Schiffer */ -- cgit v1.2.3 From 49fc251360a10e6bff0d886c9e3c62008a1c4caf Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Sun, 7 Feb 2021 10:22:49 +0200 Subject: rtnetlink: Add RTM_F_OFFLOAD_FAILED flag The flag indicates to user space that route offload failed. Previous patch set added the ability to emit RTM_NEWROUTE notifications whenever RTM_F_OFFLOAD/RTM_F_TRAP flags are changed, but if the offload fails there is no indication to user-space. The flag will be used in subsequent patches by netdevsim and mlxsw to indicate to user space that route offload failed, so that users will have better visibility into the offload process. Signed-off-by: Amit Cohen Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- include/uapi/linux/rtnetlink.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index b841caa4657e..91e4ca064d61 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -319,6 +319,11 @@ enum rt_scope_t { #define RTM_F_FIB_MATCH 0x2000 /* return full fib lookup match */ #define RTM_F_OFFLOAD 0x4000 /* route is offloaded */ #define RTM_F_TRAP 0x8000 /* route is trapping packets */ +#define RTM_F_OFFLOAD_FAILED 0x20000000 /* route offload failed, this value + * is chosen to avoid conflicts with + * other flags defined in + * include/uapi/linux/ipv6_route.h + */ /* Reserved table identifiers */ -- cgit v1.2.3 From 1bcc51ac0731aab1b109b2cd5c3d495f1884e5ca Mon Sep 17 00:00:00 2001 From: wenxu Date: Tue, 9 Feb 2021 14:37:49 +0800 Subject: net/sched: cls_flower: Reject invalid ct_state flags rules Reject the unsupported and invalid ct_state flags of cls flower rules. Fixes: e0ace68af2ac ("net/sched: cls_flower: Add matching on conntrack info") Signed-off-by: wenxu Reviewed-by: Marcelo Ricardo Leitner Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index ee95f42fb0ec..88f4bf0047e7 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -591,6 +591,8 @@ enum { TCA_FLOWER_KEY_CT_FLAGS_ESTABLISHED = 1 << 1, /* Part of an existing connection. */ TCA_FLOWER_KEY_CT_FLAGS_RELATED = 1 << 2, /* Related to an established connection. */ TCA_FLOWER_KEY_CT_FLAGS_TRACKED = 1 << 3, /* Conntrack has occurred. */ + + __TCA_FLOWER_KEY_CT_FLAGS_MAX, }; enum { -- cgit v1.2.3 From 9ed9e9ba2337205311398a312796c213737bac35 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 9 Feb 2021 19:36:31 -0800 Subject: bpf: Count the number of times recursion was prevented Add per-program counter for number of times recursion prevention mechanism was triggered and expose it via show_fdinfo and bpf_prog_info. Teach bpftool to print it. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210210033634.62081-7-alexei.starovoitov@gmail.com --- include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c001766adcbc..c547ad1ffe43 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4501,6 +4501,7 @@ struct bpf_prog_info { __aligned_u64 prog_tags; __u64 run_time_ns; __u64 run_cnt; + __u64 recursion_misses; } __attribute__((aligned(8))); struct bpf_map_info { -- cgit v1.2.3 From 07881ccbf40cc7893869f3f170301889ddca54ac Mon Sep 17 00:00:00 2001 From: Florent Revest Date: Wed, 10 Feb 2021 12:14:02 +0100 Subject: bpf: Be less specific about socket cookies guarantees Since "92acdc58ab11 bpf, net: Rework cookie generator as per-cpu one" socket cookies are not guaranteed to be non-decreasing. The bpf_get_socket_cookie helper descriptions are currently specifying that cookies are non-decreasing but we don't want users to rely on that. Reported-by: Daniel Borkmann Signed-off-by: Florent Revest Signed-off-by: Alexei Starovoitov Acked-by: KP Singh Link: https://lore.kernel.org/bpf/20210210111406.785541-1-revest@chromium.org --- include/uapi/linux/bpf.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c547ad1ffe43..dbf10bf08582 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1656,22 +1656,22 @@ union bpf_attr { * networking traffic statistics as it provides a global socket * identifier that can be assumed unique. * Return - * A 8-byte long non-decreasing number on success, or 0 if the - * socket field is missing inside *skb*. + * A 8-byte long unique number on success, or 0 if the socket + * field is missing inside *skb*. * * u64 bpf_get_socket_cookie(struct bpf_sock_addr *ctx) * Description * Equivalent to bpf_get_socket_cookie() helper that accepts * *skb*, but gets socket from **struct bpf_sock_addr** context. * Return - * A 8-byte long non-decreasing number. + * A 8-byte long unique number. * * u64 bpf_get_socket_cookie(struct bpf_sock_ops *ctx) * Description * Equivalent to **bpf_get_socket_cookie**\ () helper that accepts * *skb*, but gets socket from **struct bpf_sock_ops** context. * Return - * A 8-byte long non-decreasing number. + * A 8-byte long unique number. * * u32 bpf_get_socket_uid(struct sk_buff *skb) * Return -- cgit v1.2.3 From c5dbb89fc2ac013afe67b9e4fcb3743c02b567cd Mon Sep 17 00:00:00 2001 From: Florent Revest Date: Wed, 10 Feb 2021 12:14:03 +0100 Subject: bpf: Expose bpf_get_socket_cookie to tracing programs This needs a new helper that: - can work in a sleepable context (using sock_gen_cookie) - takes a struct sock pointer and checks that it's not NULL Signed-off-by: Florent Revest Signed-off-by: Alexei Starovoitov Acked-by: KP Singh Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210210111406.785541-2-revest@chromium.org --- include/uapi/linux/bpf.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index dbf10bf08582..07cc2e404291 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1673,6 +1673,14 @@ union bpf_attr { * Return * A 8-byte long unique number. * + * u64 bpf_get_socket_cookie(struct sock *sk) + * Description + * Equivalent to **bpf_get_socket_cookie**\ () helper that accepts + * *sk*, but gets socket from a BTF **struct sock**. This helper + * also works for sleepable programs. + * Return + * A 8-byte long unique number or 0 if *sk* is NULL. + * * u32 bpf_get_socket_uid(struct sk_buff *skb) * Return * The owner UID of the socket associated to *skb*. If the socket -- cgit v1.2.3 From 3c5a2fd042d0bfac71a2dfb99515723d318df47b Mon Sep 17 00:00:00 2001 From: Arjun Roy Date: Thu, 11 Feb 2021 13:21:07 -0800 Subject: tcp: Sanitize CMSG flags and reserved args in tcp_zerocopy_receive. Explicitly define reserved field and require it and any subsequent fields to be zero-valued for now. Additionally, limit the valid CMSG flags that tcp_zerocopy_receive accepts. Fixes: 7eeba1706eba ("tcp: Add receive timestamp support for receive zerocopy.") Signed-off-by: Arjun Roy Signed-off-by: Eric Dumazet Signed-off-by: Soheil Hassas Yeganeh Suggested-by: David Ahern Suggested-by: Leon Romanovsky Suggested-by: Jakub Kicinski Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/uapi/linux/tcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 42fc5a640df4..8fc09e8638b3 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -357,6 +357,6 @@ struct tcp_zerocopy_receive { __u64 msg_control; /* ancillary data */ __u64 msg_controllen; __u32 msg_flags; - /* __u32 hole; Next we must add >1 u32 otherwise length checks fail. */ + __u32 reserved; /* set to 0 for now */ }; #endif /* _UAPI_LINUX_TCP_H */ -- cgit v1.2.3 From b6db0f899a16a23f5a9ea6c8b0fafc7bbd38e03d Mon Sep 17 00:00:00 2001 From: Ben Greear Date: Thu, 4 Feb 2021 06:46:10 -0800 Subject: cfg80211/mac80211: Support disabling HE mode Allow user to disable HE mode, similar to how VHT and HT can be disabled. Useful for testing. Signed-off-by: Ben Greear Link: https://lore.kernel.org/r/20210204144610.25971-1-greearb@candelatech.com Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 40832d13c2f1..5188fe581efc 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3045,6 +3045,8 @@ enum nl80211_attrs { NL80211_ATTR_SAR_SPEC, + NL80211_ATTR_DISABLE_HE, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, -- cgit v1.2.3 From 735a48481cca453525d9199772f9c3733a47cff4 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 12 Feb 2021 10:50:23 +0100 Subject: nl80211: add documentation for HT/VHT/HE disable attributes These were missed earlier, add the necessary documentation and, while at it, clarify it. Signed-off-by: Johannes Berg Link: https://lore.kernel.org/r/20210212105023.895c3389f063.I46dea3bfc64385bc6f600c50d294007510994f8f@changeid Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 5188fe581efc..ac78da99fccd 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1963,8 +1963,15 @@ enum nl80211_commands { * @NL80211_ATTR_PROBE_RESP: Probe Response template data. Contains the entire * probe-response frame. The DA field in the 802.11 header is zero-ed out, * to be filled by the FW. - * @NL80211_ATTR_DISABLE_HT: Force HT capable interfaces to disable - * this feature. Currently, only supported in mac80211 drivers. + * @NL80211_ATTR_DISABLE_HT: Force HT capable interfaces to disable + * this feature during association. This is a flag attribute. + * Currently only supported in mac80211 drivers. + * @NL80211_ATTR_DISABLE_VHT: Force VHT capable interfaces to disable + * this feature during association. This is a flag attribute. + * Currently only supported in mac80211 drivers. + * @NL80211_ATTR_DISABLE_HE: Force HE capable interfaces to disable + * this feature during association. This is a flag attribute. + * Currently only supported in mac80211 drivers. * @NL80211_ATTR_HT_CAPABILITY_MASK: Specify which bits of the * ATTR_HT_CAPABILITY to which attention should be paid. * Currently, only mac80211 NICs support this feature. -- cgit v1.2.3 From e1850ea9bd9eca3656820b4875967d6f9c11c237 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 9 Feb 2021 14:38:19 +0100 Subject: bpf: bpf_fib_lookup return MTU value as output when looked up The BPF-helpers for FIB lookup (bpf_xdp_fib_lookup and bpf_skb_fib_lookup) can perform MTU check and return BPF_FIB_LKUP_RET_FRAG_NEEDED. The BPF-prog don't know the MTU value that caused this rejection. If the BPF-prog wants to implement PMTU (Path MTU Discovery) (rfc1191) it need to know this MTU value for the ICMP packet. Patch change lookup and result struct bpf_fib_lookup, to contain this MTU value as output via a union with 'tot_len' as this is the value used for the MTU lookup. V5: - Fixed uninit value spotted by Dan Carpenter. - Name struct output member mtu_result Reported-by: kernel test robot Reported-by: Dan Carpenter Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/161287789952.790810.13134700381067698781.stgit@firesoul --- include/uapi/linux/bpf.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 07cc2e404291..6b1f6058cccf 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2239,6 +2239,9 @@ union bpf_attr { * * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the * packet is not forwarded or needs assist from full stack * + * If lookup fails with BPF_FIB_LKUP_RET_FRAG_NEEDED, then the MTU + * was exceeded and output params->mtu_result contains the MTU. + * * long bpf_sock_hash_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) * Description * Add an entry to, or update a sockhash *map* referencing sockets. @@ -4990,9 +4993,13 @@ struct bpf_fib_lookup { __be16 sport; __be16 dport; - /* total length of packet from network header - used for MTU check */ - __u16 tot_len; + union { /* used for MTU check */ + /* input to lookup */ + __u16 tot_len; /* L3 length from network hdr (iph->tot_len) */ + /* output: MTU value */ + __u16 mtu_result; + }; /* input: L3 device index for lookup * output: device index from FIB lookup */ -- cgit v1.2.3 From 34b2021cc61642d61c3cf943d9e71925b827941b Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 9 Feb 2021 14:38:24 +0100 Subject: bpf: Add BPF-helper for MTU checking This BPF-helper bpf_check_mtu() works for both XDP and TC-BPF programs. The SKB object is complex and the skb->len value (accessible from BPF-prog) also include the length of any extra GRO/GSO segments, but without taking into account that these GRO/GSO segments get added transport (L4) and network (L3) headers before being transmitted. Thus, this BPF-helper is created such that the BPF-programmer don't need to handle these details in the BPF-prog. The API is designed to help the BPF-programmer, that want to do packet context size changes, which involves other helpers. These other helpers usually does a delta size adjustment. This helper also support a delta size (len_diff), which allow BPF-programmer to reuse arguments needed by these other helpers, and perform the MTU check prior to doing any actual size adjustment of the packet context. It is on purpose, that we allow the len adjustment to become a negative result, that will pass the MTU check. This might seem weird, but it's not this helpers responsibility to "catch" wrong len_diff adjustments. Other helpers will take care of these checks, if BPF-programmer chooses to do actual size adjustment. V14: - Improve man-page desc of len_diff. V13: - Enforce flag BPF_MTU_CHK_SEGS cannot use len_diff. V12: - Simplify segment check that calls skb_gso_validate_network_len. - Helpers should return long V9: - Use dev->hard_header_len (instead of ETH_HLEN) - Annotate with unlikely req from Daniel - Fix logic error using skb_gso_validate_network_len from Daniel V6: - Took John's advice and dropped BPF_MTU_CHK_RELAX - Returned MTU is kept at L3-level (like fib_lookup) V4: Lot of changes - ifindex 0 now use current netdev for MTU lookup - rename helper from bpf_mtu_check to bpf_check_mtu - fix bug for GSO pkt length (as skb->len is total len) - remove __bpf_len_adj_positive, simply allow negative len adj Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/161287790461.790810.3429728639563297353.stgit@firesoul --- include/uapi/linux/bpf.h | 75 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6b1f6058cccf..4c24daa43bac 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3847,6 +3847,69 @@ union bpf_attr { * Return * A pointer to a struct socket on success or NULL if the file is * not a socket. + * + * long bpf_check_mtu(void *ctx, u32 ifindex, u32 *mtu_len, s32 len_diff, u64 flags) + * Description + + * Check ctx packet size against exceeding MTU of net device (based + * on *ifindex*). This helper will likely be used in combination + * with helpers that adjust/change the packet size. + * + * The argument *len_diff* can be used for querying with a planned + * size change. This allows to check MTU prior to changing packet + * ctx. Providing an *len_diff* adjustment that is larger than the + * actual packet size (resulting in negative packet size) will in + * principle not exceed the MTU, why it is not considered a + * failure. Other BPF-helpers are needed for performing the + * planned size change, why the responsability for catch a negative + * packet size belong in those helpers. + * + * Specifying *ifindex* zero means the MTU check is performed + * against the current net device. This is practical if this isn't + * used prior to redirect. + * + * The Linux kernel route table can configure MTUs on a more + * specific per route level, which is not provided by this helper. + * For route level MTU checks use the **bpf_fib_lookup**\ () + * helper. + * + * *ctx* is either **struct xdp_md** for XDP programs or + * **struct sk_buff** for tc cls_act programs. + * + * The *flags* argument can be a combination of one or more of the + * following values: + * + * **BPF_MTU_CHK_SEGS** + * This flag will only works for *ctx* **struct sk_buff**. + * If packet context contains extra packet segment buffers + * (often knows as GSO skb), then MTU check is harder to + * check at this point, because in transmit path it is + * possible for the skb packet to get re-segmented + * (depending on net device features). This could still be + * a MTU violation, so this flag enables performing MTU + * check against segments, with a different violation + * return code to tell it apart. Check cannot use len_diff. + * + * On return *mtu_len* pointer contains the MTU value of the net + * device. Remember the net device configured MTU is the L3 size, + * which is returned here and XDP and TX length operate at L2. + * Helper take this into account for you, but remember when using + * MTU value in your BPF-code. On input *mtu_len* must be a valid + * pointer and be initialized (to zero), else verifier will reject + * BPF program. + * + * Return + * * 0 on success, and populate MTU value in *mtu_len* pointer. + * + * * < 0 if any input argument is invalid (*mtu_len* not updated) + * + * MTU violations return positive values, but also populate MTU + * value in *mtu_len* pointer, as this can be needed for + * implementing PMTU handing: + * + * * **BPF_MTU_CHK_RET_FRAG_NEEDED** + * * **BPF_MTU_CHK_RET_SEGS_TOOBIG** + * */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4012,6 +4075,7 @@ union bpf_attr { FN(ktime_get_coarse_ns), \ FN(ima_inode_hash), \ FN(sock_from_file), \ + FN(check_mtu), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -5045,6 +5109,17 @@ struct bpf_redir_neigh { }; }; +/* bpf_check_mtu flags*/ +enum bpf_check_mtu_flags { + BPF_MTU_CHK_SEGS = (1U << 0), +}; + +enum bpf_check_mtu_ret { + BPF_MTU_CHK_RET_SUCCESS, /* check and lookup successful */ + BPF_MTU_CHK_RET_FRAG_NEEDED, /* fragmentation required to fwd */ + BPF_MTU_CHK_RET_SEGS_TOOBIG, /* GSO re-segmentation needed to fwd */ +}; + enum bpf_task_fd_type { BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */ BPF_FD_TYPE_TRACEPOINT, /* tp name */ -- cgit v1.2.3 From b911c97c7dc771633c68ea9b8f15070f8af3d323 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 12 Feb 2021 16:00:01 -0800 Subject: mptcp: add netlink event support Allow userspace (mptcpd) to subscribe to mptcp genl multicast events. This implementation reuses the same event API as the mptcp kernel fork to ease integration of existing tools, e.g. mptcpd. Supported events include: 1. start and close of an mptcp connection 2. start and close of subflows (joins) 3. announce and withdrawals of addresses 4. subflow priority (backup/non-backup) change. Reviewed-by: Matthieu Baerts Signed-off-by: Florian Westphal Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/uapi/linux/mptcp.h | 74 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 3674a451a18c..c91578aaab32 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -36,6 +36,7 @@ enum { /* netlink interface */ #define MPTCP_PM_NAME "mptcp_pm" #define MPTCP_PM_CMD_GRP_NAME "mptcp_pm_cmds" +#define MPTCP_PM_EV_GRP_NAME "mptcp_pm_events" #define MPTCP_PM_VER 0x1 /* @@ -104,4 +105,77 @@ struct mptcp_info { __u64 mptcpi_rcv_nxt; }; +/* + * MPTCP_EVENT_CREATED: token, family, saddr4 | saddr6, daddr4 | daddr6, + * sport, dport + * A new MPTCP connection has been created. It is the good time to allocate + * memory and send ADD_ADDR if needed. Depending on the traffic-patterns + * it can take a long time until the MPTCP_EVENT_ESTABLISHED is sent. + * + * MPTCP_EVENT_ESTABLISHED: token, family, saddr4 | saddr6, daddr4 | daddr6, + * sport, dport + * A MPTCP connection is established (can start new subflows). + * + * MPTCP_EVENT_CLOSED: token + * A MPTCP connection has stopped. + * + * MPTCP_EVENT_ANNOUNCED: token, rem_id, family, daddr4 | daddr6 [, dport] + * A new address has been announced by the peer. + * + * MPTCP_EVENT_REMOVED: token, rem_id + * An address has been lost by the peer. + * + * MPTCP_EVENT_SUB_ESTABLISHED: token, family, saddr4 | saddr6, + * daddr4 | daddr6, sport, dport, backup, + * if_idx [, error] + * A new subflow has been established. 'error' should not be set. + * + * MPTCP_EVENT_SUB_CLOSED: token, family, saddr4 | saddr6, daddr4 | daddr6, + * sport, dport, backup, if_idx [, error] + * A subflow has been closed. An error (copy of sk_err) could be set if an + * error has been detected for this subflow. + * + * MPTCP_EVENT_SUB_PRIORITY: token, family, saddr4 | saddr6, daddr4 | daddr6, + * sport, dport, backup, if_idx [, error] + * The priority of a subflow has changed. 'error' should not be set. + */ +enum mptcp_event_type { + MPTCP_EVENT_UNSPEC = 0, + MPTCP_EVENT_CREATED = 1, + MPTCP_EVENT_ESTABLISHED = 2, + MPTCP_EVENT_CLOSED = 3, + + MPTCP_EVENT_ANNOUNCED = 6, + MPTCP_EVENT_REMOVED = 7, + + MPTCP_EVENT_SUB_ESTABLISHED = 10, + MPTCP_EVENT_SUB_CLOSED = 11, + + MPTCP_EVENT_SUB_PRIORITY = 13, +}; + +enum mptcp_event_attr { + MPTCP_ATTR_UNSPEC = 0, + + MPTCP_ATTR_TOKEN, /* u32 */ + MPTCP_ATTR_FAMILY, /* u16 */ + MPTCP_ATTR_LOC_ID, /* u8 */ + MPTCP_ATTR_REM_ID, /* u8 */ + MPTCP_ATTR_SADDR4, /* be32 */ + MPTCP_ATTR_SADDR6, /* struct in6_addr */ + MPTCP_ATTR_DADDR4, /* be32 */ + MPTCP_ATTR_DADDR6, /* struct in6_addr */ + MPTCP_ATTR_SPORT, /* be16 */ + MPTCP_ATTR_DPORT, /* be16 */ + MPTCP_ATTR_BACKUP, /* u8 */ + MPTCP_ATTR_ERROR, /* u8 */ + MPTCP_ATTR_FLAGS, /* u16 */ + MPTCP_ATTR_TIMEOUT, /* u32 */ + MPTCP_ATTR_IF_IDX, /* s32 */ + + __MPTCP_ATTR_AFTER_LAST +}; + +#define MPTCP_ATTR_MAX (__MPTCP_ATTR_AFTER_LAST - 1) + #endif /* _UAPI_MPTCP_H */ -- cgit v1.2.3 From 6001a930ce0378b62210d4f83583fc88a903d89d Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 15 Feb 2021 12:28:07 +0100 Subject: netfilter: nftables: introduce table ownership A userspace daemon like firewalld might need to monitor for netlink updates to detect its ruleset removal by the (global) flush ruleset command to ensure ruleset persistency. This adds extra complexity from userspace and, for some little time, the firewall policy is not in place. This patch adds the NFT_TABLE_F_OWNER flag which allows a userspace program to own the table that creates in exclusivity. Tables that are owned... - can only be updated and removed by the owner, non-owners hit EPERM if they try to update it or remove it. - are destroyed when the owner closes the netlink socket or the process is gone (implicit netlink socket closure). - are skipped by the global flush ruleset command. - are listed in the global ruleset. The userspace process that sets on the NFT_TABLE_F_OWNER flag need to leave open the netlink socket. A new NFTA_TABLE_OWNER netlink attribute specifies the netlink port ID to identify the owner from userspace. This patch also updates error reporting when an unknown table flag is specified to change it from EINVAL to EOPNOTSUPP given that EINVAL is usually reserved to report for malformed netlink messages to userspace. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index b1633e7ba529..79bab7a36b30 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -164,7 +164,10 @@ enum nft_hook_attributes { */ enum nft_table_flags { NFT_TABLE_F_DORMANT = 0x1, + NFT_TABLE_F_OWNER = 0x2, }; +#define NFT_TABLE_F_MASK (NFT_TABLE_F_DORMANT | \ + NFT_TABLE_F_OWNER) /** * enum nft_table_attributes - nf_tables table netlink attributes @@ -173,6 +176,7 @@ enum nft_table_flags { * @NFTA_TABLE_FLAGS: bitmask of enum nft_table_flags (NLA_U32) * @NFTA_TABLE_USE: number of chains in this table (NLA_U32) * @NFTA_TABLE_USERDATA: user data (NLA_BINARY) + * @NFTA_TABLE_OWNER: owner of this table through netlink portID (NLA_U32) */ enum nft_table_attributes { NFTA_TABLE_UNSPEC, @@ -182,6 +186,7 @@ enum nft_table_attributes { NFTA_TABLE_HANDLE, NFTA_TABLE_PAD, NFTA_TABLE_USERDATA, + NFTA_TABLE_OWNER, __NFTA_TABLE_MAX }; #define NFTA_TABLE_MAX (__NFTA_TABLE_MAX - 1) -- cgit v1.2.3 From 0caf3ada24e4623d4b2c938a5b6d2d09e4ccee18 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 12 Feb 2021 16:52:02 -0800 Subject: mptcp: add local addr info in mptcp_info Add mptcpi_local_addr_used and mptcpi_local_addr_max in struct mptcp_info. Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/uapi/linux/mptcp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index c91578aaab32..e1172c1ffdfd 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -103,6 +103,8 @@ struct mptcp_info { __u64 mptcpi_write_seq; __u64 mptcpi_snd_una; __u64 mptcpi_rcv_nxt; + __u8 mptcpi_local_addr_used; + __u8 mptcpi_local_addr_max; }; /* -- cgit v1.2.3