From 47316f4a305367794fc04f23e5c778678d8f1d8e Mon Sep 17 00:00:00 2001 From: Zvi Effron Date: Wed, 7 Jul 2021 22:16:55 +0000 Subject: bpf: Support input xdp_md context in BPF_PROG_TEST_RUN Support passing a xdp_md via ctx_in/ctx_out in bpf_attr for BPF_PROG_TEST_RUN. The intended use case is to pass some XDP meta data to the test runs of XDP programs that are used as tail calls. For programs that use bpf_prog_test_run_xdp, support xdp_md input and output. Unlike with an actual xdp_md during a non-test run, data_meta must be 0 because it must point to the start of the provided user data. From the initial xdp_md, use data and data_end to adjust the pointers in the generated xdp_buff. All other non-zero fields are prohibited (with EINVAL). If the user has set ctx_out/ctx_size_out, copy the (potentially different) xdp_md back to the userspace. We require all fields of input xdp_md except the ones we explicitly support to be set to zero. The expectation is that in the future we might add support for more fields and we want to fail explicitly if the user runs the program on the kernel where we don't yet support them. Co-developed-by: Cody Haas Co-developed-by: Lisa Watanabe Signed-off-by: Cody Haas Signed-off-by: Lisa Watanabe Signed-off-by: Zvi Effron Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210707221657.3985075-3-zeffron@riotgames.com --- include/uapi/linux/bpf.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bf9252c7381e..b46a383e8db7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -324,9 +324,6 @@ union bpf_iter_link_info { * **BPF_PROG_TYPE_SK_LOOKUP** * *data_in* and *data_out* must be NULL. * - * **BPF_PROG_TYPE_XDP** - * *ctx_in* and *ctx_out* must be NULL. - * * **BPF_PROG_TYPE_RAW_TRACEPOINT**, * **BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE** * -- cgit v1.2.3 From f170acda7ffaf0473d06e1e17c12cd9fd63904f5 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 14 Jul 2021 21:43:17 +0900 Subject: bpf: Fix a typo of reuseport map in bpf.h. Fix s/BPF_MAP_TYPE_REUSEPORT_ARRAY/BPF_MAP_TYPE_REUSEPORT_SOCKARRAY/ typo in bpf.h. Fixes: 2dbb9b9e6df6 ("bpf: Introduce BPF_PROG_TYPE_SK_REUSEPORT") Signed-off-by: Kuniyuki Iwashima Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210714124317.67526-1-kuniyu@amazon.co.jp --- include/uapi/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b46a383e8db7..bafb6282032b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3246,7 +3246,7 @@ union bpf_attr { * long bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) * Description * Select a **SO_REUSEPORT** socket from a - * **BPF_MAP_TYPE_REUSEPORT_ARRAY** *map*. + * **BPF_MAP_TYPE_REUSEPORT_SOCKARRAY** *map*. * It checks the selected socket is matching the incoming * request in the socket buffer. * Return -- cgit v1.2.3 From b00628b1c7d595ae5b544e059c27b1f5828314b4 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 14 Jul 2021 17:54:09 -0700 Subject: bpf: Introduce bpf timers. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce 'struct bpf_timer { __u64 :64; __u64 :64; };' that can be embedded in hash/array/lru maps as a regular field and helpers to operate on it: // Initialize the timer. // First 4 bits of 'flags' specify clockid. // Only CLOCK_MONOTONIC, CLOCK_REALTIME, CLOCK_BOOTTIME are allowed. long bpf_timer_init(struct bpf_timer *timer, struct bpf_map *map, int flags); // Configure the timer to call 'callback_fn' static function. long bpf_timer_set_callback(struct bpf_timer *timer, void *callback_fn); // Arm the timer to expire 'nsec' nanoseconds from the current time. long bpf_timer_start(struct bpf_timer *timer, u64 nsec, u64 flags); // Cancel the timer and wait for callback_fn to finish if it was running. long bpf_timer_cancel(struct bpf_timer *timer); Here is how BPF program might look like: struct map_elem { int counter; struct bpf_timer timer; }; struct { __uint(type, BPF_MAP_TYPE_HASH); __uint(max_entries, 1000); __type(key, int); __type(value, struct map_elem); } hmap SEC(".maps"); static int timer_cb(void *map, int *key, struct map_elem *val); /* val points to particular map element that contains bpf_timer. */ SEC("fentry/bpf_fentry_test1") int BPF_PROG(test1, int a) { struct map_elem *val; int key = 0; val = bpf_map_lookup_elem(&hmap, &key); if (val) { bpf_timer_init(&val->timer, &hmap, CLOCK_REALTIME); bpf_timer_set_callback(&val->timer, timer_cb); bpf_timer_start(&val->timer, 1000 /* call timer_cb2 in 1 usec */, 0); } } This patch adds helper implementations that rely on hrtimers to call bpf functions as timers expire. The following patches add necessary safety checks. Only programs with CAP_BPF are allowed to use bpf_timer. The amount of timers used by the program is constrained by the memcg recorded at map creation time. The bpf_timer_init() helper needs explicit 'map' argument because inner maps are dynamic and not known at load time. While the bpf_timer_set_callback() is receiving hidden 'aux->prog' argument supplied by the verifier. The prog pointer is needed to do refcnting of bpf program to make sure that program doesn't get freed while the timer is armed. This approach relies on "user refcnt" scheme used in prog_array that stores bpf programs for bpf_tail_call. The bpf_timer_set_callback() will increment the prog refcnt which is paired with bpf_timer_cancel() that will drop the prog refcnt. The ops->map_release_uref is responsible for cancelling the timers and dropping prog refcnt when user space reference to a map reaches zero. This uref approach is done to make sure that Ctrl-C of user space process will not leave timers running forever unless the user space explicitly pinned a map that contained timers in bpffs. bpf_timer_init() and bpf_timer_set_callback() will return -EPERM if map doesn't have user references (is not held by open file descriptor from user space and not pinned in bpffs). The bpf_map_delete_elem() and bpf_map_update_elem() operations cancel and free the timer if given map element had it allocated. "bpftool map update" command can be used to cancel timers. The 'struct bpf_timer' is explicitly __attribute__((aligned(8))) because '__u64 :64' has 1 byte alignment of 8 byte padding. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Acked-by: Andrii Nakryiko Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210715005417.78572-4-alexei.starovoitov@gmail.com --- include/uapi/linux/bpf.h | 73 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bafb6282032b..3544ec5234f0 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4777,6 +4777,70 @@ union bpf_attr { * Execute close syscall for given FD. * Return * A syscall result. + * + * long bpf_timer_init(struct bpf_timer *timer, struct bpf_map *map, u64 flags) + * Description + * Initialize the timer. + * First 4 bits of *flags* specify clockid. + * Only CLOCK_MONOTONIC, CLOCK_REALTIME, CLOCK_BOOTTIME are allowed. + * All other bits of *flags* are reserved. + * The verifier will reject the program if *timer* is not from + * the same *map*. + * Return + * 0 on success. + * **-EBUSY** if *timer* is already initialized. + * **-EINVAL** if invalid *flags* are passed. + * **-EPERM** if *timer* is in a map that doesn't have any user references. + * The user space should either hold a file descriptor to a map with timers + * or pin such map in bpffs. When map is unpinned or file descriptor is + * closed all timers in the map will be cancelled and freed. + * + * long bpf_timer_set_callback(struct bpf_timer *timer, void *callback_fn) + * Description + * Configure the timer to call *callback_fn* static function. + * Return + * 0 on success. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier. + * **-EPERM** if *timer* is in a map that doesn't have any user references. + * The user space should either hold a file descriptor to a map with timers + * or pin such map in bpffs. When map is unpinned or file descriptor is + * closed all timers in the map will be cancelled and freed. + * + * long bpf_timer_start(struct bpf_timer *timer, u64 nsecs, u64 flags) + * Description + * Set timer expiration N nanoseconds from the current time. The + * configured callback will be invoked in soft irq context on some cpu + * and will not repeat unless another bpf_timer_start() is made. + * In such case the next invocation can migrate to a different cpu. + * Since struct bpf_timer is a field inside map element the map + * owns the timer. The bpf_timer_set_callback() will increment refcnt + * of BPF program to make sure that callback_fn code stays valid. + * When user space reference to a map reaches zero all timers + * in a map are cancelled and corresponding program's refcnts are + * decremented. This is done to make sure that Ctrl-C of a user + * process doesn't leave any timers running. If map is pinned in + * bpffs the callback_fn can re-arm itself indefinitely. + * bpf_map_update/delete_elem() helpers and user space sys_bpf commands + * cancel and free the timer in the given map element. + * The map can contain timers that invoke callback_fn-s from different + * programs. The same callback_fn can serve different timers from + * different maps if key/value layout matches across maps. + * Every bpf_timer_set_callback() can have different callback_fn. + * + * Return + * 0 on success. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier + * or invalid *flags* are passed. + * + * long bpf_timer_cancel(struct bpf_timer *timer) + * Description + * Cancel the timer and wait for callback_fn to finish if it was running. + * Return + * 0 if the timer was not active. + * 1 if the timer was active. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier. + * **-EDEADLK** if callback_fn tried to call bpf_timer_cancel() on its + * own timer which would have led to a deadlock otherwise. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4948,6 +5012,10 @@ union bpf_attr { FN(sys_bpf), \ FN(btf_find_by_name_kind), \ FN(sys_close), \ + FN(timer_init), \ + FN(timer_set_callback), \ + FN(timer_start), \ + FN(timer_cancel), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -6074,6 +6142,11 @@ struct bpf_spin_lock { __u32 val; }; +struct bpf_timer { + __u64 :64; + __u64 :64; +} __attribute__((aligned(8))); + struct bpf_sysctl { __u32 write; /* Sysctl is being read (= 0) or written (= 1). * Allows 1,2,4-byte read, but no write. -- cgit v1.2.3 From 9b99edcae5c80c8fb9f8e7149bae528c9e610a72 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 14 Jul 2021 11:43:55 +0200 Subject: bpf: Add bpf_get_func_ip helper for tracing programs Adding bpf_get_func_ip helper for BPF_PROG_TYPE_TRACING programs, specifically for all trampoline attach types. The trampoline's caller IP address is stored in (ctx - 8) address. so there's no reason to actually call the helper, but rather fixup the call instruction and return [ctx - 8] value directly. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210714094400.396467-4-jolsa@kernel.org --- include/uapi/linux/bpf.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3544ec5234f0..89688f16ad60 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4841,6 +4841,12 @@ union bpf_attr { * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier. * **-EDEADLK** if callback_fn tried to call bpf_timer_cancel() on its * own timer which would have led to a deadlock otherwise. + * + * u64 bpf_get_func_ip(void *ctx) + * Description + * Get address of the traced function (for tracing programs). + * Return + * Address of the traced function. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5016,6 +5022,7 @@ union bpf_attr { FN(timer_set_callback), \ FN(timer_start), \ FN(timer_cancel), \ + FN(get_func_ip), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 9ffd9f3ff7193933dae171740ab70a103d460065 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 14 Jul 2021 11:43:56 +0200 Subject: bpf: Add bpf_get_func_ip helper for kprobe programs Adding bpf_get_func_ip helper for BPF_PROG_TYPE_KPROBE programs, so it's now possible to call bpf_get_func_ip from both kprobe and kretprobe programs. Taking the caller's address from 'struct kprobe::addr', which is defined for both kprobe and kretprobe. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Reviewed-by: Masami Hiramatsu Link: https://lore.kernel.org/bpf/20210714094400.396467-5-jolsa@kernel.org --- include/uapi/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 89688f16ad60..2db6925e04f4 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4844,7 +4844,7 @@ union bpf_attr { * * u64 bpf_get_func_ip(void *ctx) * Description - * Get address of the traced function (for tracing programs). + * Get address of the traced function (for tracing and kprobe programs). * Return * Address of the traced function. */ -- cgit v1.2.3 From b83d23a2a38b1770da0491257ae81d52307f7816 Mon Sep 17 00:00:00 2001 From: Mark Gray Date: Thu, 15 Jul 2021 08:27:54 -0400 Subject: openvswitch: Introduce per-cpu upcall dispatch The Open vSwitch kernel module uses the upcall mechanism to send packets from kernel space to user space when it misses in the kernel space flow table. The upcall sends packets via a Netlink socket. Currently, a Netlink socket is created for every vport. In this way, there is a 1:1 mapping between a vport and a Netlink socket. When a packet is received by a vport, if it needs to be sent to user space, it is sent via the corresponding Netlink socket. This mechanism, with various iterations of the corresponding user space code, has seen some limitations and issues: * On systems with a large number of vports, there is a correspondingly large number of Netlink sockets which can limit scaling. (https://bugzilla.redhat.com/show_bug.cgi?id=1526306) * Packet reordering on upcalls. (https://bugzilla.redhat.com/show_bug.cgi?id=1844576) * A thundering herd issue. (https://bugzilla.redhat.com/show_bug.cgi?id=1834444) This patch introduces an alternative, feature-negotiated, upcall mode using a per-cpu dispatch rather than a per-vport dispatch. In this mode, the Netlink socket to be used for the upcall is selected based on the CPU of the thread that is executing the upcall. In this way, it resolves the issues above as: a) The number of Netlink sockets scales with the number of CPUs rather than the number of vports. b) Ordering per-flow is maintained as packets are distributed to CPUs based on mechanisms such as RSS and flows are distributed to a single user space thread. c) Packets from a flow can only wake up one user space thread. The corresponding user space code can be found at: https://mail.openvswitch.org/pipermail/ovs-dev/2021-July/385139.html Bugzilla: https://bugzilla.redhat.com/1844576 Signed-off-by: Mark Gray Acked-by: Flavio Leitner Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 8d16744edc31..6571b57b2268 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -70,6 +70,8 @@ enum ovs_datapath_cmd { * set on the datapath port (for OVS_ACTION_ATTR_MISS). Only valid on * %OVS_DP_CMD_NEW requests. A value of zero indicates that upcalls should * not be sent. + * OVS_DP_ATTR_PER_CPU_PIDS: Per-cpu array of PIDs for upcalls when + * OVS_DP_F_DISPATCH_UPCALL_PER_CPU feature is set. * @OVS_DP_ATTR_STATS: Statistics about packets that have passed through the * datapath. Always present in notifications. * @OVS_DP_ATTR_MEGAFLOW_STATS: Statistics about mega flow masks usage for the @@ -87,6 +89,9 @@ enum ovs_datapath_attr { OVS_DP_ATTR_USER_FEATURES, /* OVS_DP_F_* */ OVS_DP_ATTR_PAD, OVS_DP_ATTR_MASKS_CACHE_SIZE, + OVS_DP_ATTR_PER_CPU_PIDS, /* Netlink PIDS to receive upcalls in per-cpu + * dispatch mode + */ __OVS_DP_ATTR_MAX }; @@ -127,6 +132,9 @@ struct ovs_vport_stats { /* Allow tc offload recirc sharing */ #define OVS_DP_F_TC_RECIRC_SHARING (1 << 2) +/* Allow per-cpu dispatch of upcalls */ +#define OVS_DP_F_DISPATCH_UPCALL_PER_CPU (1 << 3) + /* Fixed logical ports. */ #define OVSP_LOCAL ((__u32)0) -- cgit v1.2.3 From f4b7002a7076f025dce59647a77c8251175d2b34 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Mon, 19 Jul 2021 20:06:28 +0300 Subject: net: bridge: add vlan mcast snooping knob Add a global knob that controls if vlan multicast snooping is enabled. The proper contexts (vlan or bridge-wide) will be chosen based on the knob when processing packets and changing bridge device state. Note that vlans have their individual mcast snooping enabled by default, but this knob is needed to turn on bridge vlan snooping. It is disabled by default. To enable the knob vlan filtering must also be enabled, it doesn't make sense to have vlan mcast snooping without vlan filtering since that would lead to inconsistencies. Disabling vlan filtering will also automatically disable vlan mcast snooping. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 6b56a7549531..7927ad80ee86 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -720,12 +720,14 @@ struct br_mcast_stats { /* bridge boolean options * BR_BOOLOPT_NO_LL_LEARN - disable learning from link-local packets + * BR_BOOLOPT_MCAST_VLAN_SNOOPING - control vlan multicast snooping * * IMPORTANT: if adding a new option do not forget to handle * it in br_boolopt_toggle/get and bridge sysfs */ enum br_boolopt_id { BR_BOOLOPT_NO_LL_LEARN, + BR_BOOLOPT_MCAST_VLAN_SNOOPING, BR_BOOLOPT_MAX }; -- cgit v1.2.3 From 1e9ca45662d6bb65fb60d3fbb7737b081d9cffc9 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Mon, 19 Jul 2021 20:06:33 +0300 Subject: net: bridge: multicast: include router port vlan id in notifications Use the port multicast context to check if the router port is a vlan and in case it is include its vlan id in the notification. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 7927ad80ee86..90ac9e11c15b 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -629,6 +629,7 @@ enum { MDBA_ROUTER_PATTR_TYPE, MDBA_ROUTER_PATTR_INET_TIMER, MDBA_ROUTER_PATTR_INET6_TIMER, + MDBA_ROUTER_PATTR_VID, __MDBA_ROUTER_PATTR_MAX }; #define MDBA_ROUTER_PATTR_MAX (__MDBA_ROUTER_PATTR_MAX - 1) -- cgit v1.2.3 From 47ecd2dbd8ec43125ea75d7d2e73c888cda8663f Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Mon, 19 Jul 2021 20:06:34 +0300 Subject: net: bridge: vlan: add support for global options We can have two types of vlan options depending on context: - per-device vlan options (split in per-bridge and per-port) - global vlan options The second type wasn't supported in the bridge until now, but we need them for per-vlan multicast support, per-vlan STP support and other options which require global vlan context. They are contained in the global bridge vlan context even if the vlan is not configured on the bridge device itself. This patch adds initial netlink attributes and support for setting these global vlan options, they can only be set (RTM_NEWVLAN) and the operation must use the bridge device. Since there are no such options yet it shouldn't have any functional effect. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 90ac9e11c15b..4ed57d1a5d89 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -485,10 +485,15 @@ enum { * [BRIDGE_VLANDB_ENTRY_INFO] * ... * } + * [BRIDGE_VLANDB_GLOBAL_OPTIONS] = { + * [BRIDGE_VLANDB_GOPTS_ID] + * ... + * } */ enum { BRIDGE_VLANDB_UNSPEC, BRIDGE_VLANDB_ENTRY, + BRIDGE_VLANDB_GLOBAL_OPTIONS, __BRIDGE_VLANDB_MAX, }; #define BRIDGE_VLANDB_MAX (__BRIDGE_VLANDB_MAX - 1) @@ -538,6 +543,14 @@ enum { }; #define BRIDGE_VLANDB_STATS_MAX (__BRIDGE_VLANDB_STATS_MAX - 1) +enum { + BRIDGE_VLANDB_GOPTS_UNSPEC, + BRIDGE_VLANDB_GOPTS_ID, + BRIDGE_VLANDB_GOPTS_RANGE, + __BRIDGE_VLANDB_GOPTS_MAX +}; +#define BRIDGE_VLANDB_GOPTS_MAX (__BRIDGE_VLANDB_GOPTS_MAX - 1) + /* Bridge multicast database attributes * [MDBA_MDB] = { * [MDBA_MDB_ENTRY] = { -- cgit v1.2.3 From 743a53d9636aad83da63a8638e8365e817ef6365 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Mon, 19 Jul 2021 20:06:35 +0300 Subject: net: bridge: vlan: add support for dumping global vlan options Add a new vlan options dump flag which causes only global vlan options to be dumped. The dumps are done only with bridge devices, ports are ignored. They support vlan compression if the options in sequential vlans are equal (currently always true). Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 4ed57d1a5d89..946ccf33dc53 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -479,6 +479,7 @@ enum { /* flags used in BRIDGE_VLANDB_DUMP_FLAGS attribute to affect dumps */ #define BRIDGE_VLANDB_DUMPF_STATS (1 << 0) /* Include stats in the dump */ +#define BRIDGE_VLANDB_DUMPF_GLOBAL (1 << 1) /* Dump global vlan options only */ /* Bridge vlan RTM attributes * [BRIDGE_VLANDB_ENTRY] = { -- cgit v1.2.3 From 9dee572c384846f4ece029ab5688faed0682e48a Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Mon, 19 Jul 2021 20:06:37 +0300 Subject: net: bridge: vlan: add mcast snooping control Add a new global vlan option which controls whether multicast snooping is enabled or disabled for a single vlan. It controls the vlan private flag: BR_VLFLAG_GLOBAL_MCAST_ENABLED. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 946ccf33dc53..5aca85874447 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -548,6 +548,7 @@ enum { BRIDGE_VLANDB_GOPTS_UNSPEC, BRIDGE_VLANDB_GOPTS_ID, BRIDGE_VLANDB_GOPTS_RANGE, + BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING, __BRIDGE_VLANDB_GOPTS_MAX }; #define BRIDGE_VLANDB_GOPTS_MAX (__BRIDGE_VLANDB_GOPTS_MAX - 1) -- cgit v1.2.3 From 2d151d39073aff498358543801fca0f670fea981 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Sun, 18 Jul 2021 09:11:06 +0200 Subject: xfrm: Add possibility to set the default to block if we have no policy As the default we assume the traffic to pass, if we have no matching IPsec policy. With this patch, we have a possibility to change this default from allow to block. It can be configured via netlink. Each direction (input/output/forward) can be configured separately. With the default to block configuered, we need allow policies for all packet flows we accept. We do not use default policy lookup for the loopback device. v1->v2 - fix compiling when XFRM is disabled - Reported-by: kernel test robot Co-developed-by: Christian Langrock Signed-off-by: Christian Langrock Co-developed-by: Antony Antony Signed-off-by: Antony Antony Signed-off-by: Steffen Klassert --- include/uapi/linux/xfrm.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index ffc6a5391bb7..6e8095106192 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -213,6 +213,11 @@ enum { XFRM_MSG_GETSPDINFO, #define XFRM_MSG_GETSPDINFO XFRM_MSG_GETSPDINFO + XFRM_MSG_SETDEFAULT, +#define XFRM_MSG_SETDEFAULT XFRM_MSG_SETDEFAULT + XFRM_MSG_GETDEFAULT, +#define XFRM_MSG_GETDEFAULT XFRM_MSG_GETDEFAULT + XFRM_MSG_MAPPING, #define XFRM_MSG_MAPPING XFRM_MSG_MAPPING __XFRM_MSG_MAX @@ -508,6 +513,11 @@ struct xfrm_user_offload { #define XFRM_OFFLOAD_IPV6 1 #define XFRM_OFFLOAD_INBOUND 2 +struct xfrm_userpolicy_default { + __u8 dirmask; + __u8 action; +}; + #ifndef __KERNEL__ /* backwards compatibility for userspace */ #define XFRMGRP_ACQUIRE 1 -- cgit v1.2.3 From db67f219fc9365a0c456666ed7c134d43ab0be8a Mon Sep 17 00:00:00 2001 From: Justin Iurman Date: Tue, 20 Jul 2021 21:42:56 +0200 Subject: uapi: IPv6 IOAM headers definition This patch provides the IPv6 IOAM option header [1] as well as the IOAM Trace header [2]. An IOAM option must be 4n-aligned. Here is an overview of a Hop-by-Hop with an IOAM Trace option: +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | Next header | Hdr Ext Len | Padding | Padding | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | Option Type | Opt Data Len | Reserved | IOAM Type | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | Namespace-ID | NodeLen | Flags | RemainingLen| +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | IOAM-Trace-Type | Reserved | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+<-+ | | | | node data [n] | | | | | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ D | | a | node data [n-1] | t | | a +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ ~ ... ~ S +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ p | | a | node data [1] | c | | e +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | | | | | node data [0] | | | | | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+<-+ The IOAM option header starts at "Option Type" and ends after "IOAM Type". The IOAM Trace header starts at "Namespace-ID" and ends after "IOAM-Trace-Type/Reserved". IOAM Type: either Pre-allocated Trace (=0), Incremental Trace (=1), Proof-of-Transit (=2) or Edge-to-Edge (=3). Note that both the Pre-allocated Trace and the Incremental Trace look the same. The two others are not implemented. Namespace-ID: IOAM namespace identifier, not to be confused with network namespaces. It adds further context to IOAM options and associated data, and allows devices which are IOAM capable to determine whether IOAM options must be processed or ignored. It can also be used by an operator to distinguish different operational domains or to identify different sets of devices. NodeLen: Length of data added by each node. It depends on the Trace Type. Flags: Only the Overflow (O) flag for now. The O flag is set by a transit node when there are not enough octets left to record its data. RemainingLen: Remaining free space to record data. IOAM-Trace-Type: Bit field where each bit corresponds to a specific kind of IOAM data. See [2] for a detailed list. [1] https://tools.ietf.org/html/draft-ietf-ippm-ioam-ipv6-options [2] https://tools.ietf.org/html/draft-ietf-ippm-ioam-data Signed-off-by: Justin Iurman Signed-off-by: David S. Miller --- include/uapi/linux/ioam6.h | 123 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 include/uapi/linux/ioam6.h (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ioam6.h b/include/uapi/linux/ioam6.h new file mode 100644 index 000000000000..2177e4e49566 --- /dev/null +++ b/include/uapi/linux/ioam6.h @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +/* + * IPv6 IOAM implementation + * + * Author: + * Justin Iurman + */ + +#ifndef _UAPI_LINUX_IOAM6_H +#define _UAPI_LINUX_IOAM6_H + +#include +#include + +/* + * IPv6 IOAM Option Header + */ +struct ioam6_hdr { + __u8 opt_type; + __u8 opt_len; + __u8 :8; /* reserved */ +#define IOAM6_TYPE_PREALLOC 0 + __u8 type; +} __attribute__((packed)); + +/* + * IOAM Trace Header + */ +struct ioam6_trace_hdr { + __be16 namespace_id; + +#if defined(__LITTLE_ENDIAN_BITFIELD) + + __u8 :1, /* unused */ + :1, /* unused */ + overflow:1, + nodelen:5; + + __u8 remlen:7, + :1; /* unused */ + + union { + __be32 type_be32; + + struct { + __u32 bit7:1, + bit6:1, + bit5:1, + bit4:1, + bit3:1, + bit2:1, + bit1:1, + bit0:1, + bit15:1, /* unused */ + bit14:1, /* unused */ + bit13:1, /* unused */ + bit12:1, /* unused */ + bit11:1, + bit10:1, + bit9:1, + bit8:1, + bit23:1, /* reserved */ + bit22:1, + bit21:1, /* unused */ + bit20:1, /* unused */ + bit19:1, /* unused */ + bit18:1, /* unused */ + bit17:1, /* unused */ + bit16:1, /* unused */ + :8; /* reserved */ + } type; + }; + +#elif defined(__BIG_ENDIAN_BITFIELD) + + __u8 nodelen:5, + overflow:1, + :1, /* unused */ + :1; /* unused */ + + __u8 :1, /* unused */ + remlen:7; + + union { + __be32 type_be32; + + struct { + __u32 bit0:1, + bit1:1, + bit2:1, + bit3:1, + bit4:1, + bit5:1, + bit6:1, + bit7:1, + bit8:1, + bit9:1, + bit10:1, + bit11:1, + bit12:1, /* unused */ + bit13:1, /* unused */ + bit14:1, /* unused */ + bit15:1, /* unused */ + bit16:1, /* unused */ + bit17:1, /* unused */ + bit18:1, /* unused */ + bit19:1, /* unused */ + bit20:1, /* unused */ + bit21:1, /* unused */ + bit22:1, + bit23:1, /* reserved */ + :8; /* reserved */ + } type; + }; + +#else +#error "Please fix " +#endif + + __u8 data[0]; +} __attribute__((packed)); + +#endif /* _UAPI_LINUX_IOAM6_H */ -- cgit v1.2.3 From 9ee11f0fff205b4b3df9750bff5e94f97c71b6a0 Mon Sep 17 00:00:00 2001 From: Justin Iurman Date: Tue, 20 Jul 2021 21:42:57 +0200 Subject: ipv6: ioam: Data plane support for Pre-allocated Trace Implement support for processing the IOAM Pre-allocated Trace with IPv6, see [1] and [2]. Introduce a new IPv6 Hop-by-Hop TLV option, see IANA [3]. A new per-interface sysctl is introduced. The value is a boolean to accept (=1) or ignore (=0, by default) IPv6 IOAM options on ingress for an interface: - net.ipv6.conf.XXX.ioam6_enabled Two other sysctls are introduced to define IOAM IDs, represented by an integer. They are respectively per-namespace and per-interface: - net.ipv6.ioam6_id - net.ipv6.conf.XXX.ioam6_id The value of the first one represents the IOAM ID of the node itself (u32; max and default value = U32_MAX>>8, due to hop limit concatenation) while the other represents the IOAM ID of an interface (u16; max and default value = U16_MAX). Each "ioam6_id" sysctl has a "_wide" equivalent: - net.ipv6.ioam6_id_wide - net.ipv6.conf.XXX.ioam6_id_wide The value of the first one represents the wide IOAM ID of the node itself (u64; max and default value = U64_MAX>>8, due to hop limit concatenation) while the other represents the wide IOAM ID of an interface (u32; max and default value = U32_MAX). The use of short and wide equivalents is not exclusive, a deployment could choose to leverage both. For example, net.ipv6.conf.XXX.ioam6_id (short format) could be an identifier for a physical interface, whereas net.ipv6.conf.XXX.ioam6_id_wide (wide format) could be an identifier for a logical sub-interface. Documentation about new sysctls is provided at the end of this patchset. Two relativistic hash tables are used: one for IOAM namespaces, the other for IOAM schemas. A namespace can only have a single active schema and a schema can only be attached to a single namespace (1:1 relationship). [1] https://tools.ietf.org/html/draft-ietf-ippm-ioam-ipv6-options [2] https://tools.ietf.org/html/draft-ietf-ippm-ioam-data [3] https://www.iana.org/assignments/ipv6-parameters/ipv6-parameters.xhtml#ipv6-parameters-2 Signed-off-by: Justin Iurman Signed-off-by: David S. Miller --- include/uapi/linux/in6.h | 1 + include/uapi/linux/ioam6.h | 9 +++++++++ include/uapi/linux/ipv6.h | 3 +++ 3 files changed, 13 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h index 5ad396a57eb3..c4c53a9ab959 100644 --- a/include/uapi/linux/in6.h +++ b/include/uapi/linux/in6.h @@ -145,6 +145,7 @@ struct in6_flowlabel_req { #define IPV6_TLV_PADN 1 #define IPV6_TLV_ROUTERALERT 5 #define IPV6_TLV_CALIPSO 7 /* RFC 5570 */ +#define IPV6_TLV_IOAM 49 /* TEMPORARY IANA allocation for IOAM */ #define IPV6_TLV_JUMBO 194 #define IPV6_TLV_HAO 201 /* home address option */ diff --git a/include/uapi/linux/ioam6.h b/include/uapi/linux/ioam6.h index 2177e4e49566..23ba6e85582f 100644 --- a/include/uapi/linux/ioam6.h +++ b/include/uapi/linux/ioam6.h @@ -12,6 +12,15 @@ #include #include +#define IOAM6_U16_UNAVAILABLE U16_MAX +#define IOAM6_U32_UNAVAILABLE U32_MAX +#define IOAM6_U64_UNAVAILABLE U64_MAX + +#define IOAM6_DEFAULT_ID (IOAM6_U32_UNAVAILABLE >> 8) +#define IOAM6_DEFAULT_ID_WIDE (IOAM6_U64_UNAVAILABLE >> 8) +#define IOAM6_DEFAULT_IF_ID IOAM6_U16_UNAVAILABLE +#define IOAM6_DEFAULT_IF_ID_WIDE IOAM6_U32_UNAVAILABLE + /* * IPv6 IOAM Option Header */ diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index 70603775fe91..b243a53fa985 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -190,6 +190,9 @@ enum { DEVCONF_NDISC_TCLASS, DEVCONF_RPL_SEG_ENABLED, DEVCONF_RA_DEFRTR_METRIC, + DEVCONF_IOAM6_ENABLED, + DEVCONF_IOAM6_ID, + DEVCONF_IOAM6_ID_WIDE, DEVCONF_MAX }; -- cgit v1.2.3 From 8c6f6fa6772696be0c047a711858084b38763728 Mon Sep 17 00:00:00 2001 From: Justin Iurman Date: Tue, 20 Jul 2021 21:42:58 +0200 Subject: ipv6: ioam: IOAM Generic Netlink API Add Generic Netlink commands to allow userspace to configure IOAM namespaces and schemas. The target is iproute2 and the patch is ready. It will be posted as soon as this patchset is merged. Here is an overview: $ ip ioam Usage: ip ioam { COMMAND | help } ip ioam namespace show ip ioam namespace add ID [ data DATA32 ] [ wide DATA64 ] ip ioam namespace del ID ip ioam schema show ip ioam schema add ID DATA ip ioam schema del ID ip ioam namespace set ID schema { ID | none } Signed-off-by: Justin Iurman Signed-off-by: David S. Miller --- include/uapi/linux/ioam6_genl.h | 52 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 include/uapi/linux/ioam6_genl.h (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ioam6_genl.h b/include/uapi/linux/ioam6_genl.h new file mode 100644 index 000000000000..ca4b22833754 --- /dev/null +++ b/include/uapi/linux/ioam6_genl.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +/* + * IPv6 IOAM Generic Netlink API + * + * Author: + * Justin Iurman + */ + +#ifndef _UAPI_LINUX_IOAM6_GENL_H +#define _UAPI_LINUX_IOAM6_GENL_H + +#define IOAM6_GENL_NAME "IOAM6" +#define IOAM6_GENL_VERSION 0x1 + +enum { + IOAM6_ATTR_UNSPEC, + + IOAM6_ATTR_NS_ID, /* u16 */ + IOAM6_ATTR_NS_DATA, /* u32 */ + IOAM6_ATTR_NS_DATA_WIDE,/* u64 */ + +#define IOAM6_MAX_SCHEMA_DATA_LEN (255 * 4) + IOAM6_ATTR_SC_ID, /* u32 */ + IOAM6_ATTR_SC_DATA, /* Binary */ + IOAM6_ATTR_SC_NONE, /* Flag */ + + IOAM6_ATTR_PAD, + + __IOAM6_ATTR_MAX, +}; + +#define IOAM6_ATTR_MAX (__IOAM6_ATTR_MAX - 1) + +enum { + IOAM6_CMD_UNSPEC, + + IOAM6_CMD_ADD_NAMESPACE, + IOAM6_CMD_DEL_NAMESPACE, + IOAM6_CMD_DUMP_NAMESPACES, + + IOAM6_CMD_ADD_SCHEMA, + IOAM6_CMD_DEL_SCHEMA, + IOAM6_CMD_DUMP_SCHEMAS, + + IOAM6_CMD_NS_SET_SCHEMA, + + __IOAM6_CMD_MAX, +}; + +#define IOAM6_CMD_MAX (__IOAM6_CMD_MAX - 1) + +#endif /* _UAPI_LINUX_IOAM6_GENL_H */ -- cgit v1.2.3 From 3edede08ff37c6a9370510508d5eeb54890baf47 Mon Sep 17 00:00:00 2001 From: Justin Iurman Date: Tue, 20 Jul 2021 21:42:59 +0200 Subject: ipv6: ioam: Support for IOAM injection with lwtunnels Add support for the IOAM inline insertion (only for the host-to-host use case) which is per-route configured with lightweight tunnels. The target is iproute2 and the patch is ready. It will be posted as soon as this patchset is merged. Here is an overview: $ ip -6 ro ad fc00::1/128 encap ioam6 trace type 0x800000 ns 1 size 12 dev eth0 This example configures an IOAM Pre-allocated Trace option attached to the fc00::1/128 prefix. The IOAM namespace (ns) is 1, the size of the pre-allocated trace data block is 12 octets (size) and only the first IOAM data (bit 0: hop_limit + node id) is included in the trace (type) represented as a bitfield. The reason why the in-transit (IPv6-in-IPv6 encapsulation) use case is not implemented is explained on the patchset cover. Signed-off-by: Justin Iurman Signed-off-by: David S. Miller --- include/uapi/linux/ioam6.h | 1 + include/uapi/linux/ioam6_iptunnel.h | 20 ++++++++++++++++++++ include/uapi/linux/lwtunnel.h | 1 + 3 files changed, 22 insertions(+) create mode 100644 include/uapi/linux/ioam6_iptunnel.h (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ioam6.h b/include/uapi/linux/ioam6.h index 23ba6e85582f..ac4de376f0ce 100644 --- a/include/uapi/linux/ioam6.h +++ b/include/uapi/linux/ioam6.h @@ -126,6 +126,7 @@ struct ioam6_trace_hdr { #error "Please fix " #endif +#define IOAM6_TRACE_DATA_SIZE_MAX 244 __u8 data[0]; } __attribute__((packed)); diff --git a/include/uapi/linux/ioam6_iptunnel.h b/include/uapi/linux/ioam6_iptunnel.h new file mode 100644 index 000000000000..bae14636a8c8 --- /dev/null +++ b/include/uapi/linux/ioam6_iptunnel.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +/* + * IPv6 IOAM Lightweight Tunnel API + * + * Author: + * Justin Iurman + */ + +#ifndef _UAPI_LINUX_IOAM6_IPTUNNEL_H +#define _UAPI_LINUX_IOAM6_IPTUNNEL_H + +enum { + IOAM6_IPTUNNEL_UNSPEC, + IOAM6_IPTUNNEL_TRACE, /* struct ioam6_trace_hdr */ + __IOAM6_IPTUNNEL_MAX, +}; + +#define IOAM6_IPTUNNEL_MAX (__IOAM6_IPTUNNEL_MAX - 1) + +#endif /* _UAPI_LINUX_IOAM6_IPTUNNEL_H */ diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h index 568a4303ccce..2e206919125c 100644 --- a/include/uapi/linux/lwtunnel.h +++ b/include/uapi/linux/lwtunnel.h @@ -14,6 +14,7 @@ enum lwtunnel_encap_types { LWTUNNEL_ENCAP_BPF, LWTUNNEL_ENCAP_SEG6_LOCAL, LWTUNNEL_ENCAP_RPL, + LWTUNNEL_ENCAP_IOAM6, __LWTUNNEL_ENCAP_MAX, }; -- cgit v1.2.3 From e4252cb66637b846b916cca7c2cdb4ed22ab2fc3 Mon Sep 17 00:00:00 2001 From: Mark Gray Date: Fri, 23 Jul 2021 10:24:12 -0400 Subject: openvswitch: update kdoc OVS_DP_ATTR_PER_CPU_PIDS Signed-off-by: Mark Gray Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 6571b57b2268..0e436a3755f1 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -70,7 +70,7 @@ enum ovs_datapath_cmd { * set on the datapath port (for OVS_ACTION_ATTR_MISS). Only valid on * %OVS_DP_CMD_NEW requests. A value of zero indicates that upcalls should * not be sent. - * OVS_DP_ATTR_PER_CPU_PIDS: Per-cpu array of PIDs for upcalls when + * @OVS_DP_ATTR_PER_CPU_PIDS: Per-cpu array of PIDs for upcalls when * OVS_DP_F_DISPATCH_UPCALL_PER_CPU feature is set. * @OVS_DP_ATTR_STATS: Statistics about packets that have passed through the * datapath. Always present in notifications. -- cgit v1.2.3 From 784dcfa56e0453bb197601ba0b8196f6f892ebcb Mon Sep 17 00:00:00 2001 From: Mark Gray Date: Fri, 23 Jul 2021 10:24:13 -0400 Subject: openvswitch: fix alignment issues Signed-off-by: Mark Gray Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 0e436a3755f1..150bcff49b1c 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -89,8 +89,8 @@ enum ovs_datapath_attr { OVS_DP_ATTR_USER_FEATURES, /* OVS_DP_F_* */ OVS_DP_ATTR_PAD, OVS_DP_ATTR_MASKS_CACHE_SIZE, - OVS_DP_ATTR_PER_CPU_PIDS, /* Netlink PIDS to receive upcalls in per-cpu - * dispatch mode + OVS_DP_ATTR_PER_CPU_PIDS, /* Netlink PIDS to receive upcalls in + * per-cpu dispatch mode */ __OVS_DP_ATTR_MAX }; -- cgit v1.2.3 From 56af5e749f20c3a540310c207dcc373f4f09156e Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Tue, 27 Jul 2021 18:33:15 -0700 Subject: net/sched: act_skbmod: Add SKBMOD_F_ECN option support Currently, when doing rate limiting using the tc-police(8) action, the easiest way is to simply drop the packets which exceed or conform the configured bandwidth limit. Add a new option to tc-skbmod(8), so that users may use the ECN [1] extension to explicitly inform the receiver about the congestion instead of dropping packets "on the floor". The 2 least significant bits of the Traffic Class field in IPv4 and IPv6 headers are used to represent different ECN states [2]: 0b00: "Non ECN-Capable Transport", Non-ECT 0b10: "ECN Capable Transport", ECT(0) 0b01: "ECN Capable Transport", ECT(1) 0b11: "Congestion Encountered", CE As an example: $ tc filter add dev eth0 parent 1: protocol ip prio 10 \ matchall action skbmod ecn Doing the above marks all ECT(0) and ECT(1) packets as CE. It does NOT affect Non-ECT or non-IP packets. In the tc-police scenario mentioned above, users may pipe a tc-police action and a tc-skbmod "ecn" action together to achieve ECN-based rate limiting. For TCP connections, upon receiving a CE packet, the receiver will respond with an ECE packet, asking the sender to reduce their congestion window. However ECN also works with other L4 protocols e.g. DCCP and SCTP [2], and our implementation does not touch or care about L4 headers. The updated tc-skbmod SYNOPSIS looks like the following: tc ... action skbmod { set SETTABLE | swap SWAPPABLE | ecn } ... Only one of "set", "swap" or "ecn" shall be used in a single tc-skbmod command. Trying to use more than one of them at a time is considered undefined behavior; pipe multiple tc-skbmod commands together instead. "set" and "swap" only affect Ethernet packets, while "ecn" only affects IPv{4,6} packets. It is also worth mentioning that, in theory, the same effect could be achieved by piping a "police" action and a "bpf" action using the bpf_skb_ecn_set_ce() helper, but this requires eBPF programming from the user, thus impractical. Depends on patch "net/sched: act_skbmod: Skip non-Ethernet packets". [1] https://datatracker.ietf.org/doc/html/rfc3168 [2] https://en.wikipedia.org/wiki/Explicit_Congestion_Notification Reviewed-by: Cong Wang Signed-off-by: Peilin Ye Signed-off-by: David S. Miller --- include/uapi/linux/tc_act/tc_skbmod.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/tc_act/tc_skbmod.h b/include/uapi/linux/tc_act/tc_skbmod.h index c525b3503797..af6ef2cfbf3d 100644 --- a/include/uapi/linux/tc_act/tc_skbmod.h +++ b/include/uapi/linux/tc_act/tc_skbmod.h @@ -17,6 +17,7 @@ #define SKBMOD_F_SMAC 0x2 #define SKBMOD_F_ETYPE 0x4 #define SKBMOD_F_SWAPMAC 0x8 +#define SKBMOD_F_ECN 0x10 struct tc_skbmod { tc_gen; -- cgit v1.2.3 From 5d8dbb7fb82b8661c16d496644b931c0e2e3a12e Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Wed, 28 Jul 2021 19:38:18 +0300 Subject: net: xfrm: fix shift-out-of-bounce We need to check up->dirmask to avoid shift-out-of-bounce bug, since up->dirmask comes from userspace. Also, added XFRM_USERPOLICY_DIRMASK_MAX constant to uapi to inform user-space that up->dirmask has maximum possible value Fixes: 2d151d39073a ("xfrm: Add possibility to set the default to block if we have no policy") Reported-and-tested-by: syzbot+9cd5837a045bbee5b810@syzkaller.appspotmail.com Signed-off-by: Pavel Skripkin Signed-off-by: Steffen Klassert --- include/uapi/linux/xfrm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index 6e8095106192..b96c1ea7166d 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -514,6 +514,7 @@ struct xfrm_user_offload { #define XFRM_OFFLOAD_INBOUND 2 struct xfrm_userpolicy_default { +#define XFRM_USERPOLICY_DIRMASK_MAX (sizeof(__u8) * 8) __u8 dirmask; __u8 action; }; -- cgit v1.2.3 From bc49d8169aa72295104f1558830c568efb946315 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Thu, 29 Jul 2021 10:20:39 +0800 Subject: mctp: Add MCTP base Add basic Kconfig, an initial (empty) af_mctp source object, and {AF,PF}_MCTP definitions, and the required definitions for a new protocol type. Signed-off-by: Jeremy Kerr Signed-off-by: David S. Miller --- include/uapi/linux/mctp.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 include/uapi/linux/mctp.h (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/mctp.h b/include/uapi/linux/mctp.h new file mode 100644 index 000000000000..2640a589c14c --- /dev/null +++ b/include/uapi/linux/mctp.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Management Component Transport Protocol (MCTP) + * + * Copyright (c) 2021 Code Construct + * Copyright (c) 2021 Google + */ + +#ifndef __UAPI_MCTP_H +#define __UAPI_MCTP_H + +struct sockaddr_mctp { +}; + +#endif /* __UAPI_MCTP_H */ -- cgit v1.2.3 From 60fc63981693f807baa0e404104dedea0e8b4e61 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Thu, 29 Jul 2021 10:20:42 +0800 Subject: mctp: Add sockaddr_mctp to uapi This change introduces the user-visible MCTP header, containing the protocol-specific addressing definitions. Signed-off-by: Jeremy Kerr Signed-off-by: David S. Miller --- include/uapi/linux/mctp.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/mctp.h b/include/uapi/linux/mctp.h index 2640a589c14c..52b54d13f385 100644 --- a/include/uapi/linux/mctp.h +++ b/include/uapi/linux/mctp.h @@ -9,7 +9,28 @@ #ifndef __UAPI_MCTP_H #define __UAPI_MCTP_H +#include + +typedef __u8 mctp_eid_t; + +struct mctp_addr { + mctp_eid_t s_addr; +}; + struct sockaddr_mctp { + unsigned short int smctp_family; + int smctp_network; + struct mctp_addr smctp_addr; + __u8 smctp_type; + __u8 smctp_tag; }; +#define MCTP_NET_ANY 0x0 + +#define MCTP_ADDR_NULL 0x00 +#define MCTP_ADDR_ANY 0xff + +#define MCTP_TAG_MASK 0x07 +#define MCTP_TAG_OWNER 0x08 + #endif /* __UAPI_MCTP_H */ -- cgit v1.2.3 From 4b2e69305cbbc7c32ecbd946110b505c4ff6071a Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Thu, 29 Jul 2021 10:20:43 +0800 Subject: mctp: Add initial driver infrastructure Add an empty drivers/net/mctp/, for future interface drivers. Signed-off-by: Jeremy Kerr Signed-off-by: David S. Miller --- include/uapi/linux/if_arp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_arp.h b/include/uapi/linux/if_arp.h index c3cc5a9e5eaf..4783af9fe520 100644 --- a/include/uapi/linux/if_arp.h +++ b/include/uapi/linux/if_arp.h @@ -54,6 +54,7 @@ #define ARPHRD_X25 271 /* CCITT X.25 */ #define ARPHRD_HWX25 272 /* Boards with X.25 in firmware */ #define ARPHRD_CAN 280 /* Controller Area Network */ +#define ARPHRD_MCTP 290 #define ARPHRD_PPP 512 #define ARPHRD_CISCO 513 /* Cisco HDLC */ #define ARPHRD_HDLC ARPHRD_CISCO -- cgit v1.2.3 From 583be982d93479ea3d85091b0fd0b01201ede87d Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Thu, 29 Jul 2021 10:20:44 +0800 Subject: mctp: Add device handling and netlink interface This change adds the infrastructure for managing MCTP netdevices; we add a pointer to the AF_MCTP-specific data to struct netdevice, and hook up the rtnetlink operations for adding and removing addresses. Includes changes from Matt Johnston . Signed-off-by: Jeremy Kerr Signed-off-by: David S. Miller --- include/uapi/linux/if_ether.h | 3 +++ include/uapi/linux/if_link.h | 10 ++++++++++ include/uapi/linux/mctp.h | 1 + 3 files changed, 14 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index a0b637911d3c..5f589c7a8382 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -151,6 +151,9 @@ #define ETH_P_MAP 0x00F9 /* Qualcomm multiplexing and * aggregation protocol */ +#define ETH_P_MCTP 0x00FA /* Management component transport + * protocol packets + */ /* * This is an Ethernet frame header. diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 4882e81514b6..49b22afab78f 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1260,4 +1260,14 @@ struct ifla_rmnet_flags { __u32 mask; }; +/* MCTP section */ + +enum { + IFLA_MCTP_UNSPEC, + IFLA_MCTP_NET, + __IFLA_MCTP_MAX, +}; + +#define IFLA_MCTP_MAX (__IFLA_MCTP_MAX - 1) + #endif /* _UAPI_LINUX_IF_LINK_H */ diff --git a/include/uapi/linux/mctp.h b/include/uapi/linux/mctp.h index 52b54d13f385..a9d8edb3402b 100644 --- a/include/uapi/linux/mctp.h +++ b/include/uapi/linux/mctp.h @@ -26,6 +26,7 @@ struct sockaddr_mctp { }; #define MCTP_NET_ANY 0x0 +#define MCTP_NET_DEFAULT 0x0 #define MCTP_ADDR_NULL 0x00 #define MCTP_ADDR_ANY 0xff -- cgit v1.2.3 From 03f2bbc4ee57ca53b2fa1d9caabc5006e0b8f375 Mon Sep 17 00:00:00 2001 From: Matt Johnston Date: Thu, 29 Jul 2021 10:20:52 +0800 Subject: mctp: Allow per-netns default networks Currently we have a compile-time default network (MCTP_INITIAL_DEFAULT_NET). This change introduces a default_net field on the net namespace, allowing future configuration for new interfaces. Signed-off-by: Matt Johnston Signed-off-by: David S. Miller --- include/uapi/linux/mctp.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/mctp.h b/include/uapi/linux/mctp.h index a9d8edb3402b..52b54d13f385 100644 --- a/include/uapi/linux/mctp.h +++ b/include/uapi/linux/mctp.h @@ -26,7 +26,6 @@ struct sockaddr_mctp { }; #define MCTP_NET_ANY 0x0 -#define MCTP_NET_DEFAULT 0x0 #define MCTP_ADDR_NULL 0x00 #define MCTP_ADDR_ANY 0xff -- cgit v1.2.3 From 695176bfe5dec2051f950bdac0ae0b21e29e6de3 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 29 Jul 2021 16:12:14 -0700 Subject: net_sched: refactor TC action init API TC action ->init() API has 10 parameters, it becomes harder to read. Some of them are just boolean and can be replaced by flags. Similarly for the internal API tcf_action_init() and tcf_exts_validate(). This patch converts them to flags and fold them into the upper 16 bits of "flags", whose lower 16 bits are still reserved for user-space. More specifically, the following kernel flags are introduced: TCA_ACT_FLAGS_POLICE replace 'name' in a few contexts, to distinguish whether it is compatible with policer. TCA_ACT_FLAGS_BIND replaces 'bind', to indicate whether this action is bound to a filter. TCA_ACT_FLAGS_REPLACE replaces 'ovr' in most contexts, means we are replacing an existing action. TCA_ACT_FLAGS_NO_RTNL replaces 'rtnl_held' but has the opposite meaning, because we still hold RTNL in most cases. The only user-space flag TCA_ACT_FLAGS_NO_PERCPU_STATS is untouched and still stored as before. I have tested this patch with tdc and I do not see any failure related to this patch. Tested-by: Vlad Buslov Acked-by: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 025c40fef93d..6836ccb9c45d 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -22,6 +22,7 @@ enum { __TCA_ACT_MAX }; +/* See other TCA_ACT_FLAGS_ * flags in include/net/act_api.h. */ #define TCA_ACT_FLAGS_NO_PERCPU_STATS 1 /* Don't use percpu allocator for * actions stats. */ -- cgit v1.2.3 From 2d3e5caf96b9449af951e63476657acd759c1a30 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Sat, 31 Jul 2021 12:08:30 -0500 Subject: net/ipv4: Replace one-element array with flexible-array member MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a regular need in the kernel to provide a way to declare having a dynamically sized set of trailing elements in a structure. Kernel code should always use “flexible array members”[1] for these cases. The older style of one-element or zero-length arrays should no longer be used[2]. Use an anonymous union with a couple of anonymous structs in order to keep userspace unchanged: $ pahole -C ip_msfilter net/ipv4/ip_sockglue.o struct ip_msfilter { union { struct { __be32 imsf_multiaddr_aux; /* 0 4 */ __be32 imsf_interface_aux; /* 4 4 */ __u32 imsf_fmode_aux; /* 8 4 */ __u32 imsf_numsrc_aux; /* 12 4 */ __be32 imsf_slist[1]; /* 16 4 */ }; /* 0 20 */ struct { __be32 imsf_multiaddr; /* 0 4 */ __be32 imsf_interface; /* 4 4 */ __u32 imsf_fmode; /* 8 4 */ __u32 imsf_numsrc; /* 12 4 */ __be32 imsf_slist_flex[0]; /* 16 0 */ }; /* 0 16 */ }; /* 0 20 */ /* size: 20, cachelines: 1, members: 1 */ /* last cacheline: 20 bytes */ }; Also, refactor the code accordingly and make use of the struct_size() and flex_array_size() helpers. This helps with the ongoing efforts to globally enable -Warray-bounds and get us closer to being able to tighten the FORTIFY_SOURCE routines on memcpy(). [1] https://en.wikipedia.org/wiki/Flexible_array_member [2] https://www.kernel.org/doc/html/v5.10/process/deprecated.html#zero-length-and-one-element-arrays Link: https://github.com/KSPP/linux/issues/79 Link: https://github.com/KSPP/linux/issues/109 Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- include/uapi/linux/in.h | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h index d1b327036ae4..193b7cf1f0ac 100644 --- a/include/uapi/linux/in.h +++ b/include/uapi/linux/in.h @@ -188,11 +188,22 @@ struct ip_mreq_source { }; struct ip_msfilter { - __be32 imsf_multiaddr; - __be32 imsf_interface; - __u32 imsf_fmode; - __u32 imsf_numsrc; - __be32 imsf_slist[1]; + union { + struct { + __be32 imsf_multiaddr_aux; + __be32 imsf_interface_aux; + __u32 imsf_fmode_aux; + __u32 imsf_numsrc_aux; + __be32 imsf_slist[1]; + }; + struct { + __be32 imsf_multiaddr; + __be32 imsf_interface; + __u32 imsf_fmode; + __u32 imsf_numsrc; + __be32 imsf_slist_flex[]; + }; + }; }; #define IP_MSFILTER_SIZE(numsrc) \ -- cgit v1.2.3 From 3a755cd8b7c601f756cbbf908b84f7cc8c04a02b Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Mon, 2 Aug 2021 11:02:19 +0800 Subject: bonding: add new option lacp_active Add an option lacp_active, which is similar with team's runner.active. This option specifies whether to send LACPDU frames periodically. If set on, the LACPDU frames are sent along with the configured lacp_rate setting. If set off, the LACPDU frames acts as "speak when spoken to". Note, the LACPDU state frames still will be sent when init or unbind port. v2: remove module parameter Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 49b22afab78f..5310003523ce 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -855,6 +855,7 @@ enum { IFLA_BOND_AD_ACTOR_SYSTEM, IFLA_BOND_TLB_DYNAMIC_LB, IFLA_BOND_PEER_NOTIF_DELAY, + IFLA_BOND_AD_LACP_ACTIVE, __IFLA_BOND_MAX, }; -- cgit v1.2.3 From 5b9272e93f2efe3f6cda60cc2c26817b2ce49386 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Wed, 7 Jul 2021 11:48:54 +0200 Subject: can: j1939: extend UAPI to notify about RX status To be able to create applications with user friendly feedback, we need be able to provide receive status information. Typical ETP transfer may take seconds or even hours. To give user some clue or show a progress bar, the stack should push status updates. Same as for the TX information, the socket error queue will be used with following new signals: - J1939_EE_INFO_RX_RTS - received and accepted request to send signal. - J1939_EE_INFO_RX_DPO - received data package offset signal - J1939_EE_INFO_RX_ABORT - RX session was aborted Instead of completion signal, user will get data package. To activate this signals, application should set SOF_TIMESTAMPING_RX_SOFTWARE to the SO_TIMESTAMPING socket option. This will avoid unpredictable application behavior for the old software. Link: https://lore.kernel.org/r/20210707094854.30781-3-o.rempel@pengutronix.de Signed-off-by: Oleksij Rempel Signed-off-by: Marc Kleine-Budde --- include/uapi/linux/can/j1939.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/can/j1939.h b/include/uapi/linux/can/j1939.h index df6e821075c1..38936460f668 100644 --- a/include/uapi/linux/can/j1939.h +++ b/include/uapi/linux/can/j1939.h @@ -78,11 +78,20 @@ enum { enum { J1939_NLA_PAD, J1939_NLA_BYTES_ACKED, + J1939_NLA_TOTAL_SIZE, + J1939_NLA_PGN, + J1939_NLA_SRC_NAME, + J1939_NLA_DEST_NAME, + J1939_NLA_SRC_ADDR, + J1939_NLA_DEST_ADDR, }; enum { J1939_EE_INFO_NONE, J1939_EE_INFO_TX_ABORT, + J1939_EE_INFO_RX_RTS, + J1939_EE_INFO_RX_DPO, + J1939_EE_INFO_RX_ABORT, }; struct j1939_filter { -- cgit v1.2.3 From 04190bf8944deb7e3ac165a1a494db23aa0160a9 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Wed, 4 Aug 2021 10:55:56 +0300 Subject: sock: allow reading and changing sk_userlocks with setsockopt SOCK_SNDBUF_LOCK and SOCK_RCVBUF_LOCK flags disable automatic socket buffers adjustment done by kernel (see tcp_fixup_rcvbuf() and tcp_sndbuf_expand()). If we've just created a new socket this adjustment is enabled on it, but if one changes the socket buffer size by setsockopt(SO_{SND,RCV}BUF*) it becomes disabled. CRIU needs to call setsockopt(SO_{SND,RCV}BUF*) on each socket on restore as it first needs to increase buffer sizes for packet queues restore and second it needs to restore back original buffer sizes. So after CRIU restore all sockets become non-auto-adjustable, which can decrease network performance of restored applications significantly. CRIU need to be able to restore sockets with enabled/disabled adjustment to the same state it was before dump, so let's add special setsockopt for it. Let's also export SOCK_SNDBUF_LOCK and SOCK_RCVBUF_LOCK flags to uAPI so that using these interface one can reenable automatic socket buffer adjustment on their sockets. Signed-off-by: Pavel Tikhomirov Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/uapi/linux/socket.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/socket.h b/include/uapi/linux/socket.h index c3409c8ec0dd..eb0a9a5b6e71 100644 --- a/include/uapi/linux/socket.h +++ b/include/uapi/linux/socket.h @@ -26,4 +26,9 @@ struct __kernel_sockaddr_storage { }; }; +#define SOCK_SNDBUF_LOCK 1 +#define SOCK_RCVBUF_LOCK 2 + +#define SOCK_BUF_LOCK_MASK (SOCK_SNDBUF_LOCK | SOCK_RCVBUF_LOCK) + #endif /* _UAPI_LINUX_SOCKET_H */ -- cgit v1.2.3 From db243b796439c0caba47865564d8acd18a301d18 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 4 Aug 2021 15:45:36 -0500 Subject: net/ipv4/ipv6: Replace one-element arraya with flexible-array members MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a regular need in the kernel to provide a way to declare having a dynamically sized set of trailing elements in a structure. Kernel code should always use “flexible array members”[1] for these cases. The older style of one-element or zero-length arrays should no longer be used[2]. Use an anonymous union with a couple of anonymous structs in order to keep userspace unchanged and refactor the related code accordingly: $ pahole -C group_filter net/ipv4/ip_sockglue.o struct group_filter { union { struct { __u32 gf_interface_aux; /* 0 4 */ /* XXX 4 bytes hole, try to pack */ struct __kernel_sockaddr_storage gf_group_aux; /* 8 128 */ /* --- cacheline 2 boundary (128 bytes) was 8 bytes ago --- */ __u32 gf_fmode_aux; /* 136 4 */ __u32 gf_numsrc_aux; /* 140 4 */ struct __kernel_sockaddr_storage gf_slist[1]; /* 144 128 */ }; /* 0 272 */ struct { __u32 gf_interface; /* 0 4 */ /* XXX 4 bytes hole, try to pack */ struct __kernel_sockaddr_storage gf_group; /* 8 128 */ /* --- cacheline 2 boundary (128 bytes) was 8 bytes ago --- */ __u32 gf_fmode; /* 136 4 */ __u32 gf_numsrc; /* 140 4 */ struct __kernel_sockaddr_storage gf_slist_flex[0]; /* 144 0 */ }; /* 0 144 */ }; /* 0 272 */ /* size: 272, cachelines: 5, members: 1 */ /* last cacheline: 16 bytes */ }; $ pahole -C compat_group_filter net/ipv4/ip_sockglue.o struct compat_group_filter { union { struct { __u32 gf_interface_aux; /* 0 4 */ struct __kernel_sockaddr_storage gf_group_aux __attribute__((__aligned__(4))); /* 4 128 */ /* --- cacheline 2 boundary (128 bytes) was 4 bytes ago --- */ __u32 gf_fmode_aux; /* 132 4 */ __u32 gf_numsrc_aux; /* 136 4 */ struct __kernel_sockaddr_storage gf_slist[1] __attribute__((__aligned__(4))); /* 140 128 */ } __attribute__((__packed__)) __attribute__((__aligned__(4))); /* 0 268 */ struct { __u32 gf_interface; /* 0 4 */ struct __kernel_sockaddr_storage gf_group __attribute__((__aligned__(4))); /* 4 128 */ /* --- cacheline 2 boundary (128 bytes) was 4 bytes ago --- */ __u32 gf_fmode; /* 132 4 */ __u32 gf_numsrc; /* 136 4 */ struct __kernel_sockaddr_storage gf_slist_flex[0] __attribute__((__aligned__(4))); /* 140 0 */ } __attribute__((__packed__)) __attribute__((__aligned__(4))); /* 0 140 */ } __attribute__((__aligned__(1))); /* 0 268 */ /* size: 268, cachelines: 5, members: 1 */ /* forced alignments: 1 */ /* last cacheline: 12 bytes */ } __attribute__((__packed__)); This helps with the ongoing efforts to globally enable -Warray-bounds and get us closer to being able to tighten the FORTIFY_SOURCE routines on memcpy(). [1] https://en.wikipedia.org/wiki/Flexible_array_member [2] https://www.kernel.org/doc/html/v5.10/process/deprecated.html#zero-length-and-one-element-arrays Link: https://github.com/KSPP/linux/issues/79 Link: https://github.com/KSPP/linux/issues/109 Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- include/uapi/linux/in.h | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h index 193b7cf1f0ac..14168225cecd 100644 --- a/include/uapi/linux/in.h +++ b/include/uapi/linux/in.h @@ -222,11 +222,22 @@ struct group_source_req { }; struct group_filter { - __u32 gf_interface; /* interface index */ - struct __kernel_sockaddr_storage gf_group; /* multicast address */ - __u32 gf_fmode; /* filter mode */ - __u32 gf_numsrc; /* number of sources */ - struct __kernel_sockaddr_storage gf_slist[1]; /* interface index */ + union { + struct { + __u32 gf_interface_aux; /* interface index */ + struct __kernel_sockaddr_storage gf_group_aux; /* multicast address */ + __u32 gf_fmode_aux; /* filter mode */ + __u32 gf_numsrc_aux; /* number of sources */ + struct __kernel_sockaddr_storage gf_slist[1]; /* interface index */ + }; + struct { + __u32 gf_interface; /* interface index */ + struct __kernel_sockaddr_storage gf_group; /* multicast address */ + __u32 gf_fmode; /* filter mode */ + __u32 gf_numsrc; /* number of sources */ + struct __kernel_sockaddr_storage gf_slist_flex[]; /* interface index */ + }; + }; }; #define GROUP_FILTER_SIZE(numsrc) \ -- cgit v1.2.3 From 9344988d2979ce9eefe136a69efcf692615ebba8 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 30 Jul 2021 15:14:22 +0200 Subject: netfilter: ctnetlink: allow to filter dump by status bits If CTA_STATUS is present, but CTA_STATUS_MASK is not, then the mask is automatically set to 'status', so that kernel returns those entries that have all of the requested bits set. This makes more sense than using a all-one mask since we'd hardly ever find a match. There are no other checks for status bits, so if e.g. userspace sets impossible combinations it will get an empty dump. If kernel would reject unknown status bits, then a program that works on a future kernel that has IPS_FOO bit fails on old kernels. Same for 'impossible' combinations: Kernel never sets ASSURED without first having set SEEN_REPLY, but its possible that a future kernel could do so. Therefore no sanity tests other than a 0-mask. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nfnetlink_conntrack.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nfnetlink_conntrack.h b/include/uapi/linux/netfilter/nfnetlink_conntrack.h index d8484be72fdc..c6e6d7d7d538 100644 --- a/include/uapi/linux/netfilter/nfnetlink_conntrack.h +++ b/include/uapi/linux/netfilter/nfnetlink_conntrack.h @@ -56,6 +56,7 @@ enum ctattr_type { CTA_LABELS_MASK, CTA_SYNPROXY, CTA_FILTER, + CTA_STATUS_MASK, __CTA_MAX }; #define CTA_MAX (__CTA_MAX - 1) -- cgit v1.2.3 From df271cd641f101decaa4f7c1dd5c62939900bd4c Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 10 Aug 2021 18:29:19 +0300 Subject: net: bridge: vlan: add support for mcast igmp/mld version global options Add support to change and retrieve global vlan IGMP/MLD versions. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 5aca85874447..5188b9f6da28 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -549,6 +549,8 @@ enum { BRIDGE_VLANDB_GOPTS_ID, BRIDGE_VLANDB_GOPTS_RANGE, BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING, + BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION, + BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION, __BRIDGE_VLANDB_GOPTS_MAX }; #define BRIDGE_VLANDB_GOPTS_MAX (__BRIDGE_VLANDB_GOPTS_MAX - 1) -- cgit v1.2.3 From 931ba87d2017f3869d656f3c705883549bfeb97f Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 10 Aug 2021 18:29:20 +0300 Subject: net: bridge: vlan: add support for mcast last member count global option Add support to change and retrieve global vlan multicast last member count option. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 5188b9f6da28..d7a150034376 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -551,6 +551,7 @@ enum { BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING, BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION, BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION, + BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT, __BRIDGE_VLANDB_GOPTS_MAX }; #define BRIDGE_VLANDB_GOPTS_MAX (__BRIDGE_VLANDB_GOPTS_MAX - 1) -- cgit v1.2.3 From 50725f6e6b217e7661ca696b7cc1f1b9aa7bda84 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 10 Aug 2021 18:29:21 +0300 Subject: net: bridge: vlan: add support for mcast startup query count global option Add support to change and retrieve global vlan multicast startup query count option. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index d7a150034376..082b413e1342 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -552,6 +552,7 @@ enum { BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION, BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION, BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT, + BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT, __BRIDGE_VLANDB_GOPTS_MAX }; #define BRIDGE_VLANDB_GOPTS_MAX (__BRIDGE_VLANDB_GOPTS_MAX - 1) -- cgit v1.2.3 From 77f6ababa299112092a264cac96bedf1a87015ef Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 10 Aug 2021 18:29:22 +0300 Subject: net: bridge: vlan: add support for mcast last member interval global option Add support to change and retrieve global vlan multicast last member interval option. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 082b413e1342..950ad175610e 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -553,6 +553,8 @@ enum { BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION, BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT, BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT, + BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL, + BRIDGE_VLANDB_GOPTS_PAD, __BRIDGE_VLANDB_GOPTS_MAX }; #define BRIDGE_VLANDB_GOPTS_MAX (__BRIDGE_VLANDB_GOPTS_MAX - 1) -- cgit v1.2.3 From 2da0aea21f1c40d003af6680551eaa5471103164 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 10 Aug 2021 18:29:23 +0300 Subject: net: bridge: vlan: add support for mcast membership interval global option Add support to change and retrieve global vlan multicast membership interval option. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 950ad175610e..93f1f16617c8 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -555,6 +555,7 @@ enum { BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT, BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL, BRIDGE_VLANDB_GOPTS_PAD, + BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL, __BRIDGE_VLANDB_GOPTS_MAX }; #define BRIDGE_VLANDB_GOPTS_MAX (__BRIDGE_VLANDB_GOPTS_MAX - 1) -- cgit v1.2.3 From cd9269d463107bc4a53a0965d90a57efeee9ae11 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 10 Aug 2021 18:29:24 +0300 Subject: net: bridge: vlan: add support for mcast querier interval global option Add support to change and retrieve global vlan multicast querier interval option. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 93f1f16617c8..fdc264c57009 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -556,6 +556,7 @@ enum { BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL, BRIDGE_VLANDB_GOPTS_PAD, BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL, + BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL, __BRIDGE_VLANDB_GOPTS_MAX }; #define BRIDGE_VLANDB_GOPTS_MAX (__BRIDGE_VLANDB_GOPTS_MAX - 1) -- cgit v1.2.3 From d6c08aba4f29f606769939eb6156efceb7dbb790 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 10 Aug 2021 18:29:25 +0300 Subject: net: bridge: vlan: add support for mcast query interval global option Add support to change and retrieve global vlan multicast query interval option. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index fdc264c57009..1517aea738f4 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -557,6 +557,7 @@ enum { BRIDGE_VLANDB_GOPTS_PAD, BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL, BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL, + BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL, __BRIDGE_VLANDB_GOPTS_MAX }; #define BRIDGE_VLANDB_GOPTS_MAX (__BRIDGE_VLANDB_GOPTS_MAX - 1) -- cgit v1.2.3 From 425214508b1bd3596edb31da8d9aedee30f2b4f5 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 10 Aug 2021 18:29:26 +0300 Subject: net: bridge: vlan: add support for mcast query response interval global option Add support to change and retrieve global vlan multicast query response interval option. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 1517aea738f4..2627a657c3b3 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -558,6 +558,7 @@ enum { BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL, BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL, BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL, + BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL, __BRIDGE_VLANDB_GOPTS_MAX }; #define BRIDGE_VLANDB_GOPTS_MAX (__BRIDGE_VLANDB_GOPTS_MAX - 1) -- cgit v1.2.3 From 941121ee22a69935252473f03976f1f1200b9ae9 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 10 Aug 2021 18:29:27 +0300 Subject: net: bridge: vlan: add support for mcast startup query interval global option Add support to change and retrieve global vlan multicast startup query interval option. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 2627a657c3b3..b5d01538acd4 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -559,6 +559,7 @@ enum { BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL, BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL, BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL, + BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL, __BRIDGE_VLANDB_GOPTS_MAX }; #define BRIDGE_VLANDB_GOPTS_MAX (__BRIDGE_VLANDB_GOPTS_MAX - 1) -- cgit v1.2.3 From 62938182c35906c0ed4beb7845b93b8ffb937597 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 10 Aug 2021 18:29:30 +0300 Subject: net: bridge: vlan: add support for mcast querier global option Add support to change and retrieve global vlan multicast querier state. We just need to pass multicast context to br_multicast_set_querier instead of bridge device and the rest of the logic remains the same. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index b5d01538acd4..03fd14a4e377 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -560,6 +560,7 @@ enum { BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL, BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL, BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL, + BRIDGE_VLANDB_GOPTS_MCAST_QUERIER, __BRIDGE_VLANDB_GOPTS_MAX }; #define BRIDGE_VLANDB_GOPTS_MAX (__BRIDGE_VLANDB_GOPTS_MAX - 1) -- cgit v1.2.3 From a97df080b6a86c105f98052ca3a9d66149b461b3 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 10 Aug 2021 18:29:31 +0300 Subject: net: bridge: vlan: add support for mcast router global option Add support to change and retrieve global vlan multicast router state which is used for the bridge itself. We just need to pass multicast context to br_multicast_set_router instead of bridge device and the rest of the logic remains the same. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 03fd14a4e377..2104dd3557b4 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -561,6 +561,7 @@ enum { BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL, BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL, BRIDGE_VLANDB_GOPTS_MCAST_QUERIER, + BRIDGE_VLANDB_GOPTS_MCAST_ROUTER, __BRIDGE_VLANDB_GOPTS_MAX }; #define BRIDGE_VLANDB_GOPTS_MAX (__BRIDGE_VLANDB_GOPTS_MAX - 1) -- cgit v1.2.3 From dc002875c22b56c795ec24dc987ac2dd2081588e Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 10 Aug 2021 18:29:33 +0300 Subject: net: bridge: vlan: use br_rports_fill_info() to export mcast router ports Embed the standard multicast router port export by br_rports_fill_info() into a new global vlan attribute BRIDGE_VLANDB_GOPTS_MCAST_ROUTER_PORTS. In order to have the same format for the global bridge mcast context and the per-vlan mcast context we need a double-nesting: - BRIDGE_VLANDB_GOPTS_MCAST_ROUTER_PORTS - MDBA_ROUTER Currently we don't compare router lists, if any router port exists in the bridge mcast contexts we consider their option sets as different and export them separately. In addition we export the router port vlan id when dumping similar to the router port notification format. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 2104dd3557b4..620d86e825b8 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -562,6 +562,7 @@ enum { BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL, BRIDGE_VLANDB_GOPTS_MCAST_QUERIER, BRIDGE_VLANDB_GOPTS_MCAST_ROUTER, + BRIDGE_VLANDB_GOPTS_MCAST_ROUTER_PORTS, __BRIDGE_VLANDB_GOPTS_MAX }; #define BRIDGE_VLANDB_GOPTS_MAX (__BRIDGE_VLANDB_GOPTS_MAX - 1) -- cgit v1.2.3 From 3d2a2544eae93987f0688c2d6ec06c76f9e1477b Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Fri, 25 Jun 2021 11:17:16 +0300 Subject: nl80211: vendor-cmd: add Intel vendor commands for iwlmei usage iwlmei allows to integrate with the CSME firmware. There are flows that are prioprietary for this purpose: * Get the information of the AP the CSME firmware is connected to. This is useful when we need to speed up the connection process in case the CSME firmware has a TCP connection that must be kept alive across the ownership transition. * Forbid roaming, which will happen when the CSME firmware wants to tell the user space not disrupt the connection. * Request ownership, upon driver boot when the CSME firmware owns the device. This is a notification sent by the kernel. All those commands are expected to be used by any software managing the connection (mainly NetworkManager). Those commands are expected to be used only in case the CSME firmware owns the device and doesn't want to release the device unless the host made sure that it can keep the connectivity. Here are the steps of the expected flow: 1) The machine boots while AMT has an active TCP connection 2) iwlwifi starts and tries to access the device 3) The device is not available because of the active TCP connection. (If there are no active connections, the CSME firmware would have allowed iwlwifi to use the device) Note that all the steps up to here don't involve iwlmei. All this happens in iwlwifi (in iwl_pcie_prepare_card_hw). 4) iwlmei establishes a connection to the CSME firmware (through SAP) Here iwlwifi uses iwlmei to access the device's capabilities (since it can't touch the device), but this is not relevant for the vendor commands. 5) The CSME firmware tells iwlmei that it uses the NIC and that there is an acitve TCP connection, and hence, the host needs to think twice before asking the CSME firmware to release the device 6) iwlmei tells iwlwifi to report HW RFKILL with a special reason Up to here, there was no user space involved. 7) The user space (NetworkManager) boots and sees that the device is in RFKILL because the host doesn't own the device 8) The user space asks the kernel what AP the CSME firmware is connected to (with the first vendor command mentionned above) 9) The user space checks if it has a profile that matches the reply from the CSME firmware 10) The user space installs a network to the wpa_supplicant with a specific BSSID and a specific frequency 11) The user space prevents any type of full scan 12) The user space asks iwlmei to request ownership on the device (with the third vendor command) 13) iwlmei request ownership from the CSME firmware 14) The CSME firmware grants ownership 15) iwlmei tells iwlwifi to lift the RFKILL 16) RFKILL OFF is reported to userspace 17) The host boots the device, loads the firwmare, and connect to a specific BSSID without scanning including IP in less than 600ms (this is what I measured, of course it depends on many factors) 18) The host reports to the CSME firmware that there is a connection 19) The TCP connection is preserved and the host has now connectivity 20) Later, the TCP connection to the CSME firmware is terminated 21) The CSME firmware tells iwlmei that it is now free to do whatever it likes 22) iwlwifi sends the second vendor command to tell the user space that it can remove the special network configuration and pick any SSID / BSSID it likes. Co-Developed-by: Ayala Beker Signed-off-by: Emmanuel Grumbach Link: https://lore.kernel.org/r/20210625081717.7680-4-emmanuel.grumbach@intel.com Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211-vnd-intel.h | 77 ++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 include/uapi/linux/nl80211-vnd-intel.h (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211-vnd-intel.h b/include/uapi/linux/nl80211-vnd-intel.h new file mode 100644 index 000000000000..0bf177b84fd9 --- /dev/null +++ b/include/uapi/linux/nl80211-vnd-intel.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright (C) 2012-2014, 2018-2021 Intel Corporation + * Copyright (C) 2013-2015 Intel Mobile Communications GmbH + * Copyright (C) 2016-2017 Intel Deutschland GmbH + */ +#ifndef __VENDOR_CMD_INTEL_H__ +#define __VENDOR_CMD_INTEL_H__ + +#define INTEL_OUI 0x001735 + +/** + * enum iwl_mvm_vendor_cmd - supported vendor commands + * @IWL_MVM_VENDOR_CMD_GET_CSME_CONN_INFO: reports CSME connection info. + * @IWL_MVM_VENDOR_CMD_HOST_GET_OWNERSHIP: asks for ownership on the device. + * @IWL_MVM_VENDOR_CMD_ROAMING_FORBIDDEN_EVENT: notifies if roaming is allowed. + * It contains a &IWL_MVM_VENDOR_ATTR_ROAMING_FORBIDDEN and a + * &IWL_MVM_VENDOR_ATTR_VIF_ADDR attributes. + */ + +enum iwl_mvm_vendor_cmd { + IWL_MVM_VENDOR_CMD_GET_CSME_CONN_INFO = 0x2d, + IWL_MVM_VENDOR_CMD_HOST_GET_OWNERSHIP = 0x30, + IWL_MVM_VENDOR_CMD_ROAMING_FORBIDDEN_EVENT = 0x32, +}; + +enum iwl_vendor_auth_akm_mode { + IWL_VENDOR_AUTH_OPEN, + IWL_VENDOR_AUTH_RSNA = 0x6, + IWL_VENDOR_AUTH_RSNA_PSK, + IWL_VENDOR_AUTH_SAE = 0x9, + IWL_VENDOR_AUTH_MAX, +}; + +/** + * enum iwl_mvm_vendor_attr - attributes used in vendor commands + * @__IWL_MVM_VENDOR_ATTR_INVALID: attribute 0 is invalid + * @IWL_MVM_VENDOR_ATTR_VIF_ADDR: interface MAC address + * @IWL_MVM_VENDOR_ATTR_ADDR: MAC address + * @IWL_MVM_VENDOR_ATTR_SSID: SSID (binary attribute, 0..32 octets) + * @IWL_MVM_VENDOR_ATTR_STA_CIPHER: the cipher to use for the station with the + * mac address specified in &IWL_MVM_VENDOR_ATTR_ADDR. + * @IWL_MVM_VENDOR_ATTR_ROAMING_FORBIDDEN: u8 attribute. Indicates whether + * roaming is forbidden or not. Value 1 means roaming is forbidden, + * 0 mean roaming is allowed. + * @IWL_MVM_VENDOR_ATTR_AUTH_MODE: u32 attribute. Authentication mode type + * as specified in &enum iwl_vendor_auth_akm_mode. + * @IWL_MVM_VENDOR_ATTR_CHANNEL_NUM: u8 attribute. Contains channel number. + * @IWL_MVM_VENDOR_ATTR_BAND: u8 attribute. + * 0 for 2.4 GHz band, 1 for 5.2GHz band and 2 for 6GHz band. + * @IWL_MVM_VENDOR_ATTR_COLLOC_CHANNEL: u32 attribute. Channel number of + * collocated AP. Relevant for 6GHz AP info. + * @IWL_MVM_VENDOR_ATTR_COLLOC_ADDR: MAC address of a collocated AP. + * Relevant for 6GHz AP info. + * + * @NUM_IWL_MVM_VENDOR_ATTR: number of vendor attributes + * @MAX_IWL_MVM_VENDOR_ATTR: highest vendor attribute number + + */ +enum iwl_mvm_vendor_attr { + __IWL_MVM_VENDOR_ATTR_INVALID = 0x00, + IWL_MVM_VENDOR_ATTR_VIF_ADDR = 0x02, + IWL_MVM_VENDOR_ATTR_ADDR = 0x0a, + IWL_MVM_VENDOR_ATTR_SSID = 0x3d, + IWL_MVM_VENDOR_ATTR_STA_CIPHER = 0x51, + IWL_MVM_VENDOR_ATTR_ROAMING_FORBIDDEN = 0x64, + IWL_MVM_VENDOR_ATTR_AUTH_MODE = 0x65, + IWL_MVM_VENDOR_ATTR_CHANNEL_NUM = 0x66, + IWL_MVM_VENDOR_ATTR_BAND = 0x69, + IWL_MVM_VENDOR_ATTR_COLLOC_CHANNEL = 0x70, + IWL_MVM_VENDOR_ATTR_COLLOC_ADDR = 0x71, + + NUM_IWL_MVM_VENDOR_ATTR, + MAX_IWL_MVM_VENDOR_ATTR = NUM_IWL_MVM_VENDOR_ATTR - 1, +}; + +#endif /* __VENDOR_CMD_INTEL_H__ */ -- cgit v1.2.3 From c7fa1d9b1fb179375e889ff076a1566ecc997bfc Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 13 Aug 2021 18:00:00 +0300 Subject: net: bridge: mcast: dump ipv4 querier state Add support for dumping global IPv4 querier state, we dump the state only if our own querier is enabled or there has been another external querier which has won the election. For the bridge global state we use a new attribute IFLA_BR_MCAST_QUERIER_STATE and embed the state inside. The structure is: [IFLA_BR_MCAST_QUERIER_STATE] `[BRIDGE_QUERIER_IP_ADDRESS] - ip address of the querier `[BRIDGE_QUERIER_IP_PORT] - bridge port ifindex where the querier was seen (set only if external querier) `[BRIDGE_QUERIER_IP_OTHER_TIMER] - other querier timeout Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 10 ++++++++++ include/uapi/linux/if_link.h | 1 + 2 files changed, 11 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 620d86e825b8..e0fff67fcd88 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -770,4 +770,14 @@ struct br_boolopt_multi { __u32 optval; __u32 optmask; }; + +enum { + BRIDGE_QUERIER_UNSPEC, + BRIDGE_QUERIER_IP_ADDRESS, + BRIDGE_QUERIER_IP_PORT, + BRIDGE_QUERIER_IP_OTHER_TIMER, + BRIDGE_QUERIER_PAD, + __BRIDGE_QUERIER_MAX +}; +#define BRIDGE_QUERIER_MAX (__BRIDGE_QUERIER_MAX - 1) #endif /* _UAPI_LINUX_IF_BRIDGE_H */ diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 5310003523ce..8aad65b69054 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -479,6 +479,7 @@ enum { IFLA_BR_MCAST_MLD_VERSION, IFLA_BR_VLAN_STATS_PER_PORT, IFLA_BR_MULTI_BOOLOPT, + IFLA_BR_MCAST_QUERIER_STATE, __IFLA_BR_MAX, }; -- cgit v1.2.3 From 85b4108211742c5dd4f9f56c1d0704b4e0d4c98e Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 13 Aug 2021 18:00:01 +0300 Subject: net: bridge: mcast: dump ipv6 querier state Add support for dumping global IPv6 querier state, we dump the state only if our own querier is enabled or there has been another external querier which has won the election. For the bridge global state we use a new attribute IFLA_BR_MCAST_QUERIER_STATE and embed the state inside. The structure is: [IFLA_BR_MCAST_QUERIER_STATE] `[BRIDGE_QUERIER_IPV6_ADDRESS] - ip address of the querier `[BRIDGE_QUERIER_IPV6_PORT] - bridge port ifindex where the querier was seen (set only if external querier) `[BRIDGE_QUERIER_IPV6_OTHER_TIMER] - other querier timeout IPv4 and IPv6 attributes are embedded at the same level of IFLA_BR_MCAST_QUERIER_STATE. If we didn't dump anything we cancel the nest and return. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index e0fff67fcd88..eceaad200bf6 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -777,6 +777,9 @@ enum { BRIDGE_QUERIER_IP_PORT, BRIDGE_QUERIER_IP_OTHER_TIMER, BRIDGE_QUERIER_PAD, + BRIDGE_QUERIER_IPV6_ADDRESS, + BRIDGE_QUERIER_IPV6_PORT, + BRIDGE_QUERIER_IPV6_OTHER_TIMER, __BRIDGE_QUERIER_MAX }; #define BRIDGE_QUERIER_MAX (__BRIDGE_QUERIER_MAX - 1) -- cgit v1.2.3 From ddc649d158c560c6685be1701900a6e456ecceac Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 13 Aug 2021 18:00:02 +0300 Subject: net: bridge: vlan: dump mcast ctx querier state Use the new mcast querier state dump infrastructure and export vlans' mcast context querier state embedded in attribute BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_STATE. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index eceaad200bf6..f71a81fdbbc6 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -563,6 +563,7 @@ enum { BRIDGE_VLANDB_GOPTS_MCAST_QUERIER, BRIDGE_VLANDB_GOPTS_MCAST_ROUTER, BRIDGE_VLANDB_GOPTS_MCAST_ROUTER_PORTS, + BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_STATE, __BRIDGE_VLANDB_GOPTS_MAX }; #define BRIDGE_VLANDB_GOPTS_MAX (__BRIDGE_VLANDB_GOPTS_MAX - 1) -- cgit v1.2.3 From 6c9b40844751ea30c72f7a2f92f4d704bc6b2927 Mon Sep 17 00:00:00 2001 From: Cai Huoqing Date: Fri, 13 Aug 2021 20:08:02 +0800 Subject: net: Remove net/ipx.h and uapi/linux/ipx.h header files commit <47595e32869f> ("") indicated the ipx network layer as obsolete in Jan 2018, updated in the MAINTAINERS file now, after being exposed for 3 years to refactoring, so to delete uapi/linux/ipx.h and net/ipx.h header files for good. additionally, there is no module that depends on ipx.h except a broken staging driver(r8188eu) Signed-off-by: Cai Huoqing Signed-off-by: David S. Miller --- include/uapi/linux/ipx.h | 87 ------------------------------------------------ 1 file changed, 87 deletions(-) delete mode 100644 include/uapi/linux/ipx.h (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ipx.h b/include/uapi/linux/ipx.h deleted file mode 100644 index 3168137adae8..000000000000 --- a/include/uapi/linux/ipx.h +++ /dev/null @@ -1,87 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _IPX_H_ -#define _IPX_H_ -#include /* for compatibility with glibc netipx/ipx.h */ -#include -#include -#include -#define IPX_NODE_LEN 6 -#define IPX_MTU 576 - -#if __UAPI_DEF_SOCKADDR_IPX -struct sockaddr_ipx { - __kernel_sa_family_t sipx_family; - __be16 sipx_port; - __be32 sipx_network; - unsigned char sipx_node[IPX_NODE_LEN]; - __u8 sipx_type; - unsigned char sipx_zero; /* 16 byte fill */ -}; -#endif /* __UAPI_DEF_SOCKADDR_IPX */ - -/* - * So we can fit the extra info for SIOCSIFADDR into the address nicely - */ -#define sipx_special sipx_port -#define sipx_action sipx_zero -#define IPX_DLTITF 0 -#define IPX_CRTITF 1 - -#if __UAPI_DEF_IPX_ROUTE_DEFINITION -struct ipx_route_definition { - __be32 ipx_network; - __be32 ipx_router_network; - unsigned char ipx_router_node[IPX_NODE_LEN]; -}; -#endif /* __UAPI_DEF_IPX_ROUTE_DEFINITION */ - -#if __UAPI_DEF_IPX_INTERFACE_DEFINITION -struct ipx_interface_definition { - __be32 ipx_network; - unsigned char ipx_device[16]; - unsigned char ipx_dlink_type; -#define IPX_FRAME_NONE 0 -#define IPX_FRAME_SNAP 1 -#define IPX_FRAME_8022 2 -#define IPX_FRAME_ETHERII 3 -#define IPX_FRAME_8023 4 -#define IPX_FRAME_TR_8022 5 /* obsolete */ - unsigned char ipx_special; -#define IPX_SPECIAL_NONE 0 -#define IPX_PRIMARY 1 -#define IPX_INTERNAL 2 - unsigned char ipx_node[IPX_NODE_LEN]; -}; -#endif /* __UAPI_DEF_IPX_INTERFACE_DEFINITION */ - -#if __UAPI_DEF_IPX_CONFIG_DATA -struct ipx_config_data { - unsigned char ipxcfg_auto_select_primary; - unsigned char ipxcfg_auto_create_interfaces; -}; -#endif /* __UAPI_DEF_IPX_CONFIG_DATA */ - -/* - * OLD Route Definition for backward compatibility. - */ - -#if __UAPI_DEF_IPX_ROUTE_DEF -struct ipx_route_def { - __be32 ipx_network; - __be32 ipx_router_network; -#define IPX_ROUTE_NO_ROUTER 0 - unsigned char ipx_router_node[IPX_NODE_LEN]; - unsigned char ipx_device[16]; - unsigned short ipx_flags; -#define IPX_RT_SNAP 8 -#define IPX_RT_8022 4 -#define IPX_RT_BLUEBOOK 2 -#define IPX_RT_ROUTED 1 -}; -#endif /* __UAPI_DEF_IPX_ROUTE_DEF */ - -#define SIOCAIPXITFCRT (SIOCPROTOPRIVATE) -#define SIOCAIPXPRISLT (SIOCPROTOPRIVATE + 1) -#define SIOCIPXCFGDATA (SIOCPROTOPRIVATE + 2) -#define SIOCIPXNCPCONN (SIOCPROTOPRIVATE + 3) -#endif /* _IPX_H_ */ -- cgit v1.2.3 From 5b4ecc3d4c4aab8d002fe6358885c10e7b57e432 Mon Sep 17 00:00:00 2001 From: Guangbin Huang Date: Mon, 16 Aug 2021 10:15:27 +0800 Subject: ethtool: add two link extended substates of bad signal integrity ETHTOOL_LINK_EXT_SUBSTATE_BSI_SERDES_REFERENCE_CLOCK_LOST means the input external clock signal for SerDes is too weak or lost. ETHTOOL_LINK_EXT_SUBSTATE_BSI_SERDES_ALOS means the received signal for SerDes is too weak because analog loss of signal. Signed-off-by: Guangbin Huang Signed-off-by: Jakub Kicinski --- include/uapi/linux/ethtool.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 67aa7134b301..b6db6590baf0 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -639,6 +639,8 @@ enum ethtool_link_ext_substate_link_logical_mismatch { enum ethtool_link_ext_substate_bad_signal_integrity { ETHTOOL_LINK_EXT_SUBSTATE_BSI_LARGE_NUMBER_OF_PHYSICAL_ERRORS = 1, ETHTOOL_LINK_EXT_SUBSTATE_BSI_UNSUPPORTED_RATE, + ETHTOOL_LINK_EXT_SUBSTATE_BSI_SERDES_REFERENCE_CLOCK_LOST, + ETHTOOL_LINK_EXT_SUBSTATE_BSI_SERDES_ALOS, }; /* More information in addition to ETHTOOL_LINK_EXT_STATE_CABLE_ISSUE. */ -- cgit v1.2.3 From b89fbfbb854c9afc3047e8273cc3a694650b802e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 15 Aug 2021 00:05:57 -0700 Subject: bpf: Implement minimal BPF perf link Introduce a new type of BPF link - BPF perf link. This brings perf_event-based BPF program attachments (perf_event, tracepoints, kprobes, and uprobes) into the common BPF link infrastructure, allowing to list all active perf_event based attachments, auto-detaching BPF program from perf_event when link's FD is closed, get generic BPF link fdinfo/get_info functionality. BPF_LINK_CREATE command expects perf_event's FD as target_fd. No extra flags are currently supported. Force-detaching and atomic BPF program updates are not yet implemented, but with perf_event-based BPF links we now have common framework for this without the need to extend ioctl()-based perf_event interface. One interesting consideration is a new value for bpf_attach_type, which BPF_LINK_CREATE command expects. Generally, it's either 1-to-1 mapping from bpf_attach_type to bpf_prog_type, or many-to-1 mapping from a subset of bpf_attach_types to one bpf_prog_type (e.g., see BPF_PROG_TYPE_SK_SKB or BPF_PROG_TYPE_CGROUP_SOCK). In this case, though, we have three different program types (KPROBE, TRACEPOINT, PERF_EVENT) using the same perf_event-based mechanism, so it's many bpf_prog_types to one bpf_attach_type. I chose to define a single BPF_PERF_EVENT attach type for all of them and adjust link_create()'s logic for checking correspondence between attach type and program type. The alternative would be to define three new attach types (e.g., BPF_KPROBE, BPF_TRACEPOINT, and BPF_PERF_EVENT), but that seemed like unnecessary overkill and BPF_KPROBE will cause naming conflicts with BPF_KPROBE() macro, defined by libbpf. I chose to not do this to avoid unnecessary proliferation of bpf_attach_type enum values and not have to deal with naming conflicts. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/bpf/20210815070609.987780-5-andrii@kernel.org --- include/uapi/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2db6925e04f4..94fe8329b28f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -993,6 +993,7 @@ enum bpf_attach_type { BPF_SK_SKB_VERDICT, BPF_SK_REUSEPORT_SELECT, BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, + BPF_PERF_EVENT, __MAX_BPF_ATTACH_TYPE }; @@ -1006,6 +1007,7 @@ enum bpf_link_type { BPF_LINK_TYPE_ITER = 4, BPF_LINK_TYPE_NETNS = 5, BPF_LINK_TYPE_XDP = 6, + BPF_LINK_TYPE_PERF_EVENT = 7, MAX_BPF_LINK_TYPE, }; -- cgit v1.2.3 From 82e6b1eee6a8875ef4eacfd60711cce6965c6b04 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 15 Aug 2021 00:05:58 -0700 Subject: bpf: Allow to specify user-provided bpf_cookie for BPF perf links Add ability for users to specify custom u64 value (bpf_cookie) when creating BPF link for perf_event-backed BPF programs (kprobe/uprobe, perf_event, tracepoints). This is useful for cases when the same BPF program is used for attaching and processing invocation of different tracepoints/kprobes/uprobes in a generic fashion, but such that each invocation is distinguished from each other (e.g., BPF program can look up additional information associated with a specific kernel function without having to rely on function IP lookups). This enables new use cases to be implemented simply and efficiently that previously were possible only through code generation (and thus multiple instances of almost identical BPF program) or compilation at runtime (BCC-style) on target hosts (even more expensive resource-wise). For uprobes it is not even possible in some cases to know function IP before hand (e.g., when attaching to shared library without PID filtering, in which case base load address is not known for a library). This is done by storing u64 bpf_cookie in struct bpf_prog_array_item, corresponding to each attached and run BPF program. Given cgroup BPF programs already use two 8-byte pointers for their needs and cgroup BPF programs don't have (yet?) support for bpf_cookie, reuse that space through union of cgroup_storage and new bpf_cookie field. Make it available to kprobe/tracepoint BPF programs through bpf_trace_run_ctx. This is set by BPF_PROG_RUN_ARRAY, used by kprobe/uprobe/tracepoint BPF program execution code, which luckily is now also split from BPF_PROG_RUN_ARRAY_CG. This run context will be utilized by a new BPF helper giving access to this user-provided cookie value from inside a BPF program. Generic perf_event BPF programs will access this value from perf_event itself through passed in BPF program context. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/bpf/20210815070609.987780-6-andrii@kernel.org --- include/uapi/linux/bpf.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 94fe8329b28f..63ee482d50e1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1448,6 +1448,13 @@ union bpf_attr { __aligned_u64 iter_info; /* extra bpf_iter_link_info */ __u32 iter_info_len; /* iter_info length */ }; + struct { + /* black box user-provided value passed through + * to BPF program at the execution time and + * accessible through bpf_get_attach_cookie() BPF helper + */ + __u64 bpf_cookie; + } perf_event; }; } link_create; -- cgit v1.2.3 From 7adfc6c9b315e174cf8743b21b7b691c8766791b Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sun, 15 Aug 2021 00:05:59 -0700 Subject: bpf: Add bpf_get_attach_cookie() BPF helper to access bpf_cookie value Add new BPF helper, bpf_get_attach_cookie(), which can be used by BPF programs to get access to a user-provided bpf_cookie value, specified during BPF program attachment (BPF link creation) time. Naming is hard, though. With the concept being named "BPF cookie", I've considered calling the helper: - bpf_get_cookie() -- seems too unspecific and easily mistaken with socket cookie; - bpf_get_bpf_cookie() -- too much tautology; - bpf_get_link_cookie() -- would be ok, but while we create a BPF link to attach BPF program to BPF hook, it's still an "attachment" and the bpf_cookie is associated with BPF program attachment to a hook, not a BPF link itself. Technically, we could support bpf_cookie with old-style cgroup programs.So I ultimately rejected it in favor of bpf_get_attach_cookie(). Currently all perf_event-backed BPF program types support bpf_get_attach_cookie() helper. Follow-up patches will add support for fentry/fexit programs as well. While at it, mark bpf_tracing_func_proto() as static to make it obvious that it's only used from within the kernel/trace/bpf_trace.c. Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210815070609.987780-7-andrii@kernel.org --- include/uapi/linux/bpf.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 63ee482d50e1..c4f7892edb2b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4856,6 +4856,21 @@ union bpf_attr { * Get address of the traced function (for tracing and kprobe programs). * Return * Address of the traced function. + * + * u64 bpf_get_attach_cookie(void *ctx) + * Description + * Get bpf_cookie value provided (optionally) during the program + * attachment. It might be different for each individual + * attachment, even if BPF program itself is the same. + * Expects BPF program context *ctx* as a first argument. + * + * Supported for the following program types: + * - kprobe/uprobe; + * - tracepoint; + * - perf_event. + * Return + * Value specified by user at BPF link creation/attachment time + * or 0, if it was not specified. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5032,6 +5047,7 @@ union bpf_attr { FN(timer_start), \ FN(timer_cancel), \ FN(get_func_ip), \ + FN(get_attach_cookie), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 0d2ab3aea50bb02ff0c9c3d53c7b2b4b21cdd59d Mon Sep 17 00:00:00 2001 From: John Crispin Date: Fri, 2 Jul 2021 19:44:07 +0200 Subject: nl80211: add support for BSS coloring This patch adds support for BSS color collisions to the wireless subsystem. Add the required functionality to nl80211 that will notify about color collisions, triggering the color change and notifying when it is completed. Co-developed-by: Lorenzo Bianconi Signed-off-by: Lorenzo Bianconi Signed-off-by: John Crispin Link: https://lore.kernel.org/r/500b3582aec8fe2c42ef46f3117b148cb7cbceb5.1625247619.git.lorenzo@kernel.org [remove unnecessary NULL initialisation] Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index db474994fa73..c2efea98e060 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1185,6 +1185,21 @@ * passed using %NL80211_ATTR_SAR_SPEC. %NL80211_ATTR_WIPHY is used to * specify the wiphy index to be applied to. * + * @NL80211_CMD_OBSS_COLOR_COLLISION: This notification is sent out whenever + * mac80211/drv detects a bss color collision. + * + * @NL80211_CMD_COLOR_CHANGE_REQUEST: This command is used to indicate that + * userspace wants to change the BSS color. + * + * @NL80211_CMD_COLOR_CHANGE_STARTED: Notify userland, that a color change has + * started + * + * @NL80211_CMD_COLOR_CHANGE_ABORTED: Notify userland, that the color change has + * been aborted + * + * @NL80211_CMD_COLOR_CHANGE_COMPLETED: Notify userland that the color change + * has completed + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -1417,6 +1432,14 @@ enum nl80211_commands { NL80211_CMD_SET_SAR_SPECS, + NL80211_CMD_OBSS_COLOR_COLLISION, + + NL80211_CMD_COLOR_CHANGE_REQUEST, + + NL80211_CMD_COLOR_CHANGE_STARTED, + NL80211_CMD_COLOR_CHANGE_ABORTED, + NL80211_CMD_COLOR_CHANGE_COMPLETED, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -2560,6 +2583,16 @@ enum nl80211_commands { * disassoc events to indicate that an immediate reconnect to the AP * is desired. * + * @NL80211_ATTR_OBSS_COLOR_BITMAP: bitmap of the u64 BSS colors for the + * %NL80211_CMD_OBSS_COLOR_COLLISION event. + * + * @NL80211_ATTR_COLOR_CHANGE_COUNT: u8 attribute specifying the number of TBTT's + * until the color switch event. + * @NL80211_ATTR_COLOR_CHANGE_COLOR: u8 attribute specifying the color that we are + * switching to + * @NL80211_ATTR_COLOR_CHANGE_ELEMS: Nested set of attributes containing the IE + * information for the time while performing a color switch. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3057,6 +3090,12 @@ enum nl80211_attrs { NL80211_ATTR_DISABLE_HE, + NL80211_ATTR_OBSS_COLOR_BITMAP, + + NL80211_ATTR_COLOR_CHANGE_COUNT, + NL80211_ATTR_COLOR_CHANGE_COLOR, + NL80211_ATTR_COLOR_CHANGE_ELEMS, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -5953,6 +5992,9 @@ enum nl80211_feature_flags { * frame protection for all management frames exchanged during the * negotiation and range measurement procedure. * + * @NL80211_EXT_FEATURE_BSS_COLOR: The driver supports BSS color collision + * detection and change announcemnts. + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -6017,6 +6059,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_SECURE_LTF, NL80211_EXT_FEATURE_SECURE_RTT, NL80211_EXT_FEATURE_PROT_RANGE_NEGO_AND_MEASURE, + NL80211_EXT_FEATURE_BSS_COLOR, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, -- cgit v1.2.3 From 2843ff6f36db7074e17bf5d637a14da08c54aed8 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 17 Aug 2021 15:07:23 -0700 Subject: mptcp: remote addresses fullmesh This patch added and managed a new per endpoint flag, named MPTCP_PM_ADDR_FLAG_FULLMESH. In mptcp_pm_create_subflow_or_signal_addr(), if such flag is set, instead of: remote_address((struct sock_common *)sk, &remote); fill a temporary allocated array of all known remote address. After releaseing the pm lock loop on such array and create a subflow for each remote address from the given local. Note that the we could still use an array even for non 'fullmesh' endpoint: with a single entry corresponding to the primary MPC subflow remote address. Suggested-by: Paolo Abeni Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/uapi/linux/mptcp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 7b05f7102321..f66038b9551f 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -73,6 +73,7 @@ enum { #define MPTCP_PM_ADDR_FLAG_SIGNAL (1 << 0) #define MPTCP_PM_ADDR_FLAG_SUBFLOW (1 << 1) #define MPTCP_PM_ADDR_FLAG_BACKUP (1 << 2) +#define MPTCP_PM_ADDR_FLAG_FULLMESH (1 << 3) enum { MPTCP_PM_CMD_UNSPEC, -- cgit v1.2.3 From 2796d846d74a18cc6563e96eff8bf28c5e06f912 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 20 Aug 2021 15:42:55 +0300 Subject: net: bridge: vlan: convert mcast router global option to per-vlan entry The per-vlan router option controls the port/vlan and host vlan entries' mcast router config. The global option controlled only the host vlan config, but that is unnecessary and incosistent as it's not really a global vlan option, but rather bridge option to control host router config, so convert BRIDGE_VLANDB_GOPTS_MCAST_ROUTER to BRIDGE_VLANDB_ENTRY_MCAST_ROUTER which can be used to control both host vlan and port vlan mcast router config. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index f71a81fdbbc6..2711c3522010 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -506,6 +506,7 @@ enum { BRIDGE_VLANDB_ENTRY_STATE, BRIDGE_VLANDB_ENTRY_TUNNEL_INFO, BRIDGE_VLANDB_ENTRY_STATS, + BRIDGE_VLANDB_ENTRY_MCAST_ROUTER, __BRIDGE_VLANDB_ENTRY_MAX, }; #define BRIDGE_VLANDB_ENTRY_MAX (__BRIDGE_VLANDB_ENTRY_MAX - 1) @@ -561,7 +562,6 @@ enum { BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL, BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL, BRIDGE_VLANDB_GOPTS_MCAST_QUERIER, - BRIDGE_VLANDB_GOPTS_MCAST_ROUTER, BRIDGE_VLANDB_GOPTS_MCAST_ROUTER_PORTS, BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_STATE, __BRIDGE_VLANDB_GOPTS_MAX -- cgit v1.2.3 From 6fc88c354f3af83ffa2c285b86e76c759755693f Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Thu, 19 Aug 2021 02:24:20 -0700 Subject: bpf: Migrate cgroup_bpf to internal cgroup_bpf_attach_type enum Add an enum (cgroup_bpf_attach_type) containing only valid cgroup_bpf attach types and a function to map bpf_attach_type values to the new enum. Inspired by netns_bpf_attach_type. Then, migrate cgroup_bpf to use cgroup_bpf_attach_type wherever possible. Functionality is unchanged as attach_type_to_prog_type switches in bpf/syscall.c were preventing non-cgroup programs from making use of the invalid cgroup_bpf array slots. As a result struct cgroup_bpf uses 504 fewer bytes relative to when its arrays were sized using MAX_BPF_ATTACH_TYPE. bpf_cgroup_storage is notably not migrated as struct bpf_cgroup_storage_key is part of uapi and contains a bpf_attach_type member which is not meant to be opaque. Similarly, bpf_cgroup_link continues to report its bpf_attach_type member to userspace via fdinfo and bpf_link_info. To ease disambiguation, bpf_attach_type variables are renamed from 'type' to 'atype' when changed to cgroup_bpf_attach_type. Signed-off-by: Dave Marchevsky Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210819092420.1984861-2-davemarchevsky@fb.com --- include/uapi/linux/bpf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c4f7892edb2b..191f0b286ee3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -84,7 +84,7 @@ struct bpf_lpm_trie_key { struct bpf_cgroup_storage_key { __u64 cgroup_inode_id; /* cgroup inode id */ - __u32 attach_type; /* program attach type */ + __u32 attach_type; /* program attach type (enum bpf_attach_type) */ }; union bpf_iter_link_info { -- cgit v1.2.3 From 029ee6b14356b94120bedb852dcdaefc0a282cf1 Mon Sep 17 00:00:00 2001 From: Yufeng Mo Date: Fri, 20 Aug 2021 15:35:17 +0800 Subject: ethtool: add two coalesce attributes for CQE mode Currently, there are many drivers who support CQE mode configuration, some configure it as a fixed when initialized, some provide an interface to change it by ethtool private flags. In order to make it more generic, add two new 'ETHTOOL_A_COALESCE_USE_CQE_TX' and 'ETHTOOL_A_COALESCE_USE_CQE_RX' coalesce attributes, then these parameters can be accessed by ethtool netlink coalesce uAPI. Also add an new structure kernel_ethtool_coalesce, then the new parameter can be added into this struct. Signed-off-by: Yufeng Mo Signed-off-by: Huazhong Tan Signed-off-by: Jakub Kicinski --- include/uapi/linux/ethtool_netlink.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index b3b93710eff7..5545f1ca9237 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -377,6 +377,8 @@ enum { ETHTOOL_A_COALESCE_TX_USECS_HIGH, /* u32 */ ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH, /* u32 */ ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL, /* u32 */ + ETHTOOL_A_COALESCE_USE_CQE_MODE_TX, /* u8 */ + ETHTOOL_A_COALESCE_USE_CQE_MODE_RX, /* u8 */ /* add new constants above here */ __ETHTOOL_A_COALESCE_CNT, -- cgit v1.2.3 From dd6e10fbd9fb86a571d925602c8a24bb4d09a2a7 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Mon, 23 Aug 2021 19:43:49 -0700 Subject: bpf: Add bpf_task_pt_regs() helper The motivation behind this helper is to access userspace pt_regs in a kprobe handler. uprobe's ctx is the userspace pt_regs. kprobe's ctx is the kernelspace pt_regs. bpf_task_pt_regs() allows accessing userspace pt_regs in a kprobe handler. The final case (kernelspace pt_regs in uprobe) is pretty rare (usermode helper) so I think that can be solved later if necessary. More concretely, this helper is useful in doing BPF-based DWARF stack unwinding. Currently the kernel can only do framepointer based stack unwinds for userspace code. This is because the DWARF state machines are too fragile to be computed in kernelspace [0]. The idea behind DWARF-based stack unwinds w/ BPF is to copy a chunk of the userspace stack (while in prog context) and send it up to userspace for unwinding (probably with libunwind) [1]. This would effectively enable profiling applications with -fomit-frame-pointer using kprobes and uprobes. [0]: https://lkml.org/lkml/2012/2/10/356 [1]: https://github.com/danobi/bpf-dwarf-walk Signed-off-by: Daniel Xu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/e2718ced2d51ef4268590ab8562962438ab82815.1629772842.git.dxu@dxuuu.xyz --- include/uapi/linux/bpf.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 191f0b286ee3..791f31dd0abe 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4871,6 +4871,12 @@ union bpf_attr { * Return * Value specified by user at BPF link creation/attachment time * or 0, if it was not specified. + * + * long bpf_task_pt_regs(struct task_struct *task) + * Description + * Get the struct pt_regs associated with **task**. + * Return + * A pointer to struct pt_regs. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5048,6 +5054,7 @@ union bpf_attr { FN(timer_cancel), \ FN(get_func_ip), \ FN(get_attach_cookie), \ + FN(task_pt_regs), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 49b99da2c9ce13ffcd93fe3a0f5670791c1d76f7 Mon Sep 17 00:00:00 2001 From: Rocco Yue Date: Fri, 27 Aug 2021 23:04:12 +0800 Subject: ipv6: add IFLA_INET6_RA_MTU to expose mtu value The kernel provides a "/proc/sys/net/ipv6/conf//mtu" file, which can temporarily record the mtu value of the last received RA message when the RA mtu value is lower than the interface mtu, but this proc has following limitations: (1) when the interface mtu (/sys/class/net//mtu) is updeated, mtu6 (/proc/sys/net/ipv6/conf//mtu) will be updated to the value of interface mtu; (2) mtu6 (/proc/sys/net/ipv6/conf//mtu) only affect ipv6 connection, and not affect ipv4. Therefore, when the mtu option is carried in the RA message, there will be a problem that the user sometimes cannot obtain RA mtu value correctly by reading mtu6. After this patch set, if a RA message carries the mtu option, you can send a netlink msg which nlmsg_type is RTM_GETLINK, and then by parsing the attribute of IFLA_INET6_RA_MTU to get the mtu value carried in the RA message received on the inet6 device. In addition, you can also get a link notification when ra_mtu is updated so it doesn't have to poll. In this way, if the MTU values that the device receives from the network in the PCO IPv4 and the RA IPv6 procedures are different, the user can obtain the correct ipv6 ra_mtu value and compare the value of ra_mtu and ipv4 mtu, then the device can use the lower MTU value for both IPv4 and IPv6. Signed-off-by: Rocco Yue Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20210827150412.9267-1-rocco.yue@mediatek.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/if_link.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 8aad65b69054..eebd3894fe89 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -417,6 +417,7 @@ enum { IFLA_INET6_ICMP6STATS, /* statistics (icmpv6) */ IFLA_INET6_TOKEN, /* device token */ IFLA_INET6_ADDR_GEN_MODE, /* implicit address generator mode */ + IFLA_INET6_RA_MTU, /* mtu carried in the RA message */ __IFLA_INET6_MAX }; -- cgit v1.2.3